The scope of select_idle_sibling idle cpu search is LLC. This
becomes a problem for the AMD CCX architecture, as the sd_llc is only
4 cores. On a many core machine, the range of search is too small to
reach a satisfactory level of statistical multiplexing / efficient
utilization of short idle time slices.

With this patch idle sibling search is detached from LLC and it
becomes run time configurable. To reduce search and migration
overheads, a presearch domain is added. The presearch domain will be
searched first before the "main search" domain, e.g.:

sysctl_sched_wake_idle_domain == 2 ("DIE" domain)
sysctl_sched_wake_idle_presearch_domain == 1 ("MC" domain)

Presearch will go through 4 cores of a CCX. If no idle cpu is found
during presearch, full search will go through the remaining cores of
a cpu socket.

Heuristics including sd->avg_scan_cost and sds->has_idle_cores
are only active for the main search.

On a 128 core (2 socket * 64 core, 256 hw threads) AMD machine ran
hackbench as

hackbench -g 20 -f 20 --loops 10000

A snapshot of run time was

Baseline: 11.8
With the patch: 7.6    (configured as in the example above)

Signed-off-by: Xi Wang <x...@google.com>
---
 block/blk-mq.c                 |   2 +-
 block/blk-softirq.c            |   2 +-
 include/linux/cpuset.h         |  10 +-
 include/linux/sched/topology.h |  11 +-
 kernel/cgroup/cpuset.c         |  32 ++++--
 kernel/sched/core.c            |  10 +-
 kernel/sched/fair.c            | 191 +++++++++++++++++++++------------
 kernel/sched/sched.h           |   9 +-
 kernel/sched/topology.c        |  87 ++++++++++-----
 kernel/sysctl.c                |  25 +++++
 10 files changed, 256 insertions(+), 123 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..20aee9f047e2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -626,7 +626,7 @@ void blk_mq_force_complete_rq(struct request *rq)
 
        cpu = get_cpu();
        if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-               shared = cpus_share_cache(cpu, ctx->cpu);
+               shared = cpus_share_sis(cpu, ctx->cpu);
 
        if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
                rq->csd.func = __blk_mq_complete_request_remote;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 6e7ec87d49fa..dd38ac0e1f2e 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -108,7 +108,7 @@ void __blk_complete_request(struct request *req)
         */
        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
                if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-                       shared = cpus_share_cache(cpu, ccpu);
+                       shared = cpus_share_sis(cpu, ccpu);
        } else
                ccpu = cpu;
 
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 04c20de66afc..8b243aa8462e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -117,6 +117,7 @@ static inline int cpuset_do_slab_mem_spread(void)
 extern bool current_cpuset_is_being_rebound(void);
 
 extern void rebuild_sched_domains(void);
+extern void rebuild_sched_domains_force(void);
 
 extern void cpuset_print_current_mems_allowed(void);
 
@@ -173,7 +174,7 @@ static inline void cpuset_force_rebuild(void) { }
 
 static inline void cpuset_update_active_cpus(void)
 {
-       partition_sched_domains(1, NULL, NULL);
+       partition_sched_domains(1, NULL, NULL, 0);
 }
 
 static inline void cpuset_wait_for_hotplug(void) { }
@@ -259,7 +260,12 @@ static inline bool current_cpuset_is_being_rebound(void)
 
 static inline void rebuild_sched_domains(void)
 {
-       partition_sched_domains(1, NULL, NULL);
+       partition_sched_domains(1, NULL, NULL, 0);
+}
+
+static inline void rebuild_sched_domains_force(void)
+{
+       partition_sched_domains(1, NULL, NULL, 1);
 }
 
 static inline void cpuset_print_current_mems_allowed(void)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..aff9739cf516 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -151,16 +151,17 @@ static inline struct cpumask *sched_domain_span(struct 
sched_domain *sd)
 
 extern void partition_sched_domains_locked(int ndoms_new,
                                           cpumask_var_t doms_new[],
-                                          struct sched_domain_attr *dattr_new);
+                                          struct sched_domain_attr *dattr_new,
+                                          int force_update);
 
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-                                   struct sched_domain_attr *dattr_new);
+                                   struct sched_domain_attr *dattr_new, int 
force_update);
 
 /* Allocate an array of sched domains, for partition_sched_domains(). */
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
-bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_sis(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -199,7 +200,7 @@ struct sched_domain_attr;
 
 static inline void
 partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
-                              struct sched_domain_attr *dattr_new)
+                              struct sched_domain_attr *dattr_new, int 
force_update)
 {
 }
 
@@ -209,7 +210,7 @@ partition_sched_domains(int ndoms_new, cpumask_var_t 
doms_new[],
 {
 }
 
-static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+static inline bool cpus_share_sis(int this_cpu, int that_cpu)
 {
        return true;
 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 642415b8c3c9..5087b90c4c47 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -962,10 +962,10 @@ static void rebuild_root_domains(void)
 
 static void
 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-                                   struct sched_domain_attr *dattr_new)
+                                   struct sched_domain_attr *dattr_new, int 
force_update)
 {
        mutex_lock(&sched_domains_mutex);
-       partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+       partition_sched_domains_locked(ndoms_new, doms_new, dattr_new, 
force_update);
        rebuild_root_domains();
        mutex_unlock(&sched_domains_mutex);
 }
@@ -981,7 +981,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, 
cpumask_var_t doms_new[],
  *
  * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_locked(int force_update)
 {
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
@@ -1007,23 +1007,33 @@ static void rebuild_sched_domains_locked(void)
        ndoms = generate_sched_domains(&doms, &attr);
 
        /* Have scheduler rebuild the domains */
-       partition_and_rebuild_sched_domains(ndoms, doms, attr);
+       partition_and_rebuild_sched_domains(ndoms, doms, attr, force_update);
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_locked(int force_update)
 {
 }
 #endif /* CONFIG_SMP */
 
-void rebuild_sched_domains(void)
+void __rebuild_sched_domains(int force_update)
 {
        get_online_cpus();
        percpu_down_write(&cpuset_rwsem);
-       rebuild_sched_domains_locked();
+       rebuild_sched_domains_locked(force_update);
        percpu_up_write(&cpuset_rwsem);
        put_online_cpus();
 }
 
+void rebuild_sched_domains(void)
+{
+       __rebuild_sched_domains(0);
+}
+
+void rebuild_sched_domains_force(void)
+{
+       __rebuild_sched_domains(1);
+}
+
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1437,7 +1447,7 @@ static void update_cpumasks_hier(struct cpuset *cs, 
struct tmpmasks *tmp)
        rcu_read_unlock();
 
        if (need_rebuild_sched_domains)
-               rebuild_sched_domains_locked();
+               rebuild_sched_domains_locked(0);
 }
 
 /**
@@ -1837,7 +1847,7 @@ static int update_relax_domain_level(struct cpuset *cs, 
s64 val)
                cs->relax_domain_level = val;
                if (!cpumask_empty(cs->cpus_allowed) &&
                    is_sched_load_balance(cs))
-                       rebuild_sched_domains_locked();
+                       rebuild_sched_domains_locked(0);
        }
 
        return 0;
@@ -1903,7 +1913,7 @@ static int update_flag(cpuset_flagbits_t bit, struct 
cpuset *cs,
        spin_unlock_irq(&callback_lock);
 
        if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-               rebuild_sched_domains_locked();
+               rebuild_sched_domains_locked(0);
 
        if (spread_flag_changed)
                update_tasks_flags(cs);
@@ -1994,7 +2004,7 @@ static int update_prstate(struct cpuset *cs, int val)
        if (parent->child_ecpus_count)
                update_sibling_cpumasks(parent, cs, &tmp);
 
-       rebuild_sched_domains_locked();
+       rebuild_sched_domains_locked(0);
 out:
        free_cpumasks(NULL, &tmp);
        return err;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15543cb8481..e28548fc63f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2350,9 +2350,9 @@ void wake_up_if_idle(int cpu)
        rcu_read_unlock();
 }
 
-bool cpus_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_sis(int this_cpu, int that_cpu)
 {
-       return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+       return per_cpu(sd_sis_id, this_cpu) == per_cpu(sd_sis_id, that_cpu);
 }
 
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
@@ -2361,7 +2361,7 @@ static inline bool ttwu_queue_cond(int cpu, int 
wake_flags)
         * If the CPU does not share cache, then queue the task on the
         * remote rqs wakelist to avoid accessing remote data.
         */
-       if (!cpus_share_cache(smp_processor_id(), cpu))
+       if (!cpus_share_sis(smp_processor_id(), cpu))
                return true;
 
        /*
@@ -6501,7 +6501,7 @@ static void cpuset_cpu_active(void)
                 * operation in the resume sequence, just build a single sched
                 * domain, ignoring cpusets.
                 */
-               partition_sched_domains(1, NULL, NULL);
+               partition_sched_domains(1, NULL, NULL, 0);
                if (--num_cpus_frozen)
                        return;
                /*
@@ -6522,7 +6522,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
                cpuset_update_active_cpus();
        } else {
                num_cpus_frozen++;
-               partition_sched_domains(1, NULL, NULL);
+               partition_sched_domains(1, NULL, NULL, 0);
        }
        return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..0ed71f2f3a81 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5736,8 +5736,8 @@ static void record_wakee(struct task_struct *p)
  * at a frequency roughly N times higher than one of its wakees.
  *
  * In order to determine whether we should let the load spread vs consolidating
- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.
+ * sis domain, we look for a minimum 'flip' frequency of sis_size in one 
partner,
+ * and a factor of sis_size higher frequency in the other.
  *
  * With both conditions met, we can be relatively sure that the relationship is
  * non-monogamous, with partner count exceeding socket size.
@@ -5750,7 +5750,7 @@ static int wake_wide(struct task_struct *p)
 {
        unsigned int master = current->wakee_flips;
        unsigned int slave = p->wakee_flips;
-       int factor = __this_cpu_read(sd_llc_size);
+       int factor = __this_cpu_read(sd_sis_size);
 
        if (master < slave)
                swap(master, slave);
@@ -5786,7 +5786,7 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
         * a cpufreq perspective, it's better to have higher utilisation
         * on one CPU.
         */
-       if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, 
prev_cpu))
+       if (available_idle_cpu(this_cpu) && cpus_share_sis(this_cpu, prev_cpu))
                return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
        if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -5978,7 +5978,7 @@ static inline void set_idle_cores(int cpu, int val)
 {
        struct sched_domain_shared *sds;
 
-       sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+       sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
        if (sds)
                WRITE_ONCE(sds->has_idle_cores, val);
 }
@@ -5987,7 +5987,7 @@ static inline bool test_idle_cores(int cpu, bool def)
 {
        struct sched_domain_shared *sds;
 
-       sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+       sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
        if (sds)
                return READ_ONCE(sds->has_idle_cores);
 
@@ -5996,7 +5996,7 @@ static inline bool test_idle_cores(int cpu, bool def)
 
 /*
  * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_sis_shared->has_idle_cores.
  *
  * Since SMT siblings share all cache levels, inspecting this limited remote
  * state should be fairly cheap.
@@ -6024,13 +6024,12 @@ void __update_idle_core(struct rq *rq)
 }
 
 /*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * Scan the entire sis domain for idle cores; this dynamically switches off if
  * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_sis->shared->has_idle_cores and enabled through update_idle_core() above.
  */
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, 
int target)
+static int select_idle_core(struct task_struct *p, struct cpumask *cpus, int 
target)
 {
-       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int core, cpu;
 
        if (!static_branch_likely(&sched_smt_present))
@@ -6039,18 +6038,18 @@ static int select_idle_core(struct task_struct *p, 
struct sched_domain *sd, int
        if (!test_idle_cores(target, false))
                return -1;
 
-       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
        for_each_cpu_wrap(core, cpus, target) {
                bool idle = true;
 
+               if (core != cpumask_first(cpu_smt_mask(core)))
+                       continue;
+
                for_each_cpu(cpu, cpu_smt_mask(core)) {
                        if (!available_idle_cpu(cpu)) {
                                idle = false;
                                break;
                        }
                }
-               cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 
                if (idle)
                        return core;
@@ -6099,45 +6098,45 @@ static inline int select_idle_smt(struct task_struct 
*p, int target)
 #endif /* CONFIG_SCHED_SMT */
 
 /*
- * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * Scan the sis domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct cpumask *cpus,
+       bool main_search, unsigned int span_weight, int target)
 {
-       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        struct sched_domain *this_sd;
        u64 avg_cost, avg_idle;
        u64 time;
        int this = smp_processor_id();
        int cpu, nr = INT_MAX;
 
-       this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-       if (!this_sd)
-               return -1;
+       if (main_search) {
+               this_sd = rcu_dereference(*this_cpu_ptr(&sd_sis));
+               if (!this_sd)
+                       return -1;
 
-       /*
-        * Due to large variance we need a large fuzz factor; hackbench in
-        * particularly is sensitive here.
-        */
-       avg_idle = this_rq()->avg_idle / 512;
-       avg_cost = this_sd->avg_scan_cost + 1;
+               /*
+                * Due to large variance we need a large fuzz factor; hackbench 
in
+                * particularly is sensitive here.
+                */
+               avg_idle = this_rq()->avg_idle / 512;
+               avg_cost = this_sd->avg_scan_cost + 1;
 
-       if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
-               return -1;
+               if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
+                       return -1;
 
-       if (sched_feat(SIS_PROP)) {
-               u64 span_avg = sd->span_weight * avg_idle;
-               if (span_avg > 4*avg_cost)
-                       nr = div_u64(span_avg, avg_cost);
-               else
-                       nr = 4;
+               if (sched_feat(SIS_PROP)) {
+                       u64 span_avg = span_weight * avg_idle;
+                       if (span_avg > 4*avg_cost)
+                               nr = div_u64(span_avg, avg_cost);
+                       else
+                               nr = 4;
+               }
        }
 
        time = cpu_clock(this);
 
-       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
        for_each_cpu_wrap(cpu, cpus, target) {
                if (!--nr)
                        return -1;
@@ -6145,8 +6144,10 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
                        break;
        }
 
-       time = cpu_clock(this) - time;
-       update_avg(&this_sd->avg_scan_cost, time);
+       if (main_search) {
+               time = cpu_clock(this) - time;
+               update_avg(&this_sd->avg_scan_cost, time);
+       }
 
        return cpu;
 }
@@ -6186,19 +6187,21 @@ select_idle_capacity(struct task_struct *p, struct 
sched_domain *sd, int target)
 }
 
 /*
- * Try and locate an idle core/thread in the LLC cache domain.
+ * Try and locate an idle core/thread in the sis domain.
  */
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
-       struct sched_domain *sd;
-       int i, recent_used_cpu;
+       struct sched_domain *sd_asym;
+       struct sched_domain *sd[2];
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+       int i, r, recent_used_cpu;
 
        /*
         * For asymmetric CPU capacity systems, our domain of interest is
-        * sd_asym_cpucapacity rather than sd_llc.
+        * sd_asym_cpucapacity rather than sd_sis.
         */
        if (static_branch_unlikely(&sched_asym_cpucapacity)) {
-               sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+               sd_asym = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
                /*
                 * On an asymmetric CPU capacity system where an exclusive
                 * cpuset defines a symmetric island (i.e. one unique
@@ -6207,10 +6210,10 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
                 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
                 * capacity path.
                 */
-               if (!sd)
+               if (!sd_asym)
                        goto symmetric;
 
-               i = select_idle_capacity(p, sd, target);
+               i = select_idle_capacity(p, sd_asym, target);
                return ((unsigned)i < nr_cpumask_bits) ? i : target;
        }
 
@@ -6221,7 +6224,7 @@ static int select_idle_sibling(struct task_struct *p, int 
prev, int target)
        /*
         * If the previous CPU is cache affine and idle, don't be stupid:
         */
-       if (prev != target && cpus_share_cache(prev, target) &&
+       if (prev != target && cpus_share_sis(prev, target) &&
            (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                return prev;
 
@@ -6243,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int 
prev, int target)
        recent_used_cpu = p->recent_used_cpu;
        if (recent_used_cpu != prev &&
            recent_used_cpu != target &&
-           cpus_share_cache(recent_used_cpu, target) &&
+           cpus_share_sis(recent_used_cpu, target) &&
            (available_idle_cpu(recent_used_cpu) || 
sched_idle_cpu(recent_used_cpu)) &&
            cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
                /*
@@ -6254,21 +6257,35 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
                return recent_used_cpu;
        }
 
-       sd = rcu_dereference(per_cpu(sd_llc, target));
-       if (!sd)
-               return target;
+       for (i = 0; ; i++) {
+               if (i == 0) {
+                       sd[0] = rcu_dereference(per_cpu(sd_sis_pre, target));
+                       if (!sd[0])
+                               continue;
+                       cpumask_and(cpus, sched_domain_span(sd[0]), 
p->cpus_ptr);
+               } else if (i == 1) {
+                       sd[1] = rcu_dereference(per_cpu(sd_sis, target));
+                       if (!sd[1])
+                               continue;
+                       cpumask_and(cpus, sched_domain_span(sd[1]), 
p->cpus_ptr);
+                       if (sd[0])
+                               cpumask_andnot(cpus, cpus, 
sched_domain_span(sd[0]));
+               } else {
+                       break;
+               }
 
-       i = select_idle_core(p, sd, target);
-       if ((unsigned)i < nr_cpumask_bits)
-               return i;
+               r = select_idle_core(p, cpus, target);
+               if ((unsigned)r < nr_cpumask_bits)
+                       return r;
 
-       i = select_idle_cpu(p, sd, target);
-       if ((unsigned)i < nr_cpumask_bits)
-               return i;
+               r = select_idle_cpu(p, cpus, (i == 1), sd[i]->span_weight, 
target);
+               if ((unsigned)r < nr_cpumask_bits)
+                       return r;
 
-       i = select_idle_smt(p, target);
-       if ((unsigned)i < nr_cpumask_bits)
-               return i;
+               r = select_idle_smt(p, target);
+               if ((unsigned)r < nr_cpumask_bits)
+                       return r;
+       }
 
        return target;
 }
@@ -6718,6 +6735,46 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, 
int sd_flag, int wake_f
        return new_cpu;
 }
 
+
+#ifdef CONFIG_SMP
+
+extern int sysctl_sched_wake_idle_domain;
+extern int sysctl_sched_wake_idle_presearch_domain;
+
+DEFINE_MUTEX(wake_idle_domain_mutex);
+
+int proc_sched_wake_idle_domain_handler(struct ctl_table *table,
+               int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table tmp = *table;
+       int *sysctl = tmp.data;
+       int val = *sysctl;
+       int min = -1, max = INT_MAX;
+       int rc;
+
+       tmp.extra1 = &min;
+       tmp.extra2 = &max;
+       tmp.data = &val;
+
+       rc = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+       if (rc || !write)
+               return rc;
+
+       mutex_lock(&wake_idle_domain_mutex);
+       *sysctl = val;
+       rebuild_sched_domains_force();
+       mutex_unlock(&wake_idle_domain_mutex);
+
+       pr_info("Idle cpu search (select_idle_sibling) domains changed to: "
+         "sched_wake_idle_domain %d sched_wake_idle_presearch_domain %d\n",
+         sysctl_sched_wake_idle_domain, 
sysctl_sched_wake_idle_presearch_domain);
+
+       return 0;
+}
+
+#endif
+
 static void detach_entity_cfs_rq(struct sched_entity *se);
 
 /*
@@ -10136,21 +10193,21 @@ static void nohz_balancer_kick(struct rq *rq)
                 * cache use, instead we want to embrace asymmetry and only
                 * ensure tasks have enough CPU capacity.
                 *
-                * Skip the LLC logic because it's not relevant in that case.
+                * Skip the sis logic because it's not relevant in that case.
                 */
                goto unlock;
        }
 
-       sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+       sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
        if (sds) {
                /*
-                * If there is an imbalance between LLC domains (IOW we could
-                * increase the overall cache use), we need some less-loaded LLC
+                * If there is an imbalance between sis domains (IOW we could
+                * increase the overall cache use), we need some less-loaded sis
                 * domain to pull some load. Likewise, we may need to spread
-                * load within the current LLC domain (e.g. packed SMT cores but
+                * load within the current sis domain (e.g. packed SMT cores but
                 * other CPUs are idle). We can't really know from here how busy
                 * the others are - so just get a nohz balance going if it looks
-                * like this LLC domain has tasks we could move.
+                * like this sis domain has tasks we could move.
                 */
                nr_busy = atomic_read(&sds->nr_busy_cpus);
                if (nr_busy > 1) {
@@ -10170,7 +10227,7 @@ static void set_cpu_sd_state_busy(int cpu)
        struct sched_domain *sd;
 
        rcu_read_lock();
-       sd = rcu_dereference(per_cpu(sd_llc, cpu));
+       sd = rcu_dereference(per_cpu(sd_sis, cpu));
 
        if (!sd || !sd->nohz_idle)
                goto unlock;
@@ -10200,7 +10257,7 @@ static void set_cpu_sd_state_idle(int cpu)
        struct sched_domain *sd;
 
        rcu_read_lock();
-       sd = rcu_dereference(per_cpu(sd_llc, cpu));
+       sd = rcu_dereference(per_cpu(sd_sis, cpu));
 
        if (!sd || sd->nohz_idle)
                goto unlock;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..641a5bacdf77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1415,10 +1415,11 @@ static inline struct sched_domain 
*lowest_flag_domain(int cpu, int flag)
        return sd;
 }
 
-DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
-DECLARE_PER_CPU(int, sd_llc_size);
-DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_sis);
+DECLARE_PER_CPU(int, sd_sis_size);
+DECLARE_PER_CPU(int, sd_sis_id);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_sis_shared);
+DECLARE_PER_CPU(struct sched_domain *, sd_sis_pre);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..bdda783c5148 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -605,41 +605,75 @@ static void destroy_sched_domains(struct sched_domain *sd)
 }
 
 /*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
- *
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * sd_sis is the select_idle_sibling search domain. It is generalized sd_llc
+ * not limited by the SD_SHARE_PKG_RESOURCE flag. With the sysctls sd_sis is
+ * also run time configurable.
+ * To limit overheads from searching / migrating among cores that don't share
+ * llc, a presearch domain can be enabled such that most searches / migrations
+ * still happen inside a smaller domain when the machine is lightly loaded.
+ *
+ * Keep a special pointer for this allows us to avoid some pointer chasing in
+ * select_idle_sibling(). Also keep a unique ID per domain (we use the first 
CPU
+ * number in the cpumask of the domain), this allows us to quickly tell if
+ * two CPUs are in the same sis domain, see cpus_share_sis().
  */
-DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
-DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_sis);
+DEFINE_PER_CPU(int, sd_sis_size);
+DEFINE_PER_CPU(int, sd_sis_id);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_sis_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
+int sysctl_sched_wake_idle_domain = -1;
+int sysctl_sched_wake_idle_presearch_domain = -1;
+DEFINE_PER_CPU(struct sched_domain *, sd_sis_pre);
+
 static void update_top_cache_domain(int cpu)
 {
        struct sched_domain_shared *sds = NULL;
-       struct sched_domain *sd;
+       struct sched_domain *sd, *sdp;
        int id = cpu;
        int size = 1;
+       int level;
+
+       if (sysctl_sched_wake_idle_domain < 0) {
+               sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+       } else {
+               level = 0;
+               for_each_domain(cpu, sd) {
+                       if (level == sysctl_sched_wake_idle_domain)
+                               break;
+                       level++;
+               }
+       }
 
-       sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
        if (sd) {
                id = cpumask_first(sched_domain_span(sd));
                size = cpumask_weight(sched_domain_span(sd));
                sds = sd->shared;
        }
 
-       rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
-       per_cpu(sd_llc_size, cpu) = size;
-       per_cpu(sd_llc_id, cpu) = id;
-       rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+       rcu_assign_pointer(per_cpu(sd_sis, cpu), sd);
+       per_cpu(sd_sis_size, cpu) = size;
+       per_cpu(sd_sis_id, cpu) = id;
+       rcu_assign_pointer(per_cpu(sd_sis_shared, cpu), sds);
+
+       sdp = NULL;
+       if (sd && sysctl_sched_wake_idle_presearch_domain >= 0) {
+               level = 0;
+               for_each_domain(cpu, sdp) {
+                       if (sdp == sd) {
+                               sdp = NULL;
+                               break;
+                       }
+                       if (level == sysctl_sched_wake_idle_presearch_domain)
+                               break;
+                       level++;
+               }
+       }
+       rcu_assign_pointer(per_cpu(sd_sis_pre, cpu), sdp);
 
        sd = lowest_flag_domain(cpu, SD_NUMA);
        rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -1400,14 +1434,12 @@ sd_init(struct sched_domain_topology_level *tl,
        }
 
        /*
-        * For all levels sharing cache; connect a sched_domain_shared
-        * instance.
+        * Connect sched_domain_shared instances. As sd_sis can be changed at 
run
+        * time, link all domains.
         */
-       if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-               sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-               atomic_inc(&sd->shared->ref);
-               atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-       }
+       sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+       atomic_inc(&sd->shared->ref);
+       atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
 
        sd->private = sdd;
 
@@ -2204,7 +2236,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, 
int idx_cur,
  * Call with hotplug lock and sched_domains_mutex held
  */
 void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
-                                   struct sched_domain_attr *dattr_new)
+                                   struct sched_domain_attr *dattr_new, int 
force_update)
 {
        bool __maybe_unused has_eas = false;
        int i, j, n;
@@ -2217,6 +2249,7 @@ void partition_sched_domains_locked(int ndoms_new, 
cpumask_var_t doms_new[],
 
        /* Let the architecture update CPU core mappings: */
        new_topology = arch_update_cpu_topology();
+       new_topology |= force_update;
 
        if (!doms_new) {
                WARN_ON_ONCE(dattr_new);
@@ -2310,9 +2343,9 @@ void partition_sched_domains_locked(int ndoms_new, 
cpumask_var_t doms_new[],
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-                            struct sched_domain_attr *dattr_new)
+                            struct sched_domain_attr *dattr_new, int 
force_update)
 {
        mutex_lock(&sched_domains_mutex);
-       partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+       partition_sched_domains_locked(ndoms_new, doms_new, dattr_new, 
force_update);
        mutex_unlock(&sched_domains_mutex);
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..b474851e1a66 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -144,6 +144,10 @@ static const int cap_last_cap = CAP_LAST_CAP;
 #ifdef CONFIG_DETECT_HUNG_TASK
 static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
 #endif
+#ifdef CONFIG_SMP
+extern int sysctl_sched_wake_idle_domain;
+extern int sysctl_sched_wake_idle_presearch_domain;
+#endif
 
 #ifdef CONFIG_INOTIFY_USER
 #include <linux/inotify.h>
@@ -202,6 +206,11 @@ static int max_extfrag_threshold = 1000;
 
 #endif /* CONFIG_SYSCTL */
 
+#ifdef CONFIG_SMP
+int proc_sched_wake_idle_domain_handler(struct ctl_table *table,
+               int write, void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
 static int bpf_stats_handler(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp,
@@ -1834,6 +1843,22 @@ static struct ctl_table kern_table[] = {
                .extra2         = SYSCTL_ONE,
        },
 #endif
+#ifdef CONFIG_SMP
+       {
+               .procname       = "sched_wake_idle_domain",
+               .data           = &sysctl_sched_wake_idle_domain,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_sched_wake_idle_domain_handler,
+       },
+       {
+               .procname       = "sched_wake_idle_presearch_domain",
+               .data           = &sysctl_sched_wake_idle_presearch_domain,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_sched_wake_idle_domain_handler,
+       },
+#endif
 #ifdef CONFIG_PROVE_LOCKING
        {
                .procname       = "prove_locking",
-- 
2.28.0.rc0.142.g3c755180ce-goog

Reply via email to