Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

Vincent Guittot Wed, 07 Apr 2021 02:42:29 -0700

Le mercredi 07 avril 2021 à 09:17:18 (+0200), Peter Zijlstra a écrit :
> On Tue, Apr 06, 2021 at 11:26:37AM -0400, Rik van Riel wrote:
> > I would be happy to pull the static branch out of select_idle_smt()
> > and place it into this if condition, though. You are right that
> > would save some overhead on non-smt systems.
> > 
> > Peter, would you prefer a follow-up patch for that or a version 4
> > of the patch?
> 
> Sorry, I was side-tracked with that core scheduling crap.. Something
> like the below then?
> 
> (Also fixed that stray line-wrap)
> 
> ---
> Subject: sched/fair: Bring back select_idle_smt(), but differently
> From: Rik van Riel <r...@surriel.com>
> Date: Fri, 26 Mar 2021 15:19:32 -0400
> 
> From: Rik van Riel <r...@surriel.com>
> 
> Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge
> select_idle_core/cpu()"), resulting in the kernel being more efficient
> at finding an idle CPU, and in tasks spending less time waiting to be
> run, both according to the schedstats run_delay numbers, and according
> to measured application latencies. Yay.
> 
> The flip side of this is that we see more task migrations (about 30%
> more), higher cache misses, higher memory bandwidth utilization, and
> higher CPU use, for the same number of requests/second.
> 
> This is most pronounced on a memcache type workload, which saw a
> consistent 1-3% increase in total CPU use on the system, due to those
> increased task migrations leading to higher L2 cache miss numbers, and
> higher memory utilization. The exclusive L3 cache on Skylake does us
> no favors there.
> 
> On our web serving workload, that effect is usually negligible.
> 
> It appears that the increased number of CPU migrations is generally a
> good thing, since it leads to lower cpu_delay numbers, reflecting the
> fact that tasks get to run faster. However, the reduced locality and
> the corresponding increase in L2 cache misses hurts a little.
> 
> The patch below appears to fix the regression, while keeping the
> benefit of the lower cpu_delay numbers, by reintroducing
> select_idle_smt with a twist: when a socket has no idle cores, check
> to see if the sibling of "prev" is idle, before searching all the
> other CPUs.
> 
> This fixes both the occasional 9% regression on the web serving
> workload, and the continuous 2% CPU use regression on the memcache
> type workload.
> 
> With Mel's patches and this patch together, task migrations are still
> high, but L2 cache misses, memory bandwidth, and CPU time used are
> back down to what they were before. The p95 and p99 response times for
> the memcache type application improve by about 10% over what they were
> before Mel's patches got merged.
> 
> Signed-off-by: Rik van Riel <r...@surriel.com>
> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
> Link: https://lkml.kernel.org/r/20210326151932.2c187...@imladris.surriel.com
> ---
>  kernel/sched/fair.c |   39 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 37 insertions(+), 2 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6112,6 +6112,27 @@ static int select_idle_core(struct task_
>       return -1;
>  }
>  
> +/*
> + * Scan the local SMT mask for idle CPUs.
> + */
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +{
> +     int cpu;
> +
> +     if (!static_branch_likely(&sched_smt_present))
> +             return -1;
> +
> +     for_each_cpu(cpu, cpu_smt_mask(target)) {
> +             if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> +                 !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> +                     continue;
> +             if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +                     return cpu;
> +     }
> +
> +     return -1;
> +}
> +
>  #else /* CONFIG_SCHED_SMT */
>  
>  static inline void set_idle_cores(int cpu, int val)
> @@ -6128,6 +6149,11 @@ static inline int select_idle_core(struc
>       return __select_idle_cpu(core);
>  }
>  
> +static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
> *sd, int target)
> +{
> +     return -1;
> +}
> +
>  #endif /* CONFIG_SCHED_SMT */
>  
>  /*
> @@ -6135,7 +6161,7 @@ static inline int select_idle_core(struc
>   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
>   * average idle time for this rq (as found in rq->avg_idle).
>   */
> -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int prev, int target)
>  {
>       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
>       int i, cpu, idle_cpu = -1, nr = INT_MAX;
> @@ -6148,6 +6174,15 @@ static int select_idle_cpu(struct task_s
>       if (!this_sd)
>               return -1;
>  
> +     /* If we have SMT but there are no idle cores */
> +     if (static_branch_likely(&sched_smt_presernt) && !smt) {
> +             if (cpus_share_cache(prev, target)) {
> +                     i = select_idle_smt(p, sd, prev);
> +                     if ((unsigned int)i < nr_cpumask_bits)
> +                             return i;
> +             }
> +     }
> +
>       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>  
>       if (sched_feat(SIS_PROP) && !smt) {
> @@ -6321,7 +6356,7 @@ static int select_idle_sibling(struct ta
>       if (!sd)
>               return target;
>  
> -     i = select_idle_cpu(p, sd, target);
> +     i = select_idle_cpu(p, sd, prev, target);
>       if ((unsigned)i < nr_cpumask_bits)
>               return i;


I would really prefer to keep that out of select_idle_cpu which aims to merge 
in one
single loop the walk through sd_llc. In the case of select_idle_smt, this is 
done outside
the loop:

I would prefer the below which also removed a number of useless and duplicated 
static_branch_likely(&sched_smt_presernt) in test_idle_cores and select_idle_smt

---
 kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d73bdbb2d40..01d0bacedc8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int cpu, bool def)
 {
        struct sched_domain_shared *sds;
 
-       if (static_branch_likely(&sched_smt_present)) {
-               sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-               if (sds)
-                       return READ_ONCE(sds->has_idle_cores);
-       }
+       sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+       if (sds)
+               return READ_ONCE(sds->has_idle_cores);
 
        return def;
 }
@@ -6112,6 +6110,25 @@ static int select_idle_core(struct task_struct *p, int 
core, struct cpumask *cpu
        return -1;
 }
 
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int
+target)
+{
+       int cpu;
+
+       for_each_cpu(cpu, cpu_smt_mask(target)) {
+               if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+                   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+                       continue;
+               if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+                       return cpu;
+       }
+
+       return -1;
+}
+
 #else /* CONFIG_SCHED_SMT */
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6128,6 +6145,11 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
        return __select_idle_cpu(core);
 }
 
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
*sd, int target)
+{
+       return -1;
+}
+
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6135,11 +6157,10 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
bool smt, int target)
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
-       bool smt = test_idle_cores(target, false);
        int this = smp_processor_id();
        struct sched_domain *this_sd;
        u64 time;
@@ -6242,6 +6263,7 @@ static inline bool asym_fits_capacity(int task_util, int 
cpu)
  */
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
+       bool smt = false;
        struct sched_domain *sd;
        unsigned long task_util;
        int i, recent_used_cpu;
@@ -6317,11 +6339,21 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
                }
        }
 
+       if (static_branch_likely(&sched_smt_present)) {
+               smt = test_idle_cores(target, false);
+               if (!smt && cpus_share_cache(prev, target)) {
+                       /* No idle core. Check if prev has an idle sibling. */
+                       i = select_idle_smt(p, sd, prev);
+                       if ((unsigned int)i < nr_cpumask_bits)
+                               return i;
+               }
+       }
+
        sd = rcu_dereference(per_cpu(sd_llc, target));
        if (!sd)
                return target;
 
-       i = select_idle_cpu(p, sd, target);
+       i = select_idle_cpu(p, sd, smt, target);
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
-- 
2.17.1

>

Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

Reply via email to