Le mercredi 07 avril 2021 à 09:17:18 (+0200), Peter Zijlstra a écrit : > On Tue, Apr 06, 2021 at 11:26:37AM -0400, Rik van Riel wrote: > > I would be happy to pull the static branch out of select_idle_smt() > > and place it into this if condition, though. You are right that > > would save some overhead on non-smt systems. > > > > Peter, would you prefer a follow-up patch for that or a version 4 > > of the patch? > > Sorry, I was side-tracked with that core scheduling crap.. Something > like the below then? > > (Also fixed that stray line-wrap) > > --- > Subject: sched/fair: Bring back select_idle_smt(), but differently > From: Rik van Riel <r...@surriel.com> > Date: Fri, 26 Mar 2021 15:19:32 -0400 > > From: Rik van Riel <r...@surriel.com> > > Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge > select_idle_core/cpu()"), resulting in the kernel being more efficient > at finding an idle CPU, and in tasks spending less time waiting to be > run, both according to the schedstats run_delay numbers, and according > to measured application latencies. Yay. > > The flip side of this is that we see more task migrations (about 30% > more), higher cache misses, higher memory bandwidth utilization, and > higher CPU use, for the same number of requests/second. > > This is most pronounced on a memcache type workload, which saw a > consistent 1-3% increase in total CPU use on the system, due to those > increased task migrations leading to higher L2 cache miss numbers, and > higher memory utilization. The exclusive L3 cache on Skylake does us > no favors there. > > On our web serving workload, that effect is usually negligible. > > It appears that the increased number of CPU migrations is generally a > good thing, since it leads to lower cpu_delay numbers, reflecting the > fact that tasks get to run faster. However, the reduced locality and > the corresponding increase in L2 cache misses hurts a little. > > The patch below appears to fix the regression, while keeping the > benefit of the lower cpu_delay numbers, by reintroducing > select_idle_smt with a twist: when a socket has no idle cores, check > to see if the sibling of "prev" is idle, before searching all the > other CPUs. > > This fixes both the occasional 9% regression on the web serving > workload, and the continuous 2% CPU use regression on the memcache > type workload. > > With Mel's patches and this patch together, task migrations are still > high, but L2 cache misses, memory bandwidth, and CPU time used are > back down to what they were before. The p95 and p99 response times for > the memcache type application improve by about 10% over what they were > before Mel's patches got merged. > > Signed-off-by: Rik van Riel <r...@surriel.com> > Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org> > Link: https://lkml.kernel.org/r/20210326151932.2c187...@imladris.surriel.com > --- > kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++-- > 1 file changed, 37 insertions(+), 2 deletions(-) > > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -6112,6 +6112,27 @@ static int select_idle_core(struct task_ > return -1; > } > > +/* > + * Scan the local SMT mask for idle CPUs. > + */ > +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, > int target) > +{ > + int cpu; > + > + if (!static_branch_likely(&sched_smt_present)) > + return -1; > + > + for_each_cpu(cpu, cpu_smt_mask(target)) { > + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || > + !cpumask_test_cpu(cpu, sched_domain_span(sd))) > + continue; > + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) > + return cpu; > + } > + > + return -1; > +} > + > #else /* CONFIG_SCHED_SMT */ > > static inline void set_idle_cores(int cpu, int val) > @@ -6128,6 +6149,11 @@ static inline int select_idle_core(struc > return __select_idle_cpu(core); > } > > +static inline int select_idle_smt(struct task_struct *p, struct sched_domain > *sd, int target) > +{ > + return -1; > +} > + > #endif /* CONFIG_SCHED_SMT */ > > /* > @@ -6135,7 +6161,7 @@ static inline int select_idle_core(struc > * comparing the average scan cost (tracked in sd->avg_scan_cost) against the > * average idle time for this rq (as found in rq->avg_idle). > */ > -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, > int target) > +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, > int prev, int target) > { > struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); > int i, cpu, idle_cpu = -1, nr = INT_MAX; > @@ -6148,6 +6174,15 @@ static int select_idle_cpu(struct task_s > if (!this_sd) > return -1; > > + /* If we have SMT but there are no idle cores */ > + if (static_branch_likely(&sched_smt_presernt) && !smt) { > + if (cpus_share_cache(prev, target)) { > + i = select_idle_smt(p, sd, prev); > + if ((unsigned int)i < nr_cpumask_bits) > + return i; > + } > + } > + > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); > > if (sched_feat(SIS_PROP) && !smt) { > @@ -6321,7 +6356,7 @@ static int select_idle_sibling(struct ta > if (!sd) > return target; > > - i = select_idle_cpu(p, sd, target); > + i = select_idle_cpu(p, sd, prev, target); > if ((unsigned)i < nr_cpumask_bits) > return i;
I would really prefer to keep that out of select_idle_cpu which aims to merge in one single loop the walk through sd_llc. In the case of select_idle_smt, this is done outside the loop: I would prefer the below which also removed a number of useless and duplicated static_branch_likely(&sched_smt_presernt) in test_idle_cores and select_idle_smt --- kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6d73bdbb2d40..01d0bacedc8d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int cpu, bool def) { struct sched_domain_shared *sds; - if (static_branch_likely(&sched_smt_present)) { - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) - return READ_ONCE(sds->has_idle_cores); - } + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); return def; } @@ -6112,6 +6110,25 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } +/* + * Scan the local SMT mask for idle CPUs. + */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int +target) +{ + int cpu; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + } + + return -1; +} + #else /* CONFIG_SCHED_SMT */ static inline void set_idle_cores(int cpu, int val) @@ -6128,6 +6145,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core); } +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + #endif /* CONFIG_SCHED_SMT */ /* @@ -6135,11 +6157,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool smt, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - bool smt = test_idle_cores(target, false); int this = smp_processor_id(); struct sched_domain *this_sd; u64 time; @@ -6242,6 +6263,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu) */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { + bool smt = false; struct sched_domain *sd; unsigned long task_util; int i, recent_used_cpu; @@ -6317,11 +6339,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } + if (static_branch_likely(&sched_smt_present)) { + smt = test_idle_cores(target, false); + if (!smt && cpus_share_cache(prev, target)) { + /* No idle core. Check if prev has an idle sibling. */ + i = select_idle_smt(p, sd, prev); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; - i = select_idle_cpu(p, sd, target); + i = select_idle_cpu(p, sd, smt, target); if ((unsigned)i < nr_cpumask_bits) return i; -- 2.17.1 >