On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote:
> Command for TCP:
> "netperf TCP_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t
> TCP_STREAM -l 10 -- -O THROUGHPUT)"
> Command for TIPC:
> "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t
> TCP_STREAM -l 10 -- -O THROUGHPUT)"
That looks like identical tests to me. And my netperf (debian testing)
doesn't appear to have -t TIPC_STREAM.

Please try a coherent report and I'll have another look. Don't (again)
forget to mention what kind of setup you're running this on.

On my IVB-EP (2 sockets, 10 cores, 2 threads), performance cpufreq,
PTI=n RETPOLINE=n, I get:

CPUS=`grep -c ^processor /proc/cpuinfo`

for test in TCP_STREAM
do
        for i in 1 $((CPUS/4)) $((CPUS/2)) $((CPUS)) $((CPUS*2))
        do
                echo -n $test-$i ": "

                (
                        for ((j=0; j<i; j++))
                        do
                                netperf -t $test -4 -c -C -l 60 -P0 | head -1 &
                        done
                        wait
                ) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }'
        done
done

NO_WA_OLD WA_IDLE WA_WEIGHT:

TCP_STREAM-1  : Avg: 44139.8
TCP_STREAM-10 : Avg: 27301.6
TCP_STREAM-20 : Avg: 12701.5
TCP_STREAM-40 : Avg: 5711.62
TCP_STREAM-80 : Avg: 2870.16

WA_OLD NO_WA_IDLE NO_WA_WEIGHT:

TCP_STREAM-1  : Avg: 25293.1
TCP_STREAM-10 : Avg: 28196.3
TCP_STREAM-20 : Avg: 12463.7
TCP_STREAM-40 : Avg: 5566.83
TCP_STREAM-80 : Avg: 2630.03
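The feature labels above refer to the scheduler feature bits introduced by
the patch below; presumably the two result sets were taken by flipping them
at runtime. With CONFIG_SCHED_DEBUG that is done through debugfs, e.g.:

  echo WA_OLD > /sys/kernel/debug/sched_features
  echo NO_WA_IDLE > /sys/kernel/debug/sched_features

where writing the plain name enables a feature and the NO_ prefixed form
disables it.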
---
 include/linux/sched/topology.h |  4 ++
 kernel/sched/fair.c            | 99 +++++++++++++++++++++++++++++++++++++-----
 kernel/sched/features.h        |  2 +
 3 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 26347741ba50..2cb74343c252 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,10 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..4a561311241a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
+struct llc_stats {
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
+	int		has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+	if (!sds)
+		return false;
+
+	stats->nr_running	= READ_ONCE(sds->nr_running);
+	stats->load		= READ_ONCE(sds->load);
+	stats->capacity		= READ_ONCE(sds->capacity);
+	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+	return true;
+}
+
+static int
+wake_affine_old(struct sched_domain *sd, struct task_struct *p,
+		int this_cpu, int prev_cpu, int sync)
+{
+	struct llc_stats prev_stats, this_stats;
+	s64 this_eff_load, prev_eff_load;
+	unsigned long task_load;
+
+	if (!get_llc_stats(&prev_stats, prev_cpu) ||
+	    !get_llc_stats(&this_stats, this_cpu))
+		return nr_cpumask_bits;
+
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		if (current_load > this_stats.load)
+			return this_cpu;
+
+		this_stats.load -= current_load;
+	}
+
+	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+		return nr_cpumask_bits;
+
+	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+		return this_cpu;
+
+	task_load = task_h_load(p);
+
+	this_eff_load = 100;
+	this_eff_load *= prev_stats.capacity;
+
+	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= this_stats.capacity;
+
+	this_eff_load *= this_stats.load + task_load;
+	prev_eff_load *= prev_stats.load - task_load;
+
+	return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+}
+
 /*
  * The purpose of wake_affine() is to quickly determine on which CPU we can run
  * soonest. For the purpose of speed we only consider the waking and previous
@@ -5756,6 +5818,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	int this_cpu = smp_processor_id();
 	int target = nr_cpumask_bits;
 
+	if (sched_feat(WA_OLD))
+		target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync);
+
 	if (sched_feat(WA_IDLE))
 		target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
@@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate */
-	recent_used_cpu = p->recent_used_cpu;
-	if (recent_used_cpu != prev &&
-	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
-	    idle_cpu(recent_used_cpu) &&
-	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
-		/*
-		 * Replace recent_used_cpu with prev as it is a potential
-		 * candidate for the next wake.
-		 */
-		p->recent_used_cpu = prev;
-		return recent_used_cpu;
+	if (sched_feat(SIS_RECENT)) {
+		recent_used_cpu = p->recent_used_cpu;
+		if (recent_used_cpu != prev &&
+		    recent_used_cpu != target &&
+		    cpus_share_cache(recent_used_cpu, target) &&
+		    idle_cpu(recent_used_cpu) &&
+		    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+			/*
+			 * Replace recent_used_cpu with prev as it is a potential
+			 * candidate for the next wake.
+			 */
+			p->recent_used_cpu = prev;
+			return recent_used_cpu;
+		}
 	}
 
 	sd = rcu_dereference(per_cpu(sd_llc, target));
@@ -7961,6 +8028,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -8032,6 +8100,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
+
+	if (!shared)
+		return;
+
+	WRITE_ONCE(shared->nr_running,	sds->total_running);
+	WRITE_ONCE(shared->load,	sds->total_load);
+	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..bdb0a66caaae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_RECENT, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
@@ -82,6 +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_OLD, false)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
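As a side note, the weight comparison at the tail of wake_affine_old() is
what decides between the two LLCs once neither capacity short-cut has fired.
Below is a minimal userspace sketch of just that step, with the has_capacity
short-cuts and the sync adjustment stripped out; the loads, capacities and
the imbalance_pct value are made-up illustration inputs, not kernel defaults:

#include <stdio.h>

/* The two per-LLC inputs the final comparison uses. */
struct llc_stats {
	unsigned long load;
	unsigned long capacity;
};

/*
 * Prefer the waking CPU's LLC when its load/capacity ratio, with the
 * task's load added, stays at or below the previous LLC's ratio (with
 * the task's load removed), the latter biased up by half the domain's
 * imbalance percentage.  Cross-multiplied so no division is needed,
 * mirroring the s64 arithmetic in the patch.
 */
static int prefer_this_llc(struct llc_stats *this_stats,
			   struct llc_stats *prev_stats,
			   unsigned long task_load, int imbalance_pct)
{
	long long this_eff_load = 100;
	long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;

	this_eff_load *= prev_stats->capacity;
	prev_eff_load *= this_stats->capacity;

	this_eff_load *= this_stats->load + task_load;
	prev_eff_load *= prev_stats->load - task_load;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	/* Made-up numbers; the previous LLC is noticeably busier. */
	struct llc_stats this_stats = { .load = 1024, .capacity = 10240 };
	struct llc_stats prev_stats = { .load = 4096, .capacity = 10240 };

	/* 117 is only an example imbalance_pct; this prints 1: pull the task. */
	printf("%d\n", prefer_this_llc(&this_stats, &prev_stats, 512, 117));
	return 0;
}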