On 09/27/2017 05:35 AM, Peter Zijlstra wrote:
On Fri, Sep 22, 2017 at 12:12:45PM -0400, Eric Farman wrote:

MySQL.  We've tried a few different configs with both test=oltp and
test=threads, but both show the same behavior.  What I have settled on for
my repro is the following:


Right, didn't even need to run it in a guest to observe a regression.

So the below cures native sysbench and NAS bench for me, does it also
work for your virt thingy?


Ran a quick test this morning with 4.13.0 + 90001d67be2f + a731ebe6f17b and then with/without this patch. An oltp sysbench run shows that guest cpu migrations decreased significantly, from ~27K to ~2K over 5 seconds.

So, we applied this patch to linux-next (next-20170926) and ran it against a couple of sysbench tests:

--test=oltp
Baseline:       5655.26 transactions/second
Patched:        9618.13 transactions/second

--test=threads
Baseline:       25482.9 events/sec
Patched:        29577.9 events/sec

That's good!  With that...

Tested-by: Eric Farman <far...@linux.vnet.ibm.com>

Thanks!

 - Eric


PRE (current tip/master):

ivb-ex sysbench:

   2: [30 secs]     transactions:                        64110  (2136.94 per sec.)
   5: [30 secs]     transactions:                        143644 (4787.99 per sec.)
  10: [30 secs]     transactions:                        274298 (9142.93 per sec.)
  20: [30 secs]     transactions:                        418683 (13955.45 per sec.)
  40: [30 secs]     transactions:                        320731 (10690.15 per sec.)
  80: [30 secs]     transactions:                        355096 (11834.28 per sec.)

hsw-ex NAS:

OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds =                    18.01
OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds =                    17.89
OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds =                    17.93
lu.C.x_threads_144_run_1.log: Time in seconds =                   434.68
lu.C.x_threads_144_run_2.log: Time in seconds =                   405.36
lu.C.x_threads_144_run_3.log: Time in seconds =                   433.83


POST (+patch):

ivb-ex sysbench:

   2: [30 secs]     transactions:                        64494  (2149.75 per sec.)
   5: [30 secs]     transactions:                        145114 (4836.99 per sec.)
  10: [30 secs]     transactions:                        278311 (9276.69 per sec.)
  20: [30 secs]     transactions:                        437169 (14571.60 per sec.)
  40: [30 secs]     transactions:                        669837 (22326.73 per sec.)
  80: [30 secs]     transactions:                        631739 (21055.88 per sec.)

hsw-ex NAS:

lu.C.x_threads_144_run_1.log: Time in seconds =                    23.36
lu.C.x_threads_144_run_2.log: Time in seconds =                    22.96
lu.C.x_threads_144_run_3.log: Time in seconds =                    22.52


This patch takes out all the shiny wake_affine stuff and goes back to
utter basics. Rik, was there another NUMA benchmark that wanted your
fancy stuff? Because NAS isn't it.

(the previous, slightly fancier wake_affine was basically a !idle
extension of this, in that it would pick the 'shortest' of the 2 queues
and thereby run quickest, in approximation)
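
For clarity, this is the core of the decision the patch below reduces
wake_affine() to (a minimal sketch; the schedstat accounting kept in the
full patch is omitted here):

	bool affine = false;

	/* The waking CPU is idle: pull the task over. */
	if (idle_cpu(this_cpu))
		affine = true;
	/*
	 * Sync wakeup and the waker is the only runnable task on the
	 * waking CPU: it hints it is about to sleep, so pulling the
	 * wakee there is fine too.
	 */
	else if (sync && cpu_rq(this_cpu)->nr_running == 1)
		affine = true;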

I'll try and run a number of other benchmarks I have around to see if
there's anything that shows a difference between the below trivial
wake_affine and the old 2-cpu-load one.

---
  include/linux/sched/topology.h |   8 ---
  kernel/sched/fair.c            | 125 ++---------------------------------------
  2 files changed, 6 insertions(+), 127 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
        atomic_t        ref;
        atomic_t        nr_busy_cpus;
        int             has_idle_cores;
-
-       /*
-        * Some variables from the most recent sd_lb_stats for this domain,
-        * used by wake_affine().
-        */
-       unsigned long   nr_running;
-       unsigned long   load;
-       unsigned long   capacity;
  };

  struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..66930ce338af 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,19 @@ static int wake_wide(struct task_struct *p)
        return 1;
  }

-struct llc_stats {
-       unsigned long   nr_running;
-       unsigned long   load;
-       unsigned long   capacity;
-       int             has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-       struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-       if (!sds)
-               return false;
-
-       stats->nr_running    = READ_ONCE(sds->nr_running);
-       stats->load          = READ_ONCE(sds->load);
-       stats->capacity              = READ_ONCE(sds->capacity);
-       stats->has_capacity  = stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-       return true;
-}
-
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
-static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-               int this_cpu, int prev_cpu, int sync)
-{
-       struct llc_stats prev_stats, this_stats;
-       s64 this_eff_load, prev_eff_load;
-       unsigned long task_load;
-
-       if (!get_llc_stats(&prev_stats, prev_cpu) ||
-           !get_llc_stats(&this_stats, this_cpu))
-               return false;
-
-       /*
-        * If sync wakeup then subtract the (maximum possible)
-        * effect of the currently running task from the load
-        * of the current LLC.
-        */
-       if (sync) {
-               unsigned long current_load = task_h_load(current);
-
-               /* in this case load hits 0 and this LLC is considered 'idle' */
-               if (current_load > this_stats.load)
-                       return true;
-
-               this_stats.load -= current_load;
-       }
-
-       /*
-        * The has_capacity stuff is not SMT aware, but by trying to balance
-        * the nr_running on both ends we try and fill the domain at equal
-        * rates, thereby first consuming cores before siblings.
-        */
-
-       /* if the old cache has capacity, stay there */
-       if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-               return false;
-
-       /* if this cache has capacity, come here */
-       if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-               return true;
-
-       /*
-        * Check to see if we can move the load without causing too much
-        * imbalance.
-        */
-       task_load = task_h_load(p);
-
-       this_eff_load = 100;
-       this_eff_load *= prev_stats.capacity;
-
-       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-       prev_eff_load *= this_stats.capacity;
-
-       this_eff_load *= this_stats.load + task_load;
-       prev_eff_load *= prev_stats.load - task_load;
-
-       return this_eff_load <= prev_eff_load;
-}
-
  static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                       int prev_cpu, int sync)
  {
        int this_cpu = smp_processor_id();
-       bool affine;
-
-       /*
-        * Default to no affine wakeups; wake_affine() should not effect a task
-        * placement the load-balancer feels inclined to undo. The conservative
-        * option is therefore to not move tasks when they wake up.
-        */
-       affine = false;
+       bool affine = false;

        /*
-        * If the wakeup is across cache domains, try to evaluate if movement
-        * makes sense, otherwise rely on select_idle_siblings() to do
-        * placement inside the cache domain.
+        * If we can run _now_ on the waking CPU, go there, otherwise meh.
         */
-       if (!cpus_share_cache(prev_cpu, this_cpu))
-               affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+       if (idle_cpu(this_cpu))
+               affine = true;
+       else if (sync && cpu_rq(this_cpu)->nr_running == 1)
+               affine = true;

        schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
        if (affine) {
@@ -7600,7 +7504,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
   */
  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
  {
-       struct sched_domain_shared *shared = env->sd->shared;
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7575,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                if (env->dst_rq->rd->overload != overload)
                        env->dst_rq->rd->overload = overload;
        }
-
-       if (!shared)
-               return;
-
-       /*
-        * Since these are sums over groups they can contain some CPUs
-        * multiple times for the NUMA domains.
-        *
-        * Currently only wake_affine_llc() and find_busiest_group()
-        * uses these numbers, only the last is affected by this problem.
-        *
-        * XXX fix that.
-        */
-       WRITE_ONCE(shared->nr_running,       sds->total_running);
-       WRITE_ONCE(shared->load,     sds->total_load);
-       WRITE_ONCE(shared->capacity, sds->total_capacity);
  }

  /**

