Here's the debug patch.

The debug condition triggers when the load balancer picks a busiest group
that has no CPU running more than one schbench thread over a group that
does have such a CPU.
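
For example: group A has a CPU running two schbench threads while every
CPU in group B runs at most one, yet update_sd_pick_busiest() ends up
keeping (or picking) B as the busiest group.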

 /sys/module/fair/parameters/dbg_odd_cnt: resettable counter
 /sys/module/fair/parameters/dbg_odd_nth: dump group states on Nth
                                          occurrence via trace_printk()
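
 E.g. "echo 10 > /sys/module/fair/parameters/dbg_odd_nth" dumps every
 10th occurrence; "echo 0 > /sys/module/fair/parameters/dbg_odd_cnt"
 resets the counter.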

Loads and weights are printed in fixed point, scaled so that NICE_0_LOAD
reads as 1.000.
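
For reference, a minimal userspace sketch of the same fixed-point
formatting (assuming NICE_0_LOAD == 1024 << 10, i.e. 64-bit load
resolution; the lbw()/lba() macros in the patch below are what the
kernel side actually uses):

  #include <stdio.h>

  #define NICE_0_LOAD (1024UL << 10)  /* assumption: 64-bit load resolution */

  /* print a load as "d.ddd" so that NICE_0_LOAD itself reads 1.000 */
  static void print_load(unsigned long x)
  {
          printf("%lu.%03lu\n", x / NICE_0_LOAD,
                 (x % NICE_0_LOAD) * 1000 / NICE_0_LOAD);
  }

  int main(void)
  {
          print_load(NICE_0_LOAD);      /* prints 1.000 */
          print_load(NICE_0_LOAD / 2);  /* prints 0.500 */
          return 0;
  }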

Thanks.
---
 kernel/sched/fair.c |  164 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 163 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/moduleparam.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
 
+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -7413,6 +7420,153 @@ static inline void update_sg_lb_stats(st
        sgs->group_type = group_classify(group, sgs);
 }
 
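+/* count the schbench threads currently queued on @rq's cfs_tasks list */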
+static int count_schb(struct rq *rq)
+{
+       unsigned long flags;
+       struct task_struct *p;
+       int cnt = 0;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+               if (!strncmp(p->comm, "schbench", 8))
+                       cnt++;
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return cnt;
+}
+
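+/* does any CPU in @sg have two or more schbench threads queued? */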
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+       int cpu;
+
+       for_each_cpu(cpu, sched_group_cpus(sg))
+               if (count_schb(cpu_rq(cpu)) >= 2)
+                       return true;
+       return false;
+}
+
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
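+/* emit "int, thousandths" printf args so that NICE_0_LOAD prints as 1.000 */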
+#define lbw(x) (int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x) (int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+                        int cnt, char *buf, size_t size)
+{
+#define odd_append(fmt, args...)       do {                            \
+       cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args);           \
+       cnt = min_t(int, cnt, size);                                    \
+} while (0)
+
+       if (entity_is_task(se)) {
+               struct task_struct *task = task_of(se);
+               odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+       } else {
+               char nbuf[64];
+               cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+               odd_append(" %s(%s)", nbuf, postfix);
+       }
+       odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+                  lbw(se->load.weight),
+                  lba(se->avg.load_avg),
+                  lba(se->avg.util_avg));
+
+       return cnt;
+}
+
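+/* dump @sg's stats plus per-cgroup queue and entity state for each CPU */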
+static void dbg_odd_dump(const char *pref,
+                        struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+       int cpu;
+
+       trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+                    cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+                    lba(sgs->avg_load), lba(sgs->group_load),
+                    lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+       trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+                    pref,
+                    lba(sgs->group_capacity), lba(sgs->group_util),
+                    sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+                    sgs->group_type, sgs->group_no_capacity);
+
+       for_each_cpu(cpu, sched_group_cpus(sg)) {
+               struct task_group *tg;
+               unsigned long flags;
+
+               trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+                            cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+               raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+               list_for_each_entry_rcu(tg, &task_groups, list) {
+                       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+                       char qname[32] = "root";
+                       int depth = 0;
+                       long tg_weight = 0, tg_shares = 0;
+                       struct sched_entity *se;
+                       char *buf = per_cpu_ptr(odd_buf, cpu);
+                       int cnt;
+
+                       if (!cfs_rq->nr_running)
+                               continue;
+
+                       if (cfs_rq->tg) {
+                               cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+                               if (cfs_rq->tg->se[cpu])
+                                       depth = cfs_rq->tg->se[cpu]->depth;
+                               tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+                               tg_shares = cfs_rq->tg->shares;
+                       }
+
+                       trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+                                    pref, cpu, qname, depth,
+                                    lbw(cfs_rq->load.weight),
+                                    lba(cfs_rq->avg.load_avg),
+                                    lba(cfs_rq->avg.util_avg),
+                                    lba(cfs_rq->runnable_load_avg),
+                                    cfs_rq->nr_running, cfs_rq->h_nr_running,
+                                    lbw(tg_shares),
+                                    lba(tg_weight));
+
+                       buf[0] = '\0';
+                       cnt = 0;
+
+                       if (cfs_rq->curr)
+                       if (cfs_rq->curr)
+                               cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+                       for (se = __pick_first_entity(cfs_rq); se;
+                            se = __pick_next_entity(se))
+                               cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+                       trace_printk("%sQ%03d-%s@%d: %s\n",
+                                    pref, cpu, qname, depth, buf);
+               }
+
+               raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+       }
+}
+
+/* @sga has a CPU with >= 2 schbench threads, @sgb doesn't, yet @sgb wins */
+static void dbg_odd(struct lb_env *env,
+                   struct sched_group *sga, struct sg_lb_stats *sgsa,
+                   struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+       if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+               return;
+
+       trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+                    env->dst_cpu, env->idle, env->loop_break,
+                    cpumask_pr_args(env->cpus), env->fbq_type);
+       dbg_odd_dump("A: ", sga, sgsa);
+       dbg_odd_dump("B: ", sgb, sgsb);
+}
+
 /**
  * update_sd_pick_busiest - return 1 on busiest group
  * @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
                                   struct sg_lb_stats *sgs)
 {
        struct sg_lb_stats *busiest = &sds->busiest_stat;
+       bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+       bool sg_has_two = sg_has_two_schb(sg);
 
        if (sgs->group_type > busiest->group_type)
                return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
        if (sgs->group_type < busiest->group_type)
                return false;
 
-       if (sgs->avg_load <= busiest->avg_load)
+       if (sgs->avg_load <= busiest->avg_load) {
+               if (sg_has_two && !busiest_has_two)
+                       dbg_odd(env, sg, sgs, sds->busiest, busiest);
                return false;
+       }
+
+       if (!sg_has_two && busiest_has_two)
+               dbg_odd(env, sds->busiest, busiest, sg, sgs);
 
        if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
                goto asym_packing;
