This patch adds power aware scheduling in fork/exec/wake. It tries to select a CPU from the busiest group that still has spare utilization, which lets the other groups save power.
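For illustration only, the intended selection can be pictured with the standalone sketch below; the types and names (group_stat, pick_group) are made up for this example and are not the kernel code added by the patch:

#include <limits.h>
#include <stdio.h>

/* Per-group summary: CPU count, scaled capacity, and how many tasks are
 * currently running (sum of rq->nr_running over the group). */
struct group_stat {
	unsigned int nr_cpus;	/* group weight */
	unsigned int capacity;	/* capacity in task slots */
	unsigned int utils;	/* sum of nr_running in the group */
};

/*
 * Pick the group with the smallest positive vacancy, i.e. the busiest
 * group that can still take one more task.  powersaving packs up to one
 * task per CPU; balance packs up to the group capacity.
 */
static int pick_group(const struct group_stat *g, int nr, int powersaving)
{
	long best_delta = LONG_MAX;
	int i, best = -1;

	for (i = 0; i < nr; i++) {
		long threshold = powersaving ? g[i].nr_cpus : g[i].capacity;
		long delta = threshold - (long)g[i].utils;

		if (delta > 0 && delta < best_delta) {
			best_delta = delta;
			best = i;
		}
	}
	return best;	/* -1 means every group is already full */
}

int main(void)
{
	struct group_stat groups[] = {
		{ 4, 4, 3 },	/* nearly full group: preferred target */
		{ 4, 4, 0 },	/* idle group: left alone to save power */
	};

	printf("target group: %d\n", pick_group(groups, 2, 1));
	return 0;
}

A new task is packed into the nearly full group while it still has a vacancy, so the idle group can stay in a low power state.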
The trade-off is the extra power aware statistics collection during group seeking. But since the collection only happens when power scheduling is eligible, the worst case in hackbench testing drops only about 2% with the powersaving/balance policies. There is no clear change for the performance policy.

When the system sees a burst of forks, the new tasks' utilization may be zero (rq->util == 0). That sends the new tasks to a few idle CPUs, from which they are then migrated elsewhere by the periodic load balance; that helps neither power nor performance. So this patch doesn't use rq->util to judge whether a CPU has a vacancy; instead it uses the rq's nr_running (see the standalone sketch appended after the patch). BTW, I had tried tracking fork bursts, e.g. using nr_running only when the system sees 2 or more forks in the same tick. But it is still bad, since the runnable load average tracks about 4s of rq utilization, so caring about a single tick is far from enough.

Signed-off-by: Alex Shi <alex....@intel.com>
---
 kernel/sched/fair.c |  230 +++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 179 insertions(+), 51 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4cc1764..729f35d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3092,25 +3092,189 @@ done:
 }
 
 /*
- * sched_balance_self: balance the current task (running on cpu) in domains
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest;	/* Busiest group in this sd */
+	struct sched_group *this;	/* Local group in this sd */
+	unsigned long total_load;	/* Total load of all groups in sd */
+	unsigned long total_pwr;	/* Total power of all groups in sd */
+	unsigned long avg_load;		/* Average load across all groups in sd */
+
+	/** Statistics of this group */
+	unsigned long this_load;
+	unsigned long this_load_per_task;
+	unsigned long this_nr_running;
+	unsigned int  this_has_capacity;
+	unsigned int  this_idle_cpus;
+
+	/* Statistics of the busiest group */
+	unsigned int  busiest_idle_cpus;
+	unsigned long max_load;
+	unsigned long busiest_load_per_task;
+	unsigned long busiest_nr_running;
+	unsigned long busiest_group_capacity;
+	unsigned int  busiest_has_capacity;
+	unsigned int  busiest_group_weight;
+
+	int group_imb; /* Is there imbalance in this sd */
+
+	/* Variables of power-aware scheduling */
+	unsigned int  sd_utils;		/* sum utilizations of this domain */
+	unsigned long sd_capacity;	/* capacity of this domain */
+	struct sched_group *group_leader; /* Group which relieves group_min */
+	unsigned long min_load_per_task; /* load_per_task in group_min */
+	unsigned int  leader_util;	/* sum utilizations of group_leader */
+	unsigned int  min_util;		/* sum utilizations of group_min */
+};
+
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+	unsigned long avg_load; /*Avg load across the CPUs of the group */
+	unsigned long group_load; /* Total load over the CPUs of the group */
+	unsigned long sum_nr_running; /* Nr tasks running in the group */
+	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+	unsigned long group_capacity;
+	unsigned long idle_cpus;
+	unsigned long group_weight;
+	int group_imb; /* Is there an imbalance in the group ? */
+	int group_has_capacity; /* Is there extra capacity in the group? */
+	unsigned int group_utils;	/* sum utilizations of group */
+
+	unsigned long sum_shared_running;	/* 0 on non-NUMA */
+};
+
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group);
+
+/*
+ * Try to collect the task running number and capacity of the group.
+ */
+static void get_sg_power_stats(struct sched_group *group,
+	struct sched_domain *sd, struct sg_lb_stats *sgs)
+{
+	int i;
+
+	for_each_cpu(i, sched_group_cpus(group)) {
+		struct rq *rq = cpu_rq(i);
+
+		sgs->group_utils += rq->nr_running;
+	}
+
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
+						SCHED_POWER_SCALE);
+	if (!sgs->group_capacity)
+		sgs->group_capacity = fix_small_capacity(sd, group);
+	sgs->group_weight = group->group_weight;
+}
+
+/*
+ * Try to collect the task running number and capacity of the domain.
+ */
+static void get_sd_power_stats(struct sched_domain *sd,
+		struct task_struct *p, struct sd_lb_stats *sds)
+{
+	struct sched_group *group;
+	struct sg_lb_stats sgs;
+	int sd_min_delta = INT_MAX;
+	int cpu = task_cpu(p);
+
+	group = sd->groups;
+	do {
+		long g_delta;
+		unsigned long threshold;
+
+		if (!cpumask_test_cpu(cpu, sched_group_mask(group)))
+			continue;
+
+		memset(&sgs, 0, sizeof(sgs));
+		get_sg_power_stats(group, sd, &sgs);
+
+		if (sched_policy == SCHED_POLICY_POWERSAVING)
+			threshold = sgs.group_weight;
+		else
+			threshold = sgs.group_capacity;
+
+		g_delta = threshold - sgs.group_utils;
+
+		if (g_delta > 0 && g_delta < sd_min_delta) {
+			sd_min_delta = g_delta;
+			sds->group_leader = group;
+		}
+
+		sds->sd_utils += sgs.group_utils;
+		sds->total_pwr += group->sgp->power;
+	} while (group = group->next, group != sd->groups);
+
+	sds->sd_capacity = DIV_ROUND_CLOSEST(sds->total_pwr,
+						SCHED_POWER_SCALE);
+}
+
+/*
+ * Execute power policy if this domain is not full.
+ */
+static inline int get_sd_sched_policy(struct sched_domain *sd,
+		int cpu, struct task_struct *p, struct sd_lb_stats *sds)
+{
+	unsigned long threshold;
+
+	if (sched_policy == SCHED_POLICY_PERFORMANCE)
+		return SCHED_POLICY_PERFORMANCE;
+
+	if (sched_policy == SCHED_POLICY_POWERSAVING)
+		threshold = sd->span_weight;
+	else
+		threshold = sds->sd_capacity;
+
+	memset(sds, 0, sizeof(*sds));
+	get_sd_power_stats(sd, p, sds);
+
+	/* still can hold one more task in this domain */
+	if (sds->sd_utils < threshold)
+		return sched_policy;
+
+	return SCHED_POLICY_PERFORMANCE;
+}
+
+/*
+ * If the power policy is eligible for this domain and the task is allowed
+ * to run on one of its CPUs, select the CPU from this domain.
+ */
+static int get_cpu_for_power_policy(struct sched_domain *sd, int cpu,
+		struct task_struct *p, struct sd_lb_stats *sds)
+{
+	int policy;
+	int new_cpu = -1;
+
+	policy = get_sd_sched_policy(sd, cpu, p, sds);
+	if (policy != SCHED_POLICY_PERFORMANCE && sds->group_leader) {
+		new_cpu = find_idlest_cpu(sds->group_leader, p, cpu);
+	}
+	return new_cpu;
+}
+
+/*
+ * select_task_rq_fair: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
  *
- * Balance, ie. select the least loaded group.
- *
  * Returns the target CPU number, or the same CPU if no balancing is needed.
  *
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
 	int want_affine = 0;
-	int sync = wake_flags & WF_SYNC;
+	int sync = flags & WF_SYNC;
+	struct sd_lb_stats sds;
 
 	if (p->nr_cpus_allowed == 1)
 		return prev_cpu;
@@ -3136,11 +3300,20 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 			break;
 		}
 
-		if (tmp->flags & sd_flag)
+		if (tmp->flags & sd_flag) {
 			sd = tmp;
+
+			new_cpu = get_cpu_for_power_policy(sd, cpu, p, &sds);
+			if (new_cpu != -1)
+				goto unlock;
+		}
 	}
 
 	if (affine_sd) {
+		new_cpu = get_cpu_for_power_policy(affine_sd, cpu, p, &sds);
+		if (new_cpu != -1)
+			goto unlock;
+
 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
 			prev_cpu = cpu;
 
@@ -3950,51 +4123,6 @@ static unsigned long task_h_load(struct task_struct *p)
 #endif
 
 /********** Helpers for find_busiest_group ************************/
-/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
- */
-struct sd_lb_stats {
-	struct sched_group *busiest;	/* Busiest group in this sd */
-	struct sched_group *this;	/* Local group in this sd */
-	unsigned long total_load;	/* Total load of all groups in sd */
-	unsigned long total_pwr;	/* Total power of all groups in sd */
-	unsigned long avg_load;		/* Average load across all groups in sd */
-
-	/** Statistics of this group */
-	unsigned long this_load;
-	unsigned long this_load_per_task;
-	unsigned long this_nr_running;
-	unsigned long this_has_capacity;
-	unsigned int  this_idle_cpus;
-
-	/* Statistics of the busiest group */
-	unsigned int  busiest_idle_cpus;
-	unsigned long max_load;
-	unsigned long busiest_load_per_task;
-	unsigned long busiest_nr_running;
-	unsigned long busiest_group_capacity;
-	unsigned long busiest_has_capacity;
-	unsigned int  busiest_group_weight;
-
-	int group_imb; /* Is there imbalance in this sd */
-};
-
-/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
- */
-struct sg_lb_stats {
-	unsigned long avg_load; /*Avg load across the CPUs of the group */
-	unsigned long group_load; /* Total load over the CPUs of the group */
-	unsigned long sum_nr_running; /* Nr tasks running in the group */
-	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-	unsigned long group_capacity;
-	unsigned long idle_cpus;
-	unsigned long group_weight;
-	int group_imb; /* Is there an imbalance in the group ? */
-	int group_has_capacity; /* Is there extra capacity in the group? */
-};
-
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
-- 
1.7.5.1
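As referenced in the changelog above, here is a standalone sketch of the domain-level check; it is illustrative only, not the patched kernel code, and the names (domain_policy, span_weight, sd_capacity) are made up for this example. The power policy is used only while the domain still has a vacancy, where "utilization" is the sum of nr_running rather than rq->util, so freshly forked tasks are counted immediately:

#include <stdio.h>

enum policy { PERFORMANCE, POWERSAVING, BALANCE };

/*
 * Run the requested power policy only while the domain is not full.
 * sd_utils is the sum of rq->nr_running over the domain, span_weight the
 * number of CPUs in it, and sd_capacity its scaled capacity in task slots.
 */
static enum policy domain_policy(enum policy want, unsigned int sd_utils,
				 unsigned int span_weight,
				 unsigned int sd_capacity)
{
	unsigned int threshold;

	if (want == PERFORMANCE)
		return PERFORMANCE;

	threshold = (want == POWERSAVING) ? span_weight : sd_capacity;

	/* still room for one more task in this domain? */
	return (sd_utils < threshold) ? want : PERFORMANCE;
}

int main(void)
{
	/* 8-CPU domain, 7 running tasks: powersaving still applies */
	printf("%d\n", domain_policy(POWERSAVING, 7, 8, 8));
	/* 8-CPU domain, 9 running tasks: fall back to performance */
	printf("%d\n", domain_policy(POWERSAVING, 9, 8, 8));
	return 0;
}

A forked task raises its rq's nr_running at once, so this vacancy test reacts immediately, while an rq->util based test would lag behind the runnable load average.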