From: Konstantin Khorenko <khore...@virtuozzo.com>
Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
+++ sched: Uninline css_tg()

Compilation with a custom config fails:

kernel/ve/ve.c: In function ‘ve_get_cpu_avenrun’:
kernel/ve/ve.c:1679:27: error: inlining failed in call to always_inline ‘css_tg’: function body not available
 inline struct task_group *css_tg(struct cgroup_subsys_state *css);
                           ^~~~~~
kernel/ve/ve.c:1690:7: note: called from here
   tg = css_tg(css);
        ^~~~~~~~~~~

We can remove the "inline" attribute: the compiler is clever enough to
inline the function by itself in kernel/sched/core.c.

Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>
Reviewed-by: Evgenii Shatokhin <eshatok...@virtuozzo.com>

Cherry-picked from vz8 commit 0b5495c8980d ("ve/sched/stat: Introduce
functions to calculate vcpustat data").

Ported the code that calculates the CT boot timestamp to time
namespaces.

Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com>
---
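
A note on the failure mode: it is generic to configs where the kernel
maps "inline" to __always_inline (e.g. CONFIG_OPTIMIZE_INLINING=n). A
translation unit that sees only an inline declaration cannot satisfy
mandatory inlining, because the function body lives in another unit.
A minimal single-file sketch of the same error (hypothetical names,
not from the kernel tree):

	/* repro.c: the body of answer() is not available here */
	__attribute__((always_inline)) inline int answer(void);

	int use_answer(void)
	{
		/* gcc: "inlining failed in call to always_inline
		 * 'answer': function body not available" */
		return answer();
	}

Dropping "inline" from the css_tg() definition and declaring the
function extern turns it into an ordinary external symbol, which is
enough to fix the build.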
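
On the first read, cpu_cgroup_update_vcpustat() folds physical cpu
stats onto vcpus round-robin: vcpu i accumulates every pcpu j with
j % nr_vcpus == i. A standalone illustration of the index mapping
(plain userspace C, made-up sizes):

	#include <stdio.h>

	int main(void)
	{
		int nr_pcpus = 8, nr_vcpus = 3;	/* assumed example sizes */
		int i, j;

		for (i = 0; i < nr_vcpus; i++) {
			printf("vcpu%d sums:", i);
			for (j = i; j < nr_pcpus; j += nr_vcpus)
				printf(" pcpu%d", j);
			printf("\n");
		}
		/* prints:
		 * vcpu0 sums: pcpu0 pcpu3 pcpu6
		 * vcpu1 sums: pcpu1 pcpu4 pcpu7
		 * vcpu2 sums: pcpu2 pcpu5
		 */
		return 0;
	}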
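
On subsequent reads, each vcpu delta is capped at max_usage (the
elapsed wall-clock interval scaled by the per-vcpu rate) in two
passes: the first pass handles only vcpus whose delta exceeds
max_usage, parking the excess in stat_rem; the second pass hands that
remainder out to the vcpus with headroom. Within
fixup_vcpustat_delta_usage() the top-up is split among USER, NICE and
SYSTEM proportionally. A worked example with made-up numbers: if a
vcpu is 40 ns short of its target and the remainder holds USER=30 and
SYSTEM=10 (rem_usage=40), then USER gets scale_pct = 100*30/40 = 75,
i.e. 75*40/100 = 30 ns, and SYSTEM gets the remaining 10 ns.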
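
The btime arithmetic: time_ns->offsets.boottime holds
(ve_uptime - host_uptime), which is negative for a ve created on this
host, so subtracting it from the host's boot timestamp shifts btime
forward to the moment the ve started. With made-up numbers: host btime
1000 (epoch seconds), ve created at host uptime 500 -> offset = -500,
reported btime = 1000 - (-500) = 1500.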

 kernel/sched/core.c    |   2 +-
 kernel/sched/cpuacct.c | 379 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 380 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 79a6f6808a7c..f1689ac77af1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9718,7 +9718,7 @@ void sched_move_task(struct task_struct *tsk)
 	task_rq_unlock(rq, tsk, &rf);
 }
 
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct task_group, css) : NULL;
 }
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 893eece65bfd..871b6f8ccb0d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,8 @@
  * Based on the work by Paul Menage (men...@google.com) and Balbir Singh
  * (bal...@in.ibm.com).
  */
+#include <linux/kernel_stat.h>
+#include <linux/ve.h>
 #include <asm/irq_regs.h>
 #include "sched.h"
 
@@ -374,3 +376,380 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };
+
+extern struct task_group *css_tg(struct cgroup_subsys_state *css);
+
+static struct task_group *ve_root_tg(struct task_group *tg) {
+	struct cgroup *cg;
+
+	if (!tg)
+		return NULL;
+
+	cg = cgroup_get_ve_root1(tg->css.cgroup);
+	return cg ? css_tg(&cg->self) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+	unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		cpu_rate = tg->cpu_rate;
+#endif
+	return cpu_rate;
+}
+
+static unsigned int tg_nr_cpus(struct task_group *tg)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		nr_cpus = tg->nr_cpus;
+#endif
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int cpu)
+{
+	return per_cpu_ptr(css_ca(css)->cpustat, cpu);
+}
+
+static void cpu_cgroup_update_stat(struct cgroup_subsys_state *cpu_css,
+				   struct cgroup_subsys_state *cpuacct_css,
+				   int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+	struct task_group *tg = css_tg(cpu_css);
+	struct sched_entity *se = tg->se[i];
+	u64 *cpustat = cpuacct_cpustat(cpuacct_css, i)->cpustat;
+	u64 now = cpu_clock(i);
+	u64 delta, idle, iowait, steal;
+
+	/* root_task_group has no sched entities */
+	if (tg == &root_task_group)
+		return;
+
+	iowait = se->statistics.iowait_sum;
+	idle = se->statistics.sum_sleep_runtime;
+	steal = se->statistics.wait_sum;
+
+	if (idle > iowait)
+		idle -= iowait;
+	else
+		idle = 0;
+
+	if (se->statistics.sleep_start) {
+		delta = now - se->statistics.sleep_start;
+		if ((s64)delta > 0)
+			idle += delta;
+	} else if (se->statistics.block_start) {
+		delta = now - se->statistics.block_start;
+		if ((s64)delta > 0)
+			iowait += delta;
+	} else if (se->statistics.wait_start) {
+		delta = now - se->statistics.wait_start;
+		if ((s64)delta > 0)
+			steal += delta;
+	}
+
+	cpustat[CPUTIME_IDLE] = max(cpustat[CPUTIME_IDLE], idle);
+	cpustat[CPUTIME_IOWAIT] = max(cpustat[CPUTIME_IOWAIT], iowait);
+	cpustat[CPUTIME_STEAL] = steal;
+#endif
+}
+
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+				       struct kernel_cpustat *rem, int ind,
+				       u64 cur_usage, u64 target_usage,
+				       u64 rem_usage)
+{
+	s64 scaled_val;
+	u32 scale_pct = 0;
+
+	/* distribute the delta among USER, NICE, and SYSTEM proportionally */
+	if (cur_usage < target_usage) {
+		if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * rem->cpustat[ind],
+					      rem_usage);
+	} else {
+		if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * cur->cpustat[ind],
+					      cur_usage);
+	}
+
+	scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+	cur->cpustat[ind] += scaled_val;
+	if ((s64)cur->cpustat[ind] < 0)
+		cur->cpustat[ind] = 0;
+
+	rem->cpustat[ind] -= scaled_val;
+	if ((s64)rem->cpustat[ind] < 0)
+		rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+				     int ind, u64 cur_idle, u64 target_idle)
+{
+	/* distribute target_idle between IDLE and IOWAIT proportionally to
+	 * what we initially had on this vcpu */
+	if ((s64)cur_idle > 0) {
+		u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+		cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+	} else {
+		cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
+	}
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+				 struct kernel_cpustat *rem,
+				 u64 max_usage)
+{
+	u64 cur_usage, target_usage, rem_usage;
+	u64 cur_idle, target_idle;
+
+	cur_usage = kernel_cpustat_total_usage(cur);
+	rem_usage = kernel_cpustat_total_usage(rem);
+
+	target_usage = min(cur_usage + rem_usage,
+			   max_usage);
+
+	if (cur_usage != target_usage) {
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+				cur_usage, target_usage, rem_usage);
+	}
+
+	cur_idle = kernel_cpustat_total_idle(cur);
+	target_idle = max_usage - target_usage;
+
+	if (cur_idle != target_idle) {
+		calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+					 cur_idle, target_idle);
+		calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+					 cur_idle, target_idle);
+	}
+
+	/* do not show steal time inside ve */
+	cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
+static void cpu_cgroup_update_vcpustat(struct cgroup_subsys_state *cpu_css,
+				       struct cgroup_subsys_state *cpuacct_css)
+{
+	int i, j;
+	int nr_vcpus;
+	int vcpu_rate;
+	ktime_t now;
+	u64 max_usage;
+	struct kernel_cpustat stat_delta, stat_rem;
+	struct task_group *tg = css_tg(cpu_css);
+	int first_pass = 1;
+
+	spin_lock(&tg->vcpustat_lock);
+
+	now = ktime_get();
+	nr_vcpus = tg_nr_cpus(tg);
+	vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
+	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+		vcpu_rate = MAX_CPU_RATE;
+
+	if (!ktime_to_ns(tg->vcpustat_last_update)) {
+		/* on the first read initialize vcpu i stat as a sum of stats
+		 * over pcpus j such that j % nr_vcpus == i */
+		for (i = 0; i < nr_vcpus; i++) {
+			for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+				if (!cpu_possible(j))
+					continue;
+				kernel_cpustat_add(tg->vcpustat + i,
+					cpuacct_cpustat(cpuacct_css, j),
+					tg->vcpustat + i);
+			}
+		}
+		goto out_update_last;
+	}
+
+	max_usage = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+	max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+	/* don't allow updating stats too often to avoid calculation errors */
+	if (max_usage < 10)
+		goto out_unlock;
+
+	/* temporarily copy per cpu usage delta to tg->cpustat_last */
+	for_each_possible_cpu(i)
+		kernel_cpustat_sub(cpuacct_cpustat(cpuacct_css, i),
+				   tg->cpustat_last + i,
+				   tg->cpustat_last + i);
+
+	/* proceed to calculating per vcpu delta */
+	kernel_cpustat_zero(&stat_rem);
+
+again:
+	for (i = 0; i < nr_vcpus; i++) {
+		int exceeds_max;
+
+		kernel_cpustat_zero(&stat_delta);
+		for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+			if (!cpu_possible(j))
+				continue;
+			kernel_cpustat_add(&stat_delta,
+					   tg->cpustat_last + j, &stat_delta);
+		}
+
+		exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+			      max_usage;
+		/*
+		 * On the first pass calculate delta for vcpus with usage >
+		 * max_usage in order to accumulate excess in stat_rem.
+		 *
+		 * Once the remainder is accumulated, proceed to the rest of
+		 * vcpus so that it will be distributed among them.
+		 */
+		if (exceeds_max != first_pass)
+			continue;
+
+		fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+		kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+				   tg->vcpustat + i);
+	}
+
+	if (first_pass) {
+		first_pass = 0;
+		goto again;
+	}
+out_update_last:
+	for_each_possible_cpu(i)
+		tg->cpustat_last[i] = *cpuacct_cpustat(cpuacct_css, i);
+	tg->vcpustat_last_update = now;
+out_unlock:
+	spin_unlock(&tg->vcpustat_lock);
+}
+
+int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
+			 struct cgroup_subsys_state *cpuacct_css,
+			 struct seq_file *p)
+{
+	int i;
+	u64 user, nice, system, idle, iowait, steal;
+	struct time_namespace *time_ns;
+	struct timespec64 boottime;
+	struct task_group *tg = css_tg(cpu_css);
+	bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
+	int nr_vcpus = tg_nr_cpus(tg);
+	struct kernel_cpustat *kcpustat;
+	unsigned long tg_nr_running = 0;
+	unsigned long tg_nr_iowait = 0;
+
+	time_ns = ve_get_time_ns(get_exec_env());
+	if (time_ns) {
+		getboottime64(&boottime);
+		/* time_ns->offsets.boottime is (ve_uptime - host_uptime),
+		 * i.e. negative for a ve created on this host. Subtract it
+		 * from the timestamp of the host's boot to get the timestamp
+		 * of the ve's boot */
+		boottime = timespec64_sub(boottime, time_ns->offsets.boottime);
+		put_time_ns(time_ns);
+	} else {
+		/* for a not yet started ve, use the current time as the
+		 * timestamp of the ve's boot */
+		ktime_get_real_ts64(&boottime);
+	}
+
+	for_each_possible_cpu(i) {
+		cpu_cgroup_update_stat(cpu_css, cpuacct_css, i);
+
+		/* root task group has autogrouping, so this doesn't hold */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		tg_nr_running += tg->cfs_rq[i]->h_nr_running;
+		tg_nr_iowait += tg->cfs_rq[i]->nr_iowait;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		tg_nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	if (virt)
+		cpu_cgroup_update_vcpustat(cpu_css, cpuacct_css);
+
+	user = nice = system = idle = iowait = steal = 0;
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_possible(i))
+			continue;
+
+		kcpustat = virt ? tg->vcpustat + i :
+				  cpuacct_cpustat(cpuacct_css, i);
+
+		user += kcpustat->cpustat[CPUTIME_USER];
+		nice += kcpustat->cpustat[CPUTIME_NICE];
+		system += kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle += kcpustat->cpustat[CPUTIME_IDLE];
+		iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal += kcpustat->cpustat[CPUTIME_STEAL];
+	}
+	/* Don't scare CT users with high steal time */
+	if (!ve_is_super(get_exec_env()))
+		steal = 0;
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu 0 0 %llu\n",
+		   (unsigned long long)nsec_to_clock_t(user),
+		   (unsigned long long)nsec_to_clock_t(nice),
+		   (unsigned long long)nsec_to_clock_t(system),
+		   (unsigned long long)nsec_to_clock_t(idle),
+		   (unsigned long long)nsec_to_clock_t(iowait),
+		   virt ? 0ULL :
+		   (unsigned long long)nsec_to_clock_t(steal));
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_online(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i :
+				  cpuacct_cpustat(cpuacct_css, i);
+
+		user = kcpustat->cpustat[CPUTIME_USER];
+		nice = kcpustat->cpustat[CPUTIME_NICE];
+		system = kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle = kcpustat->cpustat[CPUTIME_IDLE];
+		iowait = kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal = kcpustat->cpustat[CPUTIME_STEAL];
+		/* Don't scare CT users with high steal time */
+		if (!ve_is_super(get_exec_env()))
+			steal = 0;
+
+		seq_printf(p,
+			   "cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
+			   i,
+			   (unsigned long long)nsec_to_clock_t(user),
+			   (unsigned long long)nsec_to_clock_t(nice),
+			   (unsigned long long)nsec_to_clock_t(system),
+			   (unsigned long long)nsec_to_clock_t(idle),
+			   (unsigned long long)nsec_to_clock_t(iowait),
+			   virt ? 0ULL :
+			   (unsigned long long)nsec_to_clock_t(steal));
+	}
+	seq_printf(p, "intr 0");
+
+	seq_printf(p,
+		   "\nctxt %llu\n"
+		   "btime %llu\n"
+		   "processes %lu\n"
+		   "procs_running %lu\n"
+		   "procs_blocked %lu\n",
+		   nr_context_switches(),
+		   (unsigned long long)boottime.tv_sec,
+		   total_forks,
+		   tg_nr_running,
+		   tg_nr_iowait);
+
+	return 0;
+}
-- 
2.30.2