Accumulate unused quota from previous periods, so that the accumulated
bandwidth runtime can be used in following periods. During accumulation,
take care of runtime overflow. The previous, non-burstable CFS bandwidth
controller simply assigned quota to runtime, which avoided such overflow
handling.
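For reference, the accumulation can be pictured with a small user-space
sketch mirroring the __refill_cfs_bandwidth_runtime() change below; the
struct, field values and main() driver here are illustrative only:

	#include <stdio.h>
	#include <stdint.h>

	/* Illustrative stand-ins for the cfs_bandwidth fields touched here. */
	struct cfs_bw {
		uint64_t quota;       /* runtime added per period */
		uint64_t burst;       /* extra runtime allowed to accumulate */
		uint64_t buffer;      /* min(max_cfs_runtime, quota + burst) */
		uint64_t max_overrun; /* clamp so quota * overrun cannot overflow */
		uint64_t runtime;     /* currently available runtime */
	};

	/*
	 * Refill as in __refill_cfs_bandwidth_runtime(): accumulate unused
	 * quota, clamp the overrun to avoid multiplication overflow, and cap
	 * the result at the buffer.
	 */
	static void refill(struct cfs_bw *b, uint64_t overrun)
	{
		if (overrun > b->max_overrun)
			overrun = b->max_overrun;
		b->runtime += b->quota * overrun;
		if (b->runtime > b->buffer)
			b->runtime = b->buffer;
	}

	int main(void)
	{
		/* quota = 10ms, burst = 5ms per period (values in ns). */
		struct cfs_bw b = {
			.quota = 10000000, .burst = 5000000,
			.buffer = 15000000, .max_overrun = 1000,
			.runtime = 2000000,
		};

		refill(&b, 1);	/* one elapsed period */
		printf("%llu\n", (unsigned long long)b.runtime); /* 12000000: leftover + quota */

		refill(&b, 1);
		printf("%llu\n", (unsigned long long)b.runtime); /* 15000000: capped at buffer */
		return 0;
	}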
A sysctl parameter sysctl_sched_cfs_bw_burst_enabled is introduced as a
switch for burst. It is enabled by default.

Co-developed-by: Shanpei Chen <shanp...@linux.alibaba.com>
Signed-off-by: Shanpei Chen <shanp...@linux.alibaba.com>
Signed-off-by: Huaixin Chang <changhuai...@linux.alibaba.com>
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/core.c          |  8 +++---
 kernel/sched/fair.c          | 58 ++++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h         |  4 +--
 kernel/sysctl.c              |  9 +++++++
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..3cce25485c69 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -72,6 +72,7 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
 
 #ifdef CONFIG_CFS_BANDWIDTH
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+extern unsigned int sysctl_sched_cfs_bw_burst_enabled;
 #endif
 
 #ifdef CONFIG_SCHED_AUTOGROUP
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 708c31e6ce1f..16e23a2499ef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8948,7 +8948,7 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 /* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
+const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
@@ -9012,13 +9012,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	cfs_b->quota = quota;
 	cfs_b->burst = burst;
 
-	__refill_cfs_bandwidth_runtime(cfs_b);
-
 	if (runtime_enabled) {
 		cfs_b->buffer = min(max_cfs_runtime, quota + burst);
+		cfs_b->max_overrun = DIV_ROUND_UP_ULL(max_cfs_runtime, quota);
+		cfs_b->runtime = cfs_b->quota;
 
 		/* Restart the period timer (if active) to handle new period expiry: */
-		start_cfs_bandwidth(cfs_b);
+		start_cfs_bandwidth(cfs_b, 1);
 	}
 
 	raw_spin_unlock_irq(&cfs_b->lock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 59d816a365f3..c981d4845c96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -127,6 +127,13 @@ int __weak arch_asym_cpu_priority(int cpu)
  * (default: 5 msec, units: microseconds)
  */
 unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
+
+/*
+ * A switch for cfs bandwidth burst.
+ *
+ * (default: 1, enabled)
+ */
+unsigned int sysctl_sched_cfs_bw_burst_enabled = 1;
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -4602,10 +4609,23 @@ static inline u64 sched_cfs_bandwidth_slice(void)
  *
  * requires cfs_b->lock
  */
-void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b,
+					   u64 overrun)
 {
-	if (cfs_b->quota != RUNTIME_INF)
-		cfs_b->runtime = cfs_b->quota;
+	u64 refill;
+
+	if (cfs_b->quota != RUNTIME_INF) {
+
+		if (!sysctl_sched_cfs_bw_burst_enabled) {
+			cfs_b->runtime = cfs_b->quota;
+			return;
+		}
+
+		overrun = min(overrun, cfs_b->max_overrun);
+		refill = cfs_b->quota * overrun;
+		cfs_b->runtime += refill;
+		cfs_b->runtime = min(cfs_b->runtime, cfs_b->buffer);
+	}
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4627,7 +4647,7 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		start_cfs_bandwidth(cfs_b);
+		start_cfs_bandwidth(cfs_b, 0);
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -4973,7 +4993,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
 	if (cfs_b->idle && !throttled)
 		goto out_deactivate;
 
-	__refill_cfs_bandwidth_runtime(cfs_b);
+	__refill_cfs_bandwidth_runtime(cfs_b, overrun);
 
 	if (!throttled) {
 		/* mark as potentially idle for the upcoming period */
@@ -5194,6 +5214,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 }
 
 extern const u64 max_cfs_quota_period;
+extern const u64 max_cfs_runtime;
 
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
@@ -5223,7 +5244,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 			new = old * 2;
 			if (new < max_cfs_quota_period) {
 				cfs_b->period = ns_to_ktime(new);
-				cfs_b->quota *= 2;
+				cfs_b->quota = min(cfs_b->quota * 2,
+						   max_cfs_runtime);
+
+				cfs_b->buffer = min(cfs_b->quota + cfs_b->burst,
+						    max_cfs_runtime);
+				/*
+				 * Add 1 in case max_overrun becomes 0.
+				 * 0 max_overrun will cause no runtime being
+				 * refilled in __refill_cfs_bandwidth_runtime().
+				 */
+				cfs_b->max_overrun >>= 1;
+				cfs_b->max_overrun++;
 
 				pr_warn_ratelimited(
 	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5272,16 +5304,26 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init)
 {
+	u64 overrun;
+
 	lockdep_assert_held(&cfs_b->lock);
 
 	if (cfs_b->period_active)
 		return;
 
 	cfs_b->period_active = 1;
-	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
 	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+
+	/*
+	 * When period timer stops, quota for the following period is not
+	 * refilled, however period timer is already forwarded. We should
+	 * accumulate quota once more than overrun here.
+	 */
+	if (!init)
+		__refill_cfs_bandwidth_runtime(cfs_b, overrun + 1);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a1c0116d51..efcbbfc31619 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -359,6 +359,7 @@ struct cfs_bandwidth {
 	u64			runtime;
 	u64			burst;
 	u64			buffer;
+	u64			max_overrun;
 	s64			hierarchical_quota;
 
 	u8			idle;
@@ -469,8 +470,7 @@ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 			struct sched_entity *parent);
 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 
-extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 extern void free_rt_sched_group(struct task_group *tg);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..20d6a5ca9ef3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1842,6 +1842,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "sched_cfs_bw_burst_enabled",
+		.data		= &sysctl_sched_cfs_bw_burst_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 #endif
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	{
-- 
2.14.4.44.g2045bb6
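
Since the new ctl_table entry sits in kern_table, the knob should show up as
/proc/sys/kernel/sched_cfs_bw_burst_enabled (mode 0644, clamped to 0..1). A
minimal user-space check might look like the sketch below; it only assumes
that procfs path and is not part of the patch:

	#include <stdio.h>

	int main(void)
	{
		/* Added by this patch; 1 = burst enabled (default), 0 = legacy refill. */
		const char *path = "/proc/sys/kernel/sched_cfs_bw_burst_enabled";
		FILE *f = fopen(path, "r");
		int enabled;

		if (!f) {
			perror(path);
			return 1;
		}
		if (fscanf(f, "%d", &enabled) != 1) {
			fprintf(stderr, "failed to parse %s\n", path);
			fclose(f);
			return 1;
		}
		fclose(f);

		printf("cfs bandwidth burst %s\n", enabled ? "enabled" : "disabled");
		return 0;
	}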