Accumulate unused quota from previous periods, so that the accumulated
bandwidth runtime can be used in the following periods. During
accumulation, take care of runtime overflow: the number of periods that
may be accumulated is bounded by max_overrun, and the refilled runtime
is capped at the buffer. The previous non-burstable CFS bandwidth
controller simply reset runtime to quota each period, so neither bound
was needed.
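
As an illustration (numbers are made up, and assume the burst value
introduced earlier in this series): with quota = 100ms, period = 100ms
and burst = 400ms, the buffer is min(quota + burst, max_cfs_runtime) =
500ms. Each period-timer refill then effectively does:

	refill = quota * min(overrun, max_overrun);
	runtime = min(runtime + refill, buffer);

so after a few idle periods the group holds the full 500ms of runtime
and can burst up to 400ms beyond its quota before being throttled again.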

A sysctl parameter sysctl_sched_cfs_bw_burst_enabled is introduced as a
switch for burst. It is enabled by default.
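
For example, burst can be switched off system-wide with (assuming the
usual procfs layout, since the entry is added to kern_table):

	echo 0 > /proc/sys/kernel/sched_cfs_bw_burst_enabled

With the switch off, __refill_cfs_bandwidth_runtime() falls back to
resetting runtime to quota, i.e. the old behaviour.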

Co-developed-by: Shanpei Chen <shanp...@linux.alibaba.com>
Signed-off-by: Shanpei Chen <shanp...@linux.alibaba.com>
Signed-off-by: Huaixin Chang <changhuai...@linux.alibaba.com>
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/core.c          |  8 +++---
 kernel/sched/fair.c          | 58 ++++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h         |  4 +--
 kernel/sysctl.c              |  9 +++++++
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..3cce25485c69 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -72,6 +72,7 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
 
 #ifdef CONFIG_CFS_BANDWIDTH
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+extern unsigned int sysctl_sched_cfs_bw_burst_enabled;
 #endif
 
 #ifdef CONFIG_SCHED_AUTOGROUP
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 708c31e6ce1f..16e23a2499ef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8948,7 +8948,7 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 /* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
+const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
@@ -9012,13 +9012,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
        cfs_b->quota = quota;
        cfs_b->burst = burst;
 
-       __refill_cfs_bandwidth_runtime(cfs_b);
-
        if (runtime_enabled) {
                cfs_b->buffer = min(max_cfs_runtime, quota + burst);
+               cfs_b->max_overrun = DIV_ROUND_UP_ULL(max_cfs_runtime, quota);
+               cfs_b->runtime = cfs_b->quota;
 
               /* Restart the period timer (if active) to handle new period expiry: */
-               start_cfs_bandwidth(cfs_b);
+               start_cfs_bandwidth(cfs_b, 1);
        }
 
        raw_spin_unlock_irq(&cfs_b->lock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 59d816a365f3..c981d4845c96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -127,6 +127,13 @@ int __weak arch_asym_cpu_priority(int cpu)
  * (default: 5 msec, units: microseconds)
  */
 unsigned int sysctl_sched_cfs_bandwidth_slice          = 5000UL;
+
+/*
+ * A switch for cfs bandwidth burst.
+ *
+ * (default: 1, enabled)
+ */
+unsigned int sysctl_sched_cfs_bw_burst_enabled = 1;
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -4602,10 +4609,23 @@ static inline u64 sched_cfs_bandwidth_slice(void)
  *
  * requires cfs_b->lock
  */
-void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b,
+                                          u64 overrun)
 {
-       if (cfs_b->quota != RUNTIME_INF)
-               cfs_b->runtime = cfs_b->quota;
+       u64 refill;
+
+       if (cfs_b->quota != RUNTIME_INF) {
+
+               if (!sysctl_sched_cfs_bw_burst_enabled) {
+                       cfs_b->runtime = cfs_b->quota;
+                       return;
+               }
+
+               overrun = min(overrun, cfs_b->max_overrun);
+               refill = cfs_b->quota * overrun;
+               cfs_b->runtime += refill;
+               cfs_b->runtime = min(cfs_b->runtime, cfs_b->buffer);
+       }
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4627,7 +4647,7 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
        if (cfs_b->quota == RUNTIME_INF)
                amount = min_amount;
        else {
-               start_cfs_bandwidth(cfs_b);
+               start_cfs_bandwidth(cfs_b, 0);
 
                if (cfs_b->runtime > 0) {
                        amount = min(cfs_b->runtime, min_amount);
@@ -4973,7 +4993,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
        if (cfs_b->idle && !throttled)
                goto out_deactivate;
 
-       __refill_cfs_bandwidth_runtime(cfs_b);
+       __refill_cfs_bandwidth_runtime(cfs_b, overrun);
 
        if (!throttled) {
                /* mark as potentially idle for the upcoming period */
@@ -5194,6 +5214,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 }
 
 extern const u64 max_cfs_quota_period;
+extern const u64 max_cfs_runtime;
 
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
@@ -5223,7 +5244,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                        new = old * 2;
                        if (new < max_cfs_quota_period) {
                                cfs_b->period = ns_to_ktime(new);
-                               cfs_b->quota *= 2;
+                               cfs_b->quota = min(cfs_b->quota * 2,
+                                                  max_cfs_runtime);
+
+                               cfs_b->buffer = min(cfs_b->quota + cfs_b->burst,
+                                                   max_cfs_runtime);
+                               /*
+                                * Add 1 in case max_overrun becomes 0.
+                                * A max_overrun of 0 would cause no runtime to
+                                * be refilled in __refill_cfs_bandwidth_runtime().
+                                */
+                               cfs_b->max_overrun >>= 1;
+                               cfs_b->max_overrun++;
 
                                pr_warn_ratelimited(
        "cfs_period_timer[cpu%d]: period too short, scaling up (new 
cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5272,16 +5304,26 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init)
 {
+       u64 overrun;
+
        lockdep_assert_held(&cfs_b->lock);
 
        if (cfs_b->period_active)
                return;
 
        cfs_b->period_active = 1;
-       hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+       overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+
+       /*
+        * When the period timer stops, quota for the following periods is
+        * not refilled, but the timer has already been forwarded.
+        * Accumulate quota once more than the overrun here.
+        */
+       if (!init)
+               __refill_cfs_bandwidth_runtime(cfs_b, overrun + 1);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a1c0116d51..efcbbfc31619 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -359,6 +359,7 @@ struct cfs_bandwidth {
        u64                     runtime;
        u64                     burst;
        u64                     buffer;
+       u64                     max_overrun;
        s64                     hierarchical_quota;
 
        u8                      idle;
@@ -469,8 +470,7 @@ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                        struct sched_entity *parent);
 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 
-extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 extern void free_rt_sched_group(struct task_group *tg);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..20d6a5ca9ef3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1842,6 +1842,15 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ONE,
        },
+       {
+               .procname       = "sched_cfs_bw_burst_enabled",
+               .data           = &sysctl_sched_cfs_bw_burst_enabled,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_ONE,
+       },
 #endif
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
        {
-- 
2.14.4.44.g2045bb6
