The branch main has been updated by markj:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=c72188d85a793c7610208beafb83af544de6e3b7

commit c72188d85a793c7610208beafb83af544de6e3b7
Author:     Cyril Zhang <cy...@freebsdfoundation.org>
AuthorDate: 2025-08-05 23:20:56 +0000
Commit:     Mark Johnston <ma...@freebsd.org>
CommitDate: 2025-08-05 23:33:55 +0000

    racct: Improve handling of the pcpu resource
    
    The previous scheme would inflate the CPU consumption of short-lived
    processes.  For containers (e.g., processes, jails), the total pcpu
    usage was computed as a sum of the pcpu usage of all constituent
    threads, which makes little sense for a decaying average.
    
    Instead, aggregate wallclock time of all on-CPU threads and compute the
    pcpu resource as a decaying average as the sum.  This gives much more
    reasonable and accurate values in various simple tests.
    
    PR:             235556
    Reviewed by:    markj
    MFC after:      1 month
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D30878
---
 sys/kern/kern_racct.c | 307 +++++++++++++++++++++++++-------------------------
 sys/sys/proc.h        |   1 -
 sys/sys/racct.h       |   6 +-
 3 files changed, 156 insertions(+), 158 deletions(-)

diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7ee3b9e2048a..7351e9cb6313 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -96,6 +96,13 @@ static void racct_sub_cred_locked(struct ucred *cred, int 
resource,
                uint64_t amount);
 static void racct_add_cred_locked(struct ucred *cred, int resource,
                uint64_t amount);
+static int racct_set_locked(struct proc *p, int resource, uint64_t amount,
+                int force);
+static void racct_updatepcpu_locked(struct proc *p);
+static void racct_updatepcpu_racct_locked(struct racct *racct);
+static void racct_updatepcpu_containers(void);
+static void racct_settime_locked(struct proc *p, bool exit);
+static void racct_zeropcpu_locked(struct proc *p);
 
 SDT_PROVIDER_DEFINE(racct);
 SDT_PROBE_DEFINE3(racct, , rusage, add,
@@ -308,68 +315,6 @@ fixpt_t ccpu_exp[] = {
 
 #define        CCPU_EXP_MAX    110
 
-/*
- * This function is analogical to the getpcpu() function in the ps(1) command.
- * They should both calculate in the same way so that the racct %cpu
- * calculations are consistent with the values shown by the ps(1) tool.
- * The calculations are more complex in the 4BSD scheduler because of the value
- * of the ccpu variable.  In ULE it is defined to be zero which saves us some
- * work.
- */
-static uint64_t
-racct_getpcpu(struct proc *p, u_int pcpu)
-{
-       u_int swtime;
-#ifdef SCHED_4BSD
-       fixpt_t pctcpu, pctcpu_next;
-#endif
-       fixpt_t p_pctcpu;
-       struct thread *td;
-
-       ASSERT_RACCT_ENABLED();
-       KASSERT((p->p_flag & P_IDLEPROC) == 0,
-           ("racct_getpcpu: idle process %p", p));
-
-       swtime = (ticks - p->p_swtick) / hz;
-
-       /*
-        * For short-lived processes, the sched_pctcpu() returns small
-        * values even for cpu intensive processes.  Therefore we use
-        * our own estimate in this case.
-        */
-       if (swtime < RACCT_PCPU_SECS)
-               return (pcpu);
-
-       p_pctcpu = 0;
-       FOREACH_THREAD_IN_PROC(p, td) {
-               thread_lock(td);
-#ifdef SCHED_4BSD
-               pctcpu = sched_pctcpu(td);
-               /* Count also the yet unfinished second. */
-               pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
-               pctcpu_next += sched_pctcpu_delta(td);
-               p_pctcpu += max(pctcpu, pctcpu_next);
-#else
-               /*
-                * In ULE the %cpu statistics are updated on every
-                * sched_pctcpu() call.  So special calculations to
-                * account for the latest (unfinished) second are
-                * not needed.
-                */
-               p_pctcpu += sched_pctcpu(td);
-#endif
-               thread_unlock(td);
-       }
-
-#ifdef SCHED_4BSD
-       if (swtime <= CCPU_EXP_MAX)
-               return ((100 * (uint64_t)p_pctcpu * 1000000) /
-                   (FSCALE - ccpu_exp[swtime]));
-#endif
-
-       return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
-}
-
 static void
 racct_add_racct(struct racct *dest, const struct racct *src)
 {
@@ -499,19 +444,6 @@ racct_adjust_resource(struct racct *racct, int resource,
                    ("%s: resource %d usage < 0", __func__, resource));
                racct->r_resources[resource] = 0;
        }
-
-       /*
-        * There are some cases where the racct %cpu resource would grow
-        * beyond 100% per core.  For example in racct_proc_exit() we add
-        * the process %cpu usage to the ucred racct containers.  If too
-        * many processes terminated in a short time span, the ucred %cpu
-        * resource could grow too much.  Also, the 4BSD scheduler sometimes
-        * returns for a thread more than 100% cpu usage. So we set a sane
-        * boundary here to 100% * the maximum number of CPUs.
-        */
-       if ((resource == RACCT_PCTCPU) &&
-           (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * 
(int64_t)MAXCPU))
-               racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * 
(int64_t)MAXCPU;
 }
 
 static int
@@ -635,10 +567,44 @@ racct_add_buf(struct proc *p, const struct buf *bp, int 
is_write)
        RACCT_UNLOCK();
 }
 
+static void
+racct_settime_locked(struct proc *p, bool exit)
+{
+       struct thread *td;
+       struct timeval wallclock;
+       uint64_t runtime;
+
+       ASSERT_RACCT_ENABLED();
+       RACCT_LOCK_ASSERT();
+       PROC_LOCK_ASSERT(p, MA_OWNED);
+
+       if (exit) {
+               /*
+                * proc_reap() has already calculated rux
+                * and added crux to rux.
+                */
+               runtime = cputick2usec(p->p_rux.rux_runtime -
+                   p->p_crux.rux_runtime);
+       } else {
+               PROC_STATLOCK(p);
+               FOREACH_THREAD_IN_PROC(p, td)
+                       ruxagg(p, td);
+               PROC_STATUNLOCK(p);
+               runtime = cputick2usec(p->p_rux.rux_runtime);
+       }
+       microuptime(&wallclock);
+       timevalsub(&wallclock, &p->p_stats->p_start);
+
+       racct_set_locked(p, RACCT_CPU, runtime, 0);
+       racct_set_locked(p, RACCT_WALLCLOCK,
+           (uint64_t)wallclock.tv_sec * 1000000 +
+           wallclock.tv_usec, 0);
+}
+
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 {
-       int64_t old_amount, decayed_amount, diff_proc, diff_cred;
+       int64_t old_amount, diff_proc, diff_cred;
 #ifdef RCTL
        int error;
 #endif
@@ -655,17 +621,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t 
amount, int force)
         * The diffs may be negative.
         */
        diff_proc = amount - old_amount;
-       if (resource == RACCT_PCTCPU) {
-               /*
-                * Resources in per-credential racct containers may decay.
-                * If this is the case, we need to calculate the difference
-                * between the new amount and the proportional value of the
-                * old amount that has decayed in the ucred racct containers.
-                */
-               decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
-               diff_cred = amount - decayed_amount;
-       } else
-               diff_cred = diff_proc;
+       diff_cred = diff_proc;
 #ifdef notyet
        KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
            ("%s: usage of non-droppable resource %d dropping", __func__,
@@ -908,8 +864,6 @@ racct_proc_fork(struct proc *parent, struct proc *child)
                goto out;
 #endif
 
-       /* Init process cpu time. */
-       child->p_prev_runtime = 0;
        child->p_throttled = 0;
 
        /*
@@ -964,37 +918,16 @@ racct_proc_fork_done(struct proc *child)
 void
 racct_proc_exit(struct proc *p)
 {
-       struct timeval wallclock;
-       uint64_t pct_estimate, pct, runtime;
        int i;
 
        if (!racct_enable)
                return;
 
        PROC_LOCK(p);
-       /*
-        * We don't need to calculate rux, proc_reap() has already done this.
-        */
-       runtime = cputick2usec(p->p_rux.rux_runtime);
-#ifdef notyet
-       KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
-#else
-       if (runtime < p->p_prev_runtime)
-               runtime = p->p_prev_runtime;
-#endif
-       microuptime(&wallclock);
-       timevalsub(&wallclock, &p->p_stats->p_start);
-       if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
-               pct_estimate = (1000000 * runtime * 100) /
-                   ((uint64_t)wallclock.tv_sec * 1000000 +
-                   wallclock.tv_usec);
-       } else
-               pct_estimate = 0;
-       pct = racct_getpcpu(p, pct_estimate);
-
        RACCT_LOCK();
-       racct_set_locked(p, RACCT_CPU, runtime, 0);
-       racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
+
+       racct_settime_locked(p, true);
+       racct_zeropcpu_locked(p);
 
        KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
            ("process reaped with %ju allocated for RSS\n",
@@ -1068,6 +1001,10 @@ racct_move(struct racct *dest, struct racct *src)
        RACCT_LOCK();
        racct_add_racct(dest, src);
        racct_sub_racct(src, src);
+       dest->r_runtime = src->r_runtime;
+       dest->r_time = src->r_time;
+       src->r_runtime = 0;
+       timevalsub(&src->r_time, &src->r_time);
        RACCT_UNLOCK();
 }
 
@@ -1170,8 +1107,6 @@ racct_proc_wakeup(struct proc *p)
 static void
 racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
 {
-       int64_t r_old, r_new;
-
        ASSERT_RACCT_ENABLED();
        RACCT_LOCK_ASSERT();
 
@@ -1181,15 +1116,6 @@ racct_decay_callback(struct racct *racct, void *dummy1, 
void *dummy2)
        rctl_throttle_decay(racct, RACCT_READIOPS);
        rctl_throttle_decay(racct, RACCT_WRITEIOPS);
 #endif
-
-       r_old = racct->r_resources[RACCT_PCTCPU];
-
-       /* If there is nothing to decay, just exit. */
-       if (r_old <= 0)
-               return;
-
-       r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
-       racct->r_resources[RACCT_PCTCPU] = r_new;
 }
 
 static void
@@ -1220,16 +1146,106 @@ racct_decay(void)
            racct_decay_post, NULL, NULL);
 }
 
+static void
+racct_updatepcpu_racct_locked(struct racct *racct)
+{
+       struct timeval diff;
+       uint64_t elapsed;
+       uint64_t runtime;
+       uint64_t newpcpu;
+       uint64_t oldpcpu;
+
+       ASSERT_RACCT_ENABLED();
+       RACCT_LOCK_ASSERT();
+
+       /* Difference between now and previously-recorded time. */
+       microuptime(&diff);
+       timevalsub(&diff, &racct->r_time);
+       elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec;
+
+       /* Difference between current and previously-recorded runtime. */
+       runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime;
+
+       newpcpu = runtime * 100 * 1000000 / elapsed;
+       oldpcpu = racct->r_resources[RACCT_PCTCPU];
+       /*
+        * This calculation is equivalent to
+        *    (1 - 0.3) * newpcpu + 0.3 * oldpcpu
+        * where RACCT_DECAY_FACTOR = 0.3 * FSCALE.
+        */
+       racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) *
+           newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;
+       if (racct->r_resources[RACCT_PCTCPU] >
+           100 * 1000000 * (uint64_t)mp_ncpus)
+               racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 *
+                   (uint64_t)mp_ncpus;
+
+       /* Record current times. */
+       racct->r_runtime = racct->r_resources[RACCT_CPU];
+       timevaladd(&racct->r_time, &diff);
+}
+
+static void
+racct_zeropcpu_locked(struct proc *p)
+{
+       ASSERT_RACCT_ENABLED();
+       PROC_LOCK_ASSERT(p, MA_OWNED);
+
+       p->p_racct->r_resources[RACCT_PCTCPU] = 0;
+}
+
+static void
+racct_updatepcpu_locked(struct proc *p)
+{
+       ASSERT_RACCT_ENABLED();
+       PROC_LOCK_ASSERT(p, MA_OWNED);
+
+       racct_updatepcpu_racct_locked(p->p_racct);
+}
+
+static void
+racct_updatepcpu_pre(void)
+{
+
+       RACCT_LOCK();
+}
+
+static void
+racct_updatepcpu_post(void)
+{
+
+       RACCT_UNLOCK();
+}
+
+static void
+racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void 
*dummy2)
+{
+       racct_updatepcpu_racct_locked(racct);
+}
+
+static void
+racct_updatepcpu_containers(void)
+{
+       ASSERT_RACCT_ENABLED();
+
+       ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+           racct_updatepcpu_post, NULL, NULL);
+       loginclass_racct_foreach(racct_updatepcpu_racct_callback, 
racct_updatepcpu_pre,
+           racct_updatepcpu_post, NULL, NULL);
+       prison_racct_foreach(racct_updatepcpu_racct_callback, 
racct_updatepcpu_pre,
+           racct_updatepcpu_post, NULL, NULL);
+}
+
 static void
 racctd(void)
 {
-       struct thread *td;
        struct proc *p;
-       struct timeval wallclock;
-       uint64_t pct, pct_estimate, runtime;
+       struct proc *idle;
 
        ASSERT_RACCT_ENABLED();
 
+       idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc;
+
        for (;;) {
                racct_decay();
 
@@ -1237,36 +1253,16 @@ racctd(void)
 
                FOREACH_PROC_IN_SYSTEM(p) {
                        PROC_LOCK(p);
+                       if (p == idle) {
+                               PROC_UNLOCK(p);
+                               continue;
+                       }
                        if (p->p_state != PRS_NORMAL ||
                            (p->p_flag & P_IDLEPROC) != 0) {
-                               if (p->p_state == PRS_ZOMBIE)
-                                       racct_set(p, RACCT_PCTCPU, 0);
                                PROC_UNLOCK(p);
                                continue;
                        }
 
-                       microuptime(&wallclock);
-                       timevalsub(&wallclock, &p->p_stats->p_start);
-                       PROC_STATLOCK(p);
-                       FOREACH_THREAD_IN_PROC(p, td)
-                               ruxagg(p, td);
-                       runtime = cputick2usec(p->p_rux.rux_runtime);
-                       PROC_STATUNLOCK(p);
-#ifdef notyet
-                       KASSERT(runtime >= p->p_prev_runtime,
-                           ("runtime < p_prev_runtime"));
-#else
-                       if (runtime < p->p_prev_runtime)
-                               runtime = p->p_prev_runtime;
-#endif
-                       p->p_prev_runtime = runtime;
-                       if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
-                               pct_estimate = (1000000 * runtime * 100) /
-                                   ((uint64_t)wallclock.tv_sec * 1000000 +
-                                   wallclock.tv_usec);
-                       } else
-                               pct_estimate = 0;
-                       pct = racct_getpcpu(p, pct_estimate);
                        RACCT_LOCK();
 #ifdef RCTL
                        rctl_throttle_decay(p->p_racct, RACCT_READBPS);
@@ -1274,11 +1270,8 @@ racctd(void)
                        rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
                        rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
 #endif
-                       racct_set_locked(p, RACCT_PCTCPU, pct, 1);
-                       racct_set_locked(p, RACCT_CPU, runtime, 0);
-                       racct_set_locked(p, RACCT_WALLCLOCK,
-                           (uint64_t)wallclock.tv_sec * 1000000 +
-                           wallclock.tv_usec, 0);
+                       racct_settime_locked(p, false);
+                       racct_updatepcpu_locked(p);
                        RACCT_UNLOCK();
                        PROC_UNLOCK(p);
                }
@@ -1306,6 +1299,8 @@ racctd(void)
                        PROC_UNLOCK(p);
                }
                sx_sunlock(&allproc_lock);
+
+               racct_updatepcpu_containers();
                pause("-", hz);
        }
 }
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index af9cafa99dd0..9140cee56885 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -763,7 +763,6 @@ struct proc {
        LIST_HEAD(, mqueue_notifier)    p_mqnotifier; /* (c) mqueue notifiers.*/
        struct kdtrace_proc     *p_dtrace; /* (*) DTrace-specific data. */
        struct cv       p_pwait;        /* (*) wait cv for exit/exec. */
-       uint64_t        p_prev_runtime; /* (c) Resource usage accounting. */
        struct racct    *p_racct;       /* (b) Resource accounting. */
        int             p_throttled;    /* (c) Flag for racct pcpu throttling */
        /*
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
index c6020b82c865..92b50353774e 100644
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -141,13 +141,17 @@ extern bool racct_enable;
 
 /*
  * The 'racct' structure defines resource consumption for a particular
- * subject, such as process or jail.
+ * subject, such as process or jail. It also contains the total
+ * cpu time and real time of the subject, recorded at the most recent
+ * time that RACCT_PCPU was updated.
  *
  * This structure must be filled with zeroes initially.
  */
 struct racct {
        int64_t                         r_resources[RACCT_MAX + 1];
        LIST_HEAD(, rctl_rule_link)     r_rule_links;
+       uint64_t                        r_runtime;
+       struct timeval                  r_time;
 };
 
 SYSCTL_DECL(_kern_racct);

Reply via email to