Author: trasz
Date: Sat Apr 13 21:04:06 2013
New Revision: 249444
URL: http://svnweb.freebsd.org/changeset/base/249444

Log:
  MFC r242139:
  
  Add CPU percentage limit enforcement to RCTL.  The resouce name is "pcpu".
  It was implemented by Rudolf Tomori during Google Summer of Code 2012.
  
  MFC r242957:
  
  Don't divide by zero.
  
  MFC r243070:
  
  Fix kassert that's not really valid for %CPU accounting.  The problem
  here is race between decaying the resource usage in containers, and updating
  per-process usage; basically, the former may cause per-container usage
  to get smaller than per-process usage.
  
  MFC r243088:
  
  Improve KASSERT messages in racct, to make it clear which resource
  caused the problem.
  
  MFC r248298:
  
  Accessing td_state requires thread lock to be held.
  
  MFC r248300:
  
  When throttling a process to enforce RACCT limits, do not use neither
  PBDRY (which simply doesn't make any sense) nor PCATCH (which could
  be used by a malicious process to work around the PCPU limit).

Modified:
  stable/9/sys/kern/kern_racct.c
  stable/9/sys/kern/kern_rctl.c
  stable/9/sys/kern/sched_4bsd.c
  stable/9/sys/kern/subr_trap.c
  stable/9/sys/sys/proc.h
  stable/9/sys/sys/racct.h
  stable/9/sys/sys/rctl.h
  stable/9/sys/sys/sched.h
Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/sys/   (props changed)

Modified: stable/9/sys/kern/kern_racct.c
==============================================================================
--- stable/9/sys/kern/kern_racct.c      Sat Apr 13 20:01:07 2013        
(r249443)
+++ stable/9/sys/kern/kern_racct.c      Sat Apr 13 21:04:06 2013        
(r249444)
@@ -33,6 +33,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_kdtrace.h"
+#include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -50,10 +51,13 @@ __FBSDID("$FreeBSD$");
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
+#include <sys/smp.h>
 #include <sys/sx.h>
+#include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/umtx.h>
+#include <machine/smp.h>
 
 #ifdef RCTL
 #include <sys/rctl.h>
@@ -63,6 +67,23 @@ __FBSDID("$FreeBSD$");
 
 FEATURE(racct, "Resource Accounting");
 
+/*
+ * Do not block processes that have their %cpu usage <= pcpu_threshold.
+ */
+static int pcpu_threshold = 1;
+
+SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
+SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
+    0, "Processes with higher %cpu usage than this value can be throttled.");
+
+/*
+ * How many seconds it takes to use the scheduler %cpu calculations.  When a
+ * process starts, we compute its %cpu usage by dividing its runtime by the
+ * process wall clock time.  After RACCT_PCPU_SECS pass, we use the value
+ * provided by the scheduler.
+ */
+#define RACCT_PCPU_SECS                3
+
 static struct mtx racct_lock;
 MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
 
@@ -140,7 +161,217 @@ int racct_types[] = {
        [RACCT_SHMSIZE] =
                RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
        [RACCT_WALLCLOCK] =
-               RACCT_IN_MILLIONS };
+               RACCT_IN_MILLIONS,
+       [RACCT_PCTCPU] =
+               RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+
+static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
+
+#ifdef SCHED_4BSD
+/*
+ * Contains intermediate values for %cpu calculations to avoid using floating
+ * point in the kernel.
+ * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
+ * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to
+ * zero so the calculations are more straightforward.
+ */
+fixpt_t ccpu_exp[] = {
+       [0] = FSCALE * 1,
+       [1] = FSCALE * 0.95122942450071400909,
+       [2] = FSCALE * 0.90483741803595957316,
+       [3] = FSCALE * 0.86070797642505780722,
+       [4] = FSCALE * 0.81873075307798185866,
+       [5] = FSCALE * 0.77880078307140486824,
+       [6] = FSCALE * 0.74081822068171786606,
+       [7] = FSCALE * 0.70468808971871343435,
+       [8] = FSCALE * 0.67032004603563930074,
+       [9] = FSCALE * 0.63762815162177329314,
+       [10] = FSCALE * 0.60653065971263342360,
+       [11] = FSCALE * 0.57694981038048669531,
+       [12] = FSCALE * 0.54881163609402643262,
+       [13] = FSCALE * 0.52204577676101604789,
+       [14] = FSCALE * 0.49658530379140951470,
+       [15] = FSCALE * 0.47236655274101470713,
+       [16] = FSCALE * 0.44932896411722159143,
+       [17] = FSCALE * 0.42741493194872666992,
+       [18] = FSCALE * 0.40656965974059911188,
+       [19] = FSCALE * 0.38674102345450120691,
+       [20] = FSCALE * 0.36787944117144232159,
+       [21] = FSCALE * 0.34993774911115535467,
+       [22] = FSCALE * 0.33287108369807955328,
+       [23] = FSCALE * 0.31663676937905321821,
+       [24] = FSCALE * 0.30119421191220209664,
+       [25] = FSCALE * 0.28650479686019010032,
+       [26] = FSCALE * 0.27253179303401260312,
+       [27] = FSCALE * 0.25924026064589150757,
+       [28] = FSCALE * 0.24659696394160647693,
+       [29] = FSCALE * 0.23457028809379765313,
+       [30] = FSCALE * 0.22313016014842982893,
+       [31] = FSCALE * 0.21224797382674305771,
+       [32] = FSCALE * 0.20189651799465540848,
+       [33] = FSCALE * 0.19204990862075411423,
+       [34] = FSCALE * 0.18268352405273465022,
+       [35] = FSCALE * 0.17377394345044512668,
+       [36] = FSCALE * 0.16529888822158653829,
+       [37] = FSCALE * 0.15723716631362761621,
+       [38] = FSCALE * 0.14956861922263505264,
+       [39] = FSCALE * 0.14227407158651357185,
+       [40] = FSCALE * 0.13533528323661269189,
+       [41] = FSCALE * 0.12873490358780421886,
+       [42] = FSCALE * 0.12245642825298191021,
+       [43] = FSCALE * 0.11648415777349695786,
+       [44] = FSCALE * 0.11080315836233388333,
+       [45] = FSCALE * 0.10539922456186433678,
+       [46] = FSCALE * 0.10025884372280373372,
+       [47] = FSCALE * 0.09536916221554961888,
+       [48] = FSCALE * 0.09071795328941250337,
+       [49] = FSCALE * 0.08629358649937051097,
+       [50] = FSCALE * 0.08208499862389879516,
+       [51] = FSCALE * 0.07808166600115315231,
+       [52] = FSCALE * 0.07427357821433388042,
+       [53] = FSCALE * 0.07065121306042958674,
+       [54] = FSCALE * 0.06720551273974976512,
+       [55] = FSCALE * 0.06392786120670757270,
+       [56] = FSCALE * 0.06081006262521796499,
+       [57] = FSCALE * 0.05784432087483846296,
+       [58] = FSCALE * 0.05502322005640722902,
+       [59] = FSCALE * 0.05233970594843239308,
+       [60] = FSCALE * 0.04978706836786394297,
+       [61] = FSCALE * 0.04735892439114092119,
+       [62] = FSCALE * 0.04504920239355780606,
+       [63] = FSCALE * 0.04285212686704017991,
+       [64] = FSCALE * 0.04076220397836621516,
+       [65] = FSCALE * 0.03877420783172200988,
+       [66] = FSCALE * 0.03688316740124000544,
+       [67] = FSCALE * 0.03508435410084502588,
+       [68] = FSCALE * 0.03337326996032607948,
+       [69] = FSCALE * 0.03174563637806794323,
+       [70] = FSCALE * 0.03019738342231850073,
+       [71] = FSCALE * 0.02872463965423942912,
+       [72] = FSCALE * 0.02732372244729256080,
+       [73] = FSCALE * 0.02599112877875534358,
+       [74] = FSCALE * 0.02472352647033939120,
+       [75] = FSCALE * 0.02351774585600910823,
+       [76] = FSCALE * 0.02237077185616559577,
+       [77] = FSCALE * 0.02127973643837716938,
+       [78] = FSCALE * 0.02024191144580438847,
+       [79] = FSCALE * 0.01925470177538692429,
+       [80] = FSCALE * 0.01831563888873418029,
+       [81] = FSCALE * 0.01742237463949351138,
+       [82] = FSCALE * 0.01657267540176124754,
+       [83] = FSCALE * 0.01576441648485449082,
+       [84] = FSCALE * 0.01499557682047770621,
+       [85] = FSCALE * 0.01426423390899925527,
+       [86] = FSCALE * 0.01356855901220093175,
+       [87] = FSCALE * 0.01290681258047986886,
+       [88] = FSCALE * 0.01227733990306844117,
+       [89] = FSCALE * 0.01167856697039544521,
+       [90] = FSCALE * 0.01110899653824230649,
+       [91] = FSCALE * 0.01056720438385265337,
+       [92] = FSCALE * 0.01005183574463358164,
+       [93] = FSCALE * 0.00956160193054350793,
+       [94] = FSCALE * 0.00909527710169581709,
+       [95] = FSCALE * 0.00865169520312063417,
+       [96] = FSCALE * 0.00822974704902002884,
+       [97] = FSCALE * 0.00782837754922577143,
+       [98] = FSCALE * 0.00744658307092434051,
+       [99] = FSCALE * 0.00708340892905212004,
+       [100] = FSCALE * 0.00673794699908546709,
+       [101] = FSCALE * 0.00640933344625638184,
+       [102] = FSCALE * 0.00609674656551563610,
+       [103] = FSCALE * 0.00579940472684214321,
+       [104] = FSCALE * 0.00551656442076077241,
+       [105] = FSCALE * 0.00524751839918138427,
+       [106] = FSCALE * 0.00499159390691021621,
+       [107] = FSCALE * 0.00474815099941147558,
+       [108] = FSCALE * 0.00451658094261266798,
+       [109] = FSCALE * 0.00429630469075234057,
+       [110] = FSCALE * 0.00408677143846406699,
+};
+#endif
+
+#define        CCPU_EXP_MAX    110
+
+/*
+ * This function is analogical to the getpcpu() function in the ps(1) command.
+ * They should both calculate in the same way so that the racct %cpu
+ * calculations are consistent with the values showed by the ps(1) tool.
+ * The calculations are more complex in the 4BSD scheduler because of the value
+ * of the ccpu variable.  In ULE it is defined to be zero which saves us some
+ * work.
+ */
+static uint64_t
+racct_getpcpu(struct proc *p, u_int pcpu)
+{
+       u_int swtime;
+#ifdef SCHED_4BSD
+       fixpt_t pctcpu, pctcpu_next;
+#endif
+#ifdef SMP
+       struct pcpu *pc;
+       int found;
+#endif
+       fixpt_t p_pctcpu;
+       struct thread *td;
+
+       /*
+        * If the process is swapped out, we count its %cpu usage as zero.
+        * This behaviour is consistent with the userland ps(1) tool.
+        */
+       if ((p->p_flag & P_INMEM) == 0)
+               return (0);
+       swtime = (ticks - p->p_swtick) / hz;
+
+       /*
+        * For short-lived processes, the sched_pctcpu() returns small
+        * values even for cpu intensive processes.  Therefore we use
+        * our own estimate in this case.
+        */
+       if (swtime < RACCT_PCPU_SECS)
+               return (pcpu);
+
+       p_pctcpu = 0;
+       FOREACH_THREAD_IN_PROC(p, td) {
+               if (td == PCPU_GET(idlethread))
+                       continue;
+#ifdef SMP
+               found = 0;
+               STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+                       if (td == pc->pc_idlethread) {
+                               found = 1;
+                               break;
+                       }
+               }
+               if (found)
+                       continue;
+#endif
+               thread_lock(td);
+#ifdef SCHED_4BSD
+               pctcpu = sched_pctcpu(td);
+               /* Count also the yet unfinished second. */
+               pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
+               pctcpu_next += sched_pctcpu_delta(td);
+               p_pctcpu += max(pctcpu, pctcpu_next);
+#else
+               /*
+                * In ULE the %cpu statistics are updated on every
+                * sched_pctcpu() call.  So special calculations to
+                * account for the latest (unfinished) second are
+                * not needed.
+                */
+               p_pctcpu += sched_pctcpu(td);
+#endif
+               thread_unlock(td);
+       }
+
+#ifdef SCHED_4BSD
+       if (swtime <= CCPU_EXP_MAX)
+               return ((100 * (uint64_t)p_pctcpu * 1000000) /
+                   (FSCALE - ccpu_exp[swtime]));
+#endif
+
+       return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
+}
 
 static void
 racct_add_racct(struct racct *dest, const struct racct *src)
@@ -154,9 +385,11 @@ racct_add_racct(struct racct *dest, cons
         */
        for (i = 0; i <= RACCT_MAX; i++) {
                KASSERT(dest->r_resources[i] >= 0,
-                   ("racct propagation meltdown: dest < 0"));
+                   ("%s: resource %d propagation meltdown: dest < 0",
+                   __func__, i));
                KASSERT(src->r_resources[i] >= 0,
-                   ("racct propagation meltdown: src < 0"));
+                   ("%s: resource %d propagation meltdown: src < 0",
+                   __func__, i));
                dest->r_resources[i] += src->r_resources[i];
        }
 }
@@ -172,19 +405,23 @@ racct_sub_racct(struct racct *dest, cons
         * Update resource usage in dest.
         */
        for (i = 0; i <= RACCT_MAX; i++) {
-               if (!RACCT_IS_SLOPPY(i)) {
+               if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
                        KASSERT(dest->r_resources[i] >= 0,
-                           ("racct propagation meltdown: dest < 0"));
+                           ("%s: resource %d propagation meltdown: dest < 0",
+                           __func__, i));
                        KASSERT(src->r_resources[i] >= 0,
-                           ("racct propagation meltdown: src < 0"));
+                           ("%s: resource %d propagation meltdown: src < 0",
+                           __func__, i));
                        KASSERT(src->r_resources[i] <= dest->r_resources[i],
-                           ("racct propagation meltdown: src > dest"));
+                           ("%s: resource %d propagation meltdown: src > dest",
+                           __func__, i));
                }
-               if (RACCT_IS_RECLAIMABLE(i)) {
+               if (RACCT_CAN_DROP(i)) {
                        dest->r_resources[i] -= src->r_resources[i];
                        if (dest->r_resources[i] < 0) {
-                               KASSERT(RACCT_IS_SLOPPY(i),
-                                   ("racct_sub_racct: usage < 0"));
+                               KASSERT(RACCT_IS_SLOPPY(i) ||
+                                   RACCT_IS_DECAYING(i),
+                                   ("%s: resource %d usage < 0", __func__, i));
                                dest->r_resources[i] = 0;
                        }
                }
@@ -254,10 +491,23 @@ racct_alloc_resource(struct racct *racct
 
        racct->r_resources[resource] += amount;
        if (racct->r_resources[resource] < 0) {
-               KASSERT(RACCT_IS_SLOPPY(resource),
-                   ("racct_alloc_resource: usage < 0"));
+               KASSERT(RACCT_IS_SLOPPY(resource) || 
RACCT_IS_DECAYING(resource),
+                   ("%s: resource %d usage < 0", __func__, resource));
                racct->r_resources[resource] = 0;
        }
+       
+       /*
+        * There are some cases where the racct %cpu resource would grow
+        * beyond 100%.
+        * For example in racct_proc_exit() we add the process %cpu usage
+        * to the ucred racct containers.  If too many processes terminated
+        * in a short time span, the ucred %cpu resource could grow too much.
+        * Also, the 4BSD scheduler sometimes returns for a thread more than
+        * 100% cpu usage.  So we set a boundary here to 100%.
+        */
+       if ((resource == RACCT_PCTCPU) &&
+           (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
+               racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
 }
 
 static int
@@ -357,7 +607,8 @@ racct_add_force(struct proc *p, int reso
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount)
 {
-       int64_t diff;
+       int64_t old_amount, decayed_amount;
+       int64_t diff_proc, diff_cred;
 #ifdef RCTL
        int error;
 #endif
@@ -369,15 +620,30 @@ racct_set_locked(struct proc *p, int res
         */
        PROC_LOCK_ASSERT(p, MA_OWNED);
 
-       diff = amount - p->p_racct->r_resources[resource];
+       old_amount = p->p_racct->r_resources[resource];
+       /*
+        * The diffs may be negative.
+        */
+       diff_proc = amount - old_amount;
+       if (RACCT_IS_DECAYING(resource)) {
+               /*
+                * Resources in per-credential racct containers may decay.
+                * If this is the case, we need to calculate the difference
+                * between the new amount and the proportional value of the
+                * old amount that has decayed in the ucred racct containers.
+                */
+               decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+               diff_cred = amount - decayed_amount;
+       } else
+               diff_cred = diff_proc;
 #ifdef notyet
-       KASSERT(diff >= 0 || RACCT_IS_RECLAIMABLE(resource),
-           ("racct_set: usage of non-reclaimable resource %d dropping",
+       KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
+           ("%s: usage of non-droppable resource %d dropping", __func__,
             resource));
 #endif
 #ifdef RCTL
-       if (diff > 0) {
-               error = rctl_enforce(p, resource, diff);
+       if (diff_proc > 0) {
+               error = rctl_enforce(p, resource, diff_proc);
                if (error && RACCT_IS_DENIABLE(resource)) {
                        SDT_PROBE(racct, kernel, rusage, set_failure, p,
                            resource, amount, 0, 0);
@@ -385,11 +651,11 @@ racct_set_locked(struct proc *p, int res
                }
        }
 #endif
-       racct_alloc_resource(p->p_racct, resource, diff);
-       if (diff > 0)
-               racct_add_cred_locked(p->p_ucred, resource, diff);
-       else if (diff < 0)
-               racct_sub_cred_locked(p->p_ucred, resource, -diff);
+       racct_alloc_resource(p->p_racct, resource, diff_proc);
+       if (diff_cred > 0)
+               racct_add_cred_locked(p->p_ucred, resource, diff_cred);
+       else if (diff_cred < 0)
+               racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
 
        return (0);
 }
@@ -412,10 +678,11 @@ racct_set(struct proc *p, int resource, 
        return (error);
 }
 
-void
-racct_set_force(struct proc *p, int resource, uint64_t amount)
+static void
+racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
 {
-       int64_t diff;
+       int64_t old_amount, decayed_amount;
+       int64_t diff_proc, diff_cred;
 
        SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
 
@@ -424,13 +691,35 @@ racct_set_force(struct proc *p, int reso
         */
        PROC_LOCK_ASSERT(p, MA_OWNED);
 
+       old_amount = p->p_racct->r_resources[resource];
+       /*
+        * The diffs may be negative.
+        */
+       diff_proc = amount - old_amount;
+       if (RACCT_IS_DECAYING(resource)) {
+               /*
+                * Resources in per-credential racct containers may decay.
+                * If this is the case, we need to calculate the difference
+                * between the new amount and the proportional value of the
+                * old amount that has decayed in the ucred racct containers.
+                */
+               decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+               diff_cred = amount - decayed_amount;
+       } else
+               diff_cred = diff_proc;
+
+       racct_alloc_resource(p->p_racct, resource, diff_proc);
+       if (diff_cred > 0)
+               racct_add_cred_locked(p->p_ucred, resource, diff_cred);
+       else if (diff_cred < 0)
+               racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
+}
+
+void
+racct_set_force(struct proc *p, int resource, uint64_t amount)
+{
        mtx_lock(&racct_lock);
-       diff = amount - p->p_racct->r_resources[resource];
-       racct_alloc_resource(p->p_racct, resource, diff);
-       if (diff > 0)
-               racct_add_cred_locked(p->p_ucred, resource, diff);
-       else if (diff < 0)
-               racct_sub_cred_locked(p->p_ucred, resource, -diff);
+       racct_set_force_locked(p, resource, amount);
        mtx_unlock(&racct_lock);
 }
 
@@ -469,6 +758,22 @@ racct_get_available(struct proc *p, int 
 }
 
 /*
+ * Returns amount of the %cpu resource that process 'p' can add to its %cpu
+ * utilization.  Adding more than that would lead to the process being
+ * throttled.
+ */
+static int64_t
+racct_pcpu_available(struct proc *p)
+{
+
+#ifdef RCTL
+       return (rctl_pcpu_available(p));
+#else
+       return (INT64_MAX);
+#endif
+}
+
+/*
  * Decrease allocation of 'resource' by 'amount' for process 'p'.
  */
 void
@@ -481,13 +786,13 @@ racct_sub(struct proc *p, int resource, 
         * We need proc lock to dereference p->p_ucred.
         */
        PROC_LOCK_ASSERT(p, MA_OWNED);
-       KASSERT(RACCT_IS_RECLAIMABLE(resource),
-           ("racct_sub: called for non-reclaimable resource %d", resource));
+       KASSERT(RACCT_CAN_DROP(resource),
+           ("%s: called for non-droppable resource %d", __func__, resource));
 
        mtx_lock(&racct_lock);
        KASSERT(amount <= p->p_racct->r_resources[resource],
-           ("racct_sub: freeing %ju of resource %d, which is more "
-            "than allocated %jd for %s (pid %d)", amount, resource,
+           ("%s: freeing %ju of resource %d, which is more "
+            "than allocated %jd for %s (pid %d)", __func__, amount, resource,
            (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
 
        racct_alloc_resource(p->p_racct, resource, -amount);
@@ -504,8 +809,8 @@ racct_sub_cred_locked(struct ucred *cred
            0, 0);
 
 #ifdef notyet
-       KASSERT(RACCT_IS_RECLAIMABLE(resource),
-           ("racct_sub_cred: called for non-reclaimable resource %d",
+       KASSERT(RACCT_CAN_DROP(resource),
+           ("%s: called for resource %d which can not drop", __func__,
             resource));
 #endif
 
@@ -551,6 +856,10 @@ racct_proc_fork(struct proc *parent, str
                goto out;
 #endif
 
+       /* Init process cpu time. */
+       child->p_prev_runtime = 0;
+       child->p_throttled = 0;
+
        /*
         * Inherit resource usage.
         */
@@ -602,6 +911,8 @@ racct_proc_exit(struct proc *p)
 {
        int i;
        uint64_t runtime;
+       struct timeval wallclock;
+       uint64_t pct_estimate, pct;
 
        PROC_LOCK(p);
        /*
@@ -614,8 +925,19 @@ racct_proc_exit(struct proc *p)
        if (runtime < p->p_prev_runtime)
                runtime = p->p_prev_runtime;
 #endif
+       microuptime(&wallclock);
+       timevalsub(&wallclock, &p->p_stats->p_start);
+       if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+               pct_estimate = (1000000 * runtime * 100) /
+                   ((uint64_t)wallclock.tv_sec * 1000000 +
+                   wallclock.tv_usec);
+       } else
+               pct_estimate = 0;
+       pct = racct_getpcpu(p, pct_estimate);
+
        mtx_lock(&racct_lock);
        racct_set_locked(p, RACCT_CPU, runtime);
+       racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
 
        for (i = 0; i <= RACCT_MAX; i++) {
                if (p->p_racct->r_resources[i] == 0)
@@ -692,23 +1014,122 @@ racct_move(struct racct *dest, struct ra
 }
 
 static void
+racct_proc_throttle(struct proc *p)
+{
+       struct thread *td;
+#ifdef SMP
+       int cpuid;
+#endif
+
+       PROC_LOCK_ASSERT(p, MA_OWNED);
+
+       /*
+        * Do not block kernel processes.  Also do not block processes with
+        * low %cpu utilization to improve interactivity.
+        */
+       if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
+           (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+               return;
+       p->p_throttled = 1;
+
+       FOREACH_THREAD_IN_PROC(p, td) {
+               thread_lock(td);
+               switch (td->td_state) {
+               case TDS_RUNQ:
+                       /*
+                        * If the thread is on the scheduler run-queue, we can
+                        * not just remove it from there.  So we set the flag
+                        * TDF_NEEDRESCHED for the thread, so that once it is
+                        * running, it is taken off the cpu as soon as possible.
+                        */
+                       td->td_flags |= TDF_NEEDRESCHED;
+                       break;
+               case TDS_RUNNING:
+                       /*
+                        * If the thread is running, we request a context
+                        * switch for it by setting the TDF_NEEDRESCHED flag.
+                        */
+                       td->td_flags |= TDF_NEEDRESCHED;
+#ifdef SMP
+                       cpuid = td->td_oncpu;
+                       if ((cpuid != NOCPU) && (td != curthread))
+                               ipi_cpu(cpuid, IPI_AST);
+#endif
+                       break;
+               default:
+                       break;
+               }
+               thread_unlock(td);
+       }
+}
+
+static void
+racct_proc_wakeup(struct proc *p)
+{
+       PROC_LOCK_ASSERT(p, MA_OWNED);
+
+       if (p->p_throttled) {
+               p->p_throttled = 0;
+               wakeup(p->p_racct);
+       }
+}
+
+static void
+racct_decay_resource(struct racct *racct, void * res, void* dummy)
+{
+       int resource;
+       int64_t r_old, r_new;
+
+       resource = *(int *)res;
+       r_old = racct->r_resources[resource];
+
+       /* If there is nothing to decay, just exit. */
+       if (r_old <= 0)
+               return;
+
+       mtx_lock(&racct_lock);
+       r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
+       racct->r_resources[resource] = r_new;
+       mtx_unlock(&racct_lock);
+}
+
+static void
+racct_decay(int resource)
+{
+       ui_racct_foreach(racct_decay_resource, &resource, NULL);
+       loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
+       prison_racct_foreach(racct_decay_resource, &resource, NULL);
+}
+
+static void
 racctd(void)
 {
        struct thread *td;
        struct proc *p;
        struct timeval wallclock;
        uint64_t runtime;
+       uint64_t pct, pct_estimate;
 
        for (;;) {
+               racct_decay(RACCT_PCTCPU);
+
                sx_slock(&allproc_lock);
 
+               LIST_FOREACH(p, &zombproc, p_list) {
+                       PROC_LOCK(p);
+                       racct_set(p, RACCT_PCTCPU, 0);
+                       PROC_UNLOCK(p);
+               }
+
                FOREACH_PROC_IN_SYSTEM(p) {
-                       if (p->p_state != PRS_NORMAL)
+                       PROC_LOCK(p);
+                       if (p->p_state != PRS_NORMAL) {
+                               PROC_UNLOCK(p);
                                continue;
+                       }
 
                        microuptime(&wallclock);
                        timevalsub(&wallclock, &p->p_stats->p_start);
-                       PROC_LOCK(p);
                        PROC_SLOCK(p);
                        FOREACH_THREAD_IN_PROC(p, td)
                                ruxagg(p, td);
@@ -722,7 +1143,15 @@ racctd(void)
                                runtime = p->p_prev_runtime;
 #endif
                        p->p_prev_runtime = runtime;
+                       if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+                               pct_estimate = (1000000 * runtime * 100) /
+                                   ((uint64_t)wallclock.tv_sec * 1000000 +
+                                   wallclock.tv_usec);
+                       } else
+                               pct_estimate = 0;
+                       pct = racct_getpcpu(p, pct_estimate);
                        mtx_lock(&racct_lock);
+                       racct_set_force_locked(p, RACCT_PCTCPU, pct);
                        racct_set_locked(p, RACCT_CPU, runtime);
                        racct_set_locked(p, RACCT_WALLCLOCK,
                            (uint64_t)wallclock.tv_sec * 1000000 +
@@ -730,6 +1159,26 @@ racctd(void)
                        mtx_unlock(&racct_lock);
                        PROC_UNLOCK(p);
                }
+
+               /*
+                * To ensure that processes are throttled in a fair way, we need
+                * to iterate over all processes again and check the limits
+                * for %cpu resource only after ucred racct containers have been
+                * properly filled.
+                */
+               FOREACH_PROC_IN_SYSTEM(p) {
+                       PROC_LOCK(p);
+                       if (p->p_state != PRS_NORMAL) {
+                               PROC_UNLOCK(p);
+                               continue;
+                       }
+
+                       if (racct_pcpu_available(p) <= 0)
+                               racct_proc_throttle(p);
+                       else if (p->p_throttled)
+                               racct_proc_wakeup(p);
+                       PROC_UNLOCK(p);
+               }
                sx_sunlock(&allproc_lock);
                pause("-", hz);
        }

Modified: stable/9/sys/kern/kern_rctl.c
==============================================================================
--- stable/9/sys/kern/kern_rctl.c       Sat Apr 13 20:01:07 2013        
(r249443)
+++ stable/9/sys/kern/kern_rctl.c       Sat Apr 13 21:04:06 2013        
(r249444)
@@ -76,6 +76,8 @@ FEATURE(rctl, "Resource Limits");
 #define        RCTL_MAX_INBUFLEN       4096
 #define        RCTL_LOG_BUFSIZE        128
 
+#define        RCTL_PCPU_SHIFT         (10 * 1000000)
+
 /*
  * 'rctl_rule_link' connects a rule with every racct it's related to.
  * For example, rule 'user:X:openfiles:deny=N/process' is linked
@@ -120,6 +122,7 @@ static struct dict resourcenames[] = {
        { "nshm", RACCT_NSHM },
        { "shmsize", RACCT_SHMSIZE },
        { "wallclock", RACCT_WALLCLOCK },
+       { "pcpu", RACCT_PCTCPU },
        { NULL, -1 }};
 
 static struct dict actionnames[] = {
@@ -271,6 +274,51 @@ rctl_would_exceed(const struct proc *p, 
 }
 
 /*
+ * Special version of rctl_available() function for the %cpu resource.
+ * We slightly cheat here and return less than we normally would.
+ */
+int64_t
+rctl_pcpu_available(const struct proc *p) {
+       struct rctl_rule *rule;
+       struct rctl_rule_link *link;
+       int64_t available, minavailable, limit;
+
+       minavailable = INT64_MAX;
+       limit = 0;
+
+       rw_rlock(&rctl_lock);
+
+       LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+               rule = link->rrl_rule;
+               if (rule->rr_resource != RACCT_PCTCPU)
+                       continue;
+               if (rule->rr_action != RCTL_ACTION_DENY)
+                       continue;
+               available = rctl_available_resource(p, rule);
+               if (available < minavailable) {
+                       minavailable = available;
+                       limit = rule->rr_amount;
+               }
+       }
+
+       rw_runlock(&rctl_lock);
+
+       /*
+        * Return slightly less than actual value of the available
+        * %cpu resource.  This makes %cpu throttling more agressive
+        * and lets us act sooner than the limits are already exceeded.
+        */
+       if (limit != 0) {
+               if (limit > 2 * RCTL_PCPU_SHIFT)
+                       minavailable -= RCTL_PCPU_SHIFT;
+               else
+                       minavailable -= (limit / 2);
+       }
+
+       return (minavailable);
+}
+
+/*
  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
  * to what it keeps allocated now.  Returns non-zero if the allocation should
  * be denied, 0 otherwise.

Modified: stable/9/sys/kern/sched_4bsd.c
==============================================================================
--- stable/9/sys/kern/sched_4bsd.c      Sat Apr 13 20:01:07 2013        
(r249443)
+++ stable/9/sys/kern/sched_4bsd.c      Sat Apr 13 21:04:06 2013        
(r249444)
@@ -1585,6 +1585,40 @@ sched_pctcpu(struct thread *td)
        return (ts->ts_pctcpu);
 }
 
+#ifdef RACCT
+/*
+ * Calculates the contribution to the thread cpu usage for the latest
+ * (unfinished) second.
+ */
+fixpt_t
+sched_pctcpu_delta(struct thread *td)
+{
+       struct td_sched *ts;
+       fixpt_t delta;
+       int realstathz;
+
+       THREAD_LOCK_ASSERT(td, MA_OWNED);
+       ts = td->td_sched;
+       delta = 0;
+       realstathz = stathz ? stathz : hz;
+       if (ts->ts_cpticks != 0) {
+#if    (FSHIFT >= CCPU_SHIFT)
+               delta = (realstathz == 100)
+                   ? ((fixpt_t) ts->ts_cpticks) <<
+                   (FSHIFT - CCPU_SHIFT) :
+                   100 * (((fixpt_t) ts->ts_cpticks)
+                   << (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+               delta = ((FSCALE - ccpu) *
+                   (ts->ts_cpticks *
+                   FSCALE / realstathz)) >> FSHIFT;
+#endif
+       }
+
+       return (delta);
+}
+#endif
+
 void
 sched_tick(int cnt)
 {

Modified: stable/9/sys/kern/subr_trap.c
==============================================================================
--- stable/9/sys/kern/subr_trap.c       Sat Apr 13 20:01:07 2013        
(r249443)
+++ stable/9/sys/kern/subr_trap.c       Sat Apr 13 21:04:06 2013        
(r249444)
@@ -151,6 +151,12 @@ userret(struct thread *td, struct trapfr
 #ifdef XEN
        PT_UPDATES_FLUSH();
 #endif
+#ifdef RACCT
+       PROC_LOCK(p);
+       while (p->p_throttled == 1)
+               msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+       PROC_UNLOCK(p);
+#endif
 }
 
 /*

Modified: stable/9/sys/sys/proc.h
==============================================================================
--- stable/9/sys/sys/proc.h     Sat Apr 13 20:01:07 2013        (r249443)
+++ stable/9/sys/sys/proc.h     Sat Apr 13 21:04:06 2013        (r249444)
@@ -593,6 +593,7 @@ struct proc {
         */
        LIST_ENTRY(proc) p_orphan;      /* (e) List of orphan processes. */
        LIST_HEAD(, proc) p_orphans;    /* (e) Pointer to list of orphans. */
+       u_char          p_throttled;    /* (c) Flag for racct pcpu throttling */
 };
 
 #define        p_session       p_pgrp->pg_session

Modified: stable/9/sys/sys/racct.h
==============================================================================
--- stable/9/sys/sys/racct.h    Sat Apr 13 20:01:07 2013        (r249443)
+++ stable/9/sys/sys/racct.h    Sat Apr 13 21:04:06 2013        (r249444)
@@ -68,7 +68,8 @@ struct ucred;
 #define        RACCT_NSHM              17
 #define        RACCT_SHMSIZE           18
 #define        RACCT_WALLCLOCK         19
-#define        RACCT_MAX               RACCT_WALLCLOCK
+#define        RACCT_PCTCPU            20
+#define        RACCT_MAX               RACCT_PCTCPU
 
 /*
  * Resource properties.
@@ -78,6 +79,7 @@ struct ucred;
 #define        RACCT_INHERITABLE       0x04
 #define        RACCT_DENIABLE          0x08
 #define        RACCT_SLOPPY            0x10
+#define        RACCT_DECAYING          0x20
 
 extern int racct_types[];
 
@@ -89,7 +91,9 @@ extern int racct_types[];
 #define        RACCT_IS_IN_MILLIONS(X) (racct_types[X] & RACCT_IN_MILLIONS)
 
 /*
- * Resource usage can drop, as opposed to only grow.
+ * Resource usage can drop, as opposed to only grow.  When the process
+ * terminates, its resource usage is freed from the respective
+ * per-credential racct containers.
  */
 #define        RACCT_IS_RECLAIMABLE(X) (racct_types[X] & RACCT_RECLAIMABLE)
 
@@ -113,6 +117,20 @@ extern int racct_types[];
 #define        RACCT_IS_SLOPPY(X)              (racct_types[X] & RACCT_SLOPPY)
 
 /*
+ * When a process terminates, its resource usage is not automatically
+ * subtracted from per-credential racct containers.  Instead, the resource
+ * usage of per-credential racct containers decays in time.
+ * Resource usage can olso drop for such resource.
+ * So far, the only such resource is RACCT_PCTCPU.
+ */
+#define RACCT_IS_DECAYING(X)           (racct_types[X] & RACCT_DECAYING)
+
+/*
+ * Resource usage can drop, as opposed to only grow.
+ */
+#define RACCT_CAN_DROP(X)              (RACCT_IS_RECLAIMABLE(X) | 
RACCT_IS_DECAYING(X))
+
+/*
  * The 'racct' structure defines resource consumption for a particular
  * subject, such as process or jail.
  *

Modified: stable/9/sys/sys/rctl.h
==============================================================================
--- stable/9/sys/sys/rctl.h     Sat Apr 13 20:01:07 2013        (r249443)
+++ stable/9/sys/sys/rctl.h     Sat Apr 13 21:04:06 2013        (r249444)
@@ -140,6 +140,7 @@ void        rctl_rule_release(struct rctl_rule 
 int    rctl_rule_add(struct rctl_rule *rule);
 int    rctl_rule_remove(struct rctl_rule *filter);
 int    rctl_enforce(struct proc *p, int resource, uint64_t amount);
+int64_t        rctl_pcpu_available(const struct proc *p);
 uint64_t rctl_get_limit(struct proc *p, int resource);
 uint64_t rctl_get_available(struct proc *p, int resource);
 const char *rctl_resource_name(int resource);

Modified: stable/9/sys/sys/sched.h
==============================================================================
--- stable/9/sys/sys/sched.h    Sat Apr 13 20:01:07 2013        (r249443)
+++ stable/9/sys/sys/sched.h    Sat Apr 13 21:04:06 2013        (r249444)
@@ -103,6 +103,11 @@ void       sched_user_prio(struct thread *td, 
 void   sched_userret(struct thread *td);
 void   sched_wakeup(struct thread *td);
 void   sched_preempt(struct thread *td);
+#ifdef RACCT
+#ifdef SCHED_4BSD
+fixpt_t        sched_pctcpu_delta(struct thread *td);
+#endif
+#endif
 
 /*
  * Threads are moved on and off of run queues
_______________________________________________
svn-src-stable-9@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-stable-9
To unsubscribe, send any mail to "svn-src-stable-9-unsubscr...@freebsd.org"

Reply via email to