On Fri, 14 Jun 2019, Alexander Motin wrote:
Log:
Update td_runtime of running thread on each statclock().
Normally td_runtime is updated on context switch, but there are some kernel
threads that due to high absolute priority may run for many seconds without
context switches (yes, that is bad, but that is true), which means their
td_runtime was not updated all that time, that made them invisible for top
other then as some general CPU usage.
MFC after: 1 week
Sponsored by: iXsystems, Inc.
This and more is supposed to be done in calcru(). It is also necessary to
adjust for the current timeslice.
I thought that calcru() was fixed, but the fix seems to be only in my
version of FreeBSD-5.2.
The bug seems to be in fill_kinfo_proc_only(). calcru() should update
td_runtime for all threads in the proc (and sum into rux_rutime), but
fill_kinfo_proc_only() uses rux_runtime to initialize ki_runtime before
calling calcru(), so sees a stale value. This was broken in r132855
(the initialization of ki_runtime was moved too early as part of a fix
for zombies).
Using calcru() for its side effect is a bit magic. get_thread_cputime()
does this less hackishly for curthread (by adding the timeslice without
updating td_runtime), but is broken for other threads (by using the
stale td_runtime).
td_runtime is unavailable in userland. top and ps can only see ki_runtime
which is per-process.
Actually, calcru() seems to have the same bug as get_thread_cputime().
This is fixed in my version of FreeBSD-5-2, but is clearly broken in
plain FreeBSD-FreeBSD-5.2, and -current seems to be even more broken
than plain 5.2 -- it is missing the large comment about this bug that
I added in r130858. -current first updates td_runtime for curthread.
Then it loops over all threads in the process, but does nothing unless
td_incruntime != 0, and when td_incruntime != 0 it updates rux_runtime
using the value in td_incruntime, and this value is stale except for
curthread.
td_runtime is updated in one other place: in rufetchtd(), but this function
has the same bug eas everywhere else -- it only updates the runtimes for
curthread.
Here is my fix for this in FreeBSD-5.2:
XX Index: kern_resource.c
XX ===================================================================
XX RCS file: /home/ncvs/src/sys/kern/kern_resource.c,v
XX retrieving revision 1.140
XX diff -u -2 -r1.140 kern_resource.c
XX --- kern_resource.c 21 Jun 2004 17:46:27 -0000 1.140
XX @@ -702,18 +720,80 @@
XX struct timeval *ip;
XX {
XX - struct bintime bt, rt;
XX - struct timeval tv;
XX + struct bintime bt;
XX + struct rusage_ext rux;
-current already has most of the infrastructure for this -- I invented
rusage_ext partly to fix this problem, and -current has a more sophisticated
version of rusage_ext.
-current also uses cputicks instead of bintimes.
XX struct thread *td;
XX - /* {user, system, interrupt, total} {ticks, usec}; previous tu: */
XX - u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
XX - int problemcase;
XX + int bt_valid, proc_locked, sched_locked;
XX
XX - mtx_assert(&sched_lock, MA_OWNED);
XX - /* XXX: why spl-protect ? worst case is an off-by-one report */
XX + proc_locked = mtx_owned(&p->p_mtx);
XX + sched_locked = mtx_owned(&sched_lock);
XX + if (!proc_locked && !sched_locked)
XX + PROC_LOCK(p);
XX + if (!sched_locked)
XX + mtx_lock_spin(&sched_lock);
This also has some restructuring that -current already has in a more
sophisticated way.
XX + rux = p->p_rux;
XX + bt_valid = 0;
XX + FOREACH_THREAD_IN_PROC(p, td) {
XX + if (TD_IS_RUNNING(td)) {
XX + /*
XX + * Adjust for the current time slice. This is
XX + * important since the adjustment is on the order
XX + * of a time quantum, which is much greater than
XX + * the precision of binuptime().
XX + */
XX + KASSERT(td->td_oncpu != NOCPU,
XX + ("calcru: running thread has no CPU"));
XX + if (!bt_valid) {
XX + binuptime(&bt);
XX + bt_valid = 1;
XX + }
XX + bintime_sub(&bt,
XX + &pcpu_find(td->td_oncpu)->pc_switchtime);
XX + if (bt.sec < 0) {
XX + printf(
XX + "calcru: negative delta-runtime of %jd sec + %jd wsec for pid %d
(%s)\n",
XX + (intmax_t)bt.sec, bt.frac,
XX + p->p_pid, p->p_comm);
XX + bt.sec = 0;
XX + bt.frac = 0;
XX + }
XX + bintime_add(&rux.rux_runtime, &bt);
XX + }
XX + }
XX + if (!sched_locked)
XX + mtx_unlock_spin(&sched_lock);
That is the main part of the fix. It is necessary to loop over all threads
in the process.
XX + calcru1(p, &rux, up, sp, ip);
XX + p->p_rux.rux_uu = rux.rux_uu;
XX + p->p_rux.rux_su = rux.rux_su;
XX + p->p_rux.rux_iu = rux.rux_iu;
XX + if (!proc_locked && !sched_locked)
XX + PROC_UNLOCK(p);
XX +}
XX +
XX +void
XX +calccru(p, up, sp)
XX + struct proc *p;
XX + struct timeval *up;
XX + struct timeval *sp;
XX +{
XX
XX - ut = p->p_uticks;
XX - st = p->p_sticks;
XX - it = p->p_iticks;
XX + PROC_LOCK_ASSERT(p, MA_OWNED);
XX + calcru1(p, &p->p_crux, up, sp, NULL);
XX +}
XX
XX +static void
XX +calcru1(p, ruxp, up, sp, ip)
XX + struct proc *p;
XX + struct rusage_ext *ruxp;
XX + struct timeval *up;
XX + struct timeval *sp;
XX + struct timeval *ip;
XX +{
XX + struct timeval tv;
XX + /* {user, system, interrupt, total} {ticks, usec}; previous tu: */
XX + u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
XX +
XX + ut = ruxp->rux_uticks;
XX + st = ruxp->rux_sticks;
XX + it = ruxp->rux_iticks;
XX tt = ut + st + it;
XX if (tt == 0) {
XX @@ -721,42 +801,20 @@
XX tt = 1;
XX }
XX - rt = p->p_runtime;
XX - problemcase = 0;
XX - FOREACH_THREAD_IN_PROC(p, td) {
XX - /*
XX - * Adjust for the current time slice. This is actually fairly
XX - * important since the error here is on the order of a time
XX - * quantum, which is much greater than the sampling error.
XX - */
XX - if (td == curthread) {
XX - binuptime(&bt);
XX - bintime_sub(&bt, PCPU_PTR(switchtime));
XX - bintime_add(&rt, &bt);
XX - } else if (TD_IS_RUNNING(td)) {
XX - /*
XX - * XXX: this case should add the difference between
XX - * the current time and the switch time as above,
XX - * but the switch time is inaccessible, so we can't
XX - * do the adjustment and will end up with a wrong
XX - * runtime. A previous call with a different
XX - * curthread may have obtained a (right or wrong)
XX - * runtime that is in advance of ours. Just set a
XX - * flag to avoid warning about this known problem.
XX - */
XX - problemcase = 1;
This is the large comment about the bug. It was broken by removing it in
r136152. r136152 was mostly by me to introduce rusage_ext, but it seems
to have been incomplete -- it even removed the adjustment of td_runtime
for curthread. The latter wasn't restored until 7 years later in r244188.
XX - }
XX - }
XX - bintime2timeval(&rt, &tv);
XX + bintime2timeval(&ruxp->rux_runtime, &tv);
XX tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
XX - ptu = p->p_uu + p->p_su + p->p_iu;
XX + ptu = ruxp->rux_uu + ruxp->rux_su + ruxp->rux_iu;
XX if (tu < ptu) {
XX - if (!problemcase)
XX - printf(
XX -"calcru: runtime went backwards from %ju usec to %ju usec for pid %d
(%s)\n",
XX - (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
XX + printf(
XX +"calcru1: runtime went backwards from %ju usec (uu = %ju, su = %ju, iu = %ju) to
%ju usec for pid %d (%s)\n",
XX + (uintmax_t)ptu,
XX + (uintmax_t)ruxp->rux_uu, (uintmax_t)ruxp->rux_su,
XX + (uintmax_t)ruxp->rux_iu,
XX + (uintmax_t)tu, p->p_pid, p->p_comm);
XX tu = ptu;
XX }
XX if ((int64_t)tu < 0) {
XX - printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
XX + /* XXX p is only needed to print these diagnostics. */
XX + printf(
XX + "calcru1: negative runtime of %jd usec for pid %d (%s)\n",
XX (intmax_t)tu, p->p_pid, p->p_comm);
XX tu = ptu;
-current still has many bugs that let the times go backwards. I don't
remember if 'problemcase' gave negative times, but it still occurs and
gives garbage times. Userland could do a related negative times check:
assert(ki_runtime >= sum of user, sys and interrupt times) often fails
because ki_runtime uses a stale rux_runtime time while the sum equals
the non-stale rux_runtime (after scaling).
Bruce
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"