Aaaaand it didn't take long for me to find a small bug... attached is a
fixed version of the patch. Such things happen if one decides to
regenerate a patch "just in case" and forgets to revert to a working
version before doing that :D

-- 
    Gregor Best
Index: sys/proc.h
===================================================================
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.149
diff -u -r1.149 proc.h
--- sys/proc.h  7 Jan 2012 05:38:12 -0000       1.149
+++ sys/proc.h  17 Jan 2012 16:10:09 -0000
@@ -43,6 +43,7 @@
 #include <machine/proc.h>              /* Machine-dependent proc substruct. */
 #include <sys/selinfo.h>               /* For struct selinfo */
 #include <sys/queue.h>
+#include <sys/tree.h>
 #include <sys/timeout.h>               /* For struct timeout */
 #include <sys/event.h>                 /* For struct klist */
 #include <sys/mutex.h>                 /* For struct mutex */
@@ -210,7 +211,9 @@
 #define        PS_SINGLEUNWIND _P_SINGLEUNWIND
 
 struct proc {
-       TAILQ_ENTRY(proc) p_runq;
+       RB_ENTRY(proc) p_runq;
+       RB_ENTRY(proc) p_expq;
+       TAILQ_ENTRY(proc) p_slpq;
        LIST_ENTRY(proc) p_list;        /* List of all processes. */
 
        struct  process *p_p;           /* The process of this thread. */
@@ -251,6 +254,9 @@
 #endif
 
        /* scheduling */
+       struct timeval p_deadline; /* virtual deadline used for scheduling */
+       struct timeval p_deadline_set; /* time at which the deadline was set */
+       int  p_rrticks; /* number of ticks this process is allowed to stay on a processor */
        u_int   p_estcpu;        /* Time averaged value of p_cpticks. */
        int     p_cpticks;       /* Ticks of cpu time. */
        fixpt_t p_pctcpu;        /* %cpu for this process during p_swtime */
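
[Not part of the diff, just review context: p_runq/p_expq hook the
thread into the new global deadline/expired trees, p_slpq takes over
the old p_runq role on the sleep queues, and the trees are ordered by
p_deadline via timercmp(). A tiny userspace sketch of that ordering
predicate, same shape as sched_cmp_proc() further down:

	#include <stdio.h>
	#include <sys/time.h>

	int
	main(void)
	{
		struct timeval a = { .tv_sec = 0, .tv_usec = 500000 };
		struct timeval b = { .tv_sec = 1, .tv_usec = 0 };

		/* timercmp(x, y, <): does x's deadline come first? */
		printf("a runs before b: %d\n", timercmp(&a, &b, <));
		return (0);
	}
]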
Index: sys/sched.h
===================================================================
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.30
diff -u -r1.30 sched.h
--- sys/sched.h 16 Nov 2011 20:50:19 -0000      1.30
+++ sys/sched.h 17 Jan 2012 16:10:09 -0000
@@ -87,8 +87,6 @@
 #define CP_IDLE                4
 #define CPUSTATES      5
 
-#define        SCHED_NQS       32                      /* 32 run queues. */
-
 /*
  * Per-CPU scheduler state.
  * XXX - expose to userland for now.
@@ -99,7 +97,6 @@
        u_int spc_schedticks;           /* ticks for schedclock() */
        u_int64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
        u_char spc_curpriority;         /* usrpri of curproc */
-       int spc_rrticks;                /* ticks until roundrobin() */
        int spc_pscnt;                  /* prof/stat counter */
        int spc_psdiv;                  /* prof/stat divisor */ 
        struct proc *spc_idleproc;      /* idle proc for this cpu */
@@ -107,9 +104,6 @@
        u_int spc_nrun;                 /* procs on the run queues */
        fixpt_t spc_ldavg;              /* shortest load avg. for this cpu */
 
-       TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
-       volatile uint32_t spc_whichqs;
-
 #ifdef notyet
        struct proc *spc_reaper;        /* dead proc reaper */
 #endif
@@ -119,18 +113,16 @@
 #ifdef _KERNEL
 
 /* spc_flags */
-#define SPCF_SEENRR             0x0001  /* process has seen roundrobin() */
-#define SPCF_SHOULDYIELD        0x0002  /* process should yield the CPU */
-#define SPCF_SWITCHCLEAR        (SPCF_SEENRR|SPCF_SHOULDYIELD)
-#define SPCF_SHOULDHALT                0x0004  /* CPU should be vacated */
-#define SPCF_HALTED            0x0008  /* CPU has been halted */
+#define SPCF_SHOULDYIELD        0x0001  /* process should yield the CPU */
+#define SPCF_SHOULDHALT                0x0002  /* CPU should be vacated */
+#define SPCF_HALTED            0x0004  /* CPU has been halted */
 
-#define        SCHED_PPQ       (128 / SCHED_NQS)       /* priorities per queue */
 #define NICE_WEIGHT 2                  /* priorities per nice level */
-#define        ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ)
+#define        ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX)
 
 extern int schedhz;                    /* ideally: 16 */
 extern int rrticks_init;               /* ticks per roundrobin() */
+extern struct cpuset sched_idle_cpus;
 
 struct proc;
 void schedclock(struct proc *);
@@ -147,18 +139,20 @@
 void cpu_switchto(struct proc *, struct proc *);
 struct proc *sched_chooseproc(void);
 struct cpu_info *sched_choosecpu(struct proc *);
-struct cpu_info *sched_choosecpu_fork(struct proc *parent, int);
+struct cpu_info *sched_choosecpu_fork(struct proc *parent);
 void cpu_idle_enter(void);
 void cpu_idle_cycle(void);
 void cpu_idle_leave(void);
 void sched_peg_curproc(struct cpu_info *ci);
 
+void generate_deadline(struct proc *, char);
+
 #ifdef MULTIPROCESSOR
 void sched_start_secondary_cpus(void);
 void sched_stop_secondary_cpus(void);
 #endif
 
-#define curcpu_is_idle()       (curcpu()->ci_schedstate.spc_whichqs == 0)
+#define curcpu_is_idle() (cpuset_isset(&sched_idle_cpus, curcpu()))
 
 void sched_init_runqueues(void);
 void setrunqueue(struct proc *);
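
[Aside, not in the diff: with NICE_WEIGHT 2 and PRIO_MAX 20 (from
sys/resource.h), the ESTCPULIM change simply drops the old
queue-granularity slack:

	/* old */ min((e), 2 * 20 - 128 / 32)	/* == min((e), 36) */
	/* new */ min((e), 2 * 20)		/* == min((e), 40) */
]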
Index: kern/kern_clock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.72
diff -u -r1.72 kern_clock.c
--- kern/kern_clock.c   7 Mar 2011 07:07:13 -0000       1.72
+++ kern/kern_clock.c   17 Jan 2012 16:10:10 -0000
@@ -241,7 +241,11 @@
        if (stathz == 0)
                statclock(frame);
 
-       if (--ci->ci_schedstate.spc_rrticks <= 0)
+       /*
+        * If the currently running process went over its round robin tick quota,
+        * preempt it.
+        */
+       if (p && (--(p->p_rrticks) <= 0))
                roundrobin(ci);
 
        /*
Index: kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.133
diff -u -r1.133 kern_fork.c
--- kern/kern_fork.c    14 Dec 2011 07:32:16 -0000      1.133
+++ kern/kern_fork.c    17 Jan 2012 16:10:10 -0000
@@ -225,6 +225,9 @@
                atomic_setbits_int(&pr->ps_flags, PS_CONTROLT);
 
        p->p_p = pr;
+
+       /* Pretend we preempted this new process */
+       generate_deadline(p, 0);
 }
 
 /* print the 'table full' message once per 10 seconds */
@@ -485,7 +488,7 @@
        getmicrotime(&p->p_stats->p_start);
        p->p_acflag = AFORK;
        p->p_stat = SRUN;
-       p->p_cpu = sched_choosecpu_fork(curp, flags);
+       p->p_cpu = sched_choosecpu_fork(curp);
        setrunqueue(p);
        SCHED_UNLOCK(s);
 
Index: kern/kern_proc.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.47
diff -u -r1.47 kern_proc.c
--- kern/kern_proc.c    18 Sep 2011 23:20:54 -0000      1.47
+++ kern/kern_proc.c    17 Jan 2012 16:10:10 -0000
@@ -398,8 +398,6 @@
            p->p_comm, p->p_pid, pst, p->p_flag, P_BITS);
        (*pr)("    pri=%u, usrpri=%u, nice=%d\n",
            p->p_priority, p->p_usrpri, p->p_p->ps_nice);
-       (*pr)("    forw=%p, list=%p,%p\n",
-           TAILQ_NEXT(p, p_runq), p->p_list.le_next, p->p_list.le_prev);
        (*pr)("    process=%p user=%p, vmspace=%p\n",
            p->p_p, p->p_addr, p->p_vmspace);
        (*pr)("    estcpu=%u, cpticks=%d, pctcpu=%u.%u, swtime=%u\n",
Index: kern/kern_sched.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.24
diff -u -r1.24 kern_sched.c
--- kern/kern_sched.c   12 Oct 2011 18:30:09 -0000      1.24
+++ kern/kern_sched.c   17 Jan 2012 16:10:10 -0000
@@ -19,6 +19,7 @@
 
 #include <sys/sched.h>
 #include <sys/proc.h>
+#include <sys/tree.h>
 #include <sys/kthread.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
@@ -32,9 +33,11 @@
 
 void sched_kthreads_create(void *);
 
-int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
 struct proc *sched_steal_proc(struct cpu_info *);
 
+RB_HEAD(runqueue, proc) sched_runqueue;
+RB_HEAD(expqueue, proc) sched_expqueue;
+
 /*
  * To help choosing which cpu should run which process we keep track
  * of cpus which are currently idle and which cpus have processes
@@ -61,6 +64,27 @@
  * interrupts during the context switch.
  */

+static int
+sched_cmp_proc(struct proc *a, struct proc *b) {
+       if (a == b)
+               return 0;
+       if (timercmp(&(a->p_deadline), &(b->p_deadline), <))
+               return -1;
+       return 1;
+}
+
+static int
+sched_cmp_proc_exp(struct proc *a, struct proc *b) {
+       if (a == b)
+               return 0;
+       if (timercmp(&(a->p_deadline_set), &(b->p_deadline_set), <))
+               return -1;
+       return 1;
+}
+
+RB_GENERATE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
+RB_GENERATE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);
+
 /*
  * sched_init_cpu is called from main() for the boot cpu, then it's the
  * responsibility of the MD code to call it for all other cpus.
@@ -69,10 +93,6 @@
 sched_init_cpu(struct cpu_info *ci)
 {
        struct schedstate_percpu *spc = &ci->ci_schedstate;
-       int i;
-
-       for (i = 0; i < SCHED_NQS; i++)
-               TAILQ_INIT(&spc->spc_qs[i]);
 
        spc->spc_idleproc = NULL;
 
@@ -106,6 +126,7 @@
 {
        struct schedstate_percpu *spc;
        struct proc *p = curproc;
+       struct proc *dead;
        struct cpu_info *ci = v;
        int s;
 
@@ -120,7 +141,6 @@
        SCHED_LOCK(s);
        cpuset_add(&sched_idle_cpus, ci);
        p->p_stat = SSLEEP;
-       p->p_cpu = ci;
        atomic_setbits_int(&p->p_flag, P_CPUPEG);
        mi_switch();
        cpuset_del(&sched_idle_cpus, ci);
@@ -130,38 +150,27 @@
        KASSERT(curproc == spc->spc_idleproc);
 
        while (1) {
-               while (!curcpu_is_idle()) {
-                       struct proc *dead;
-
-                       SCHED_LOCK(s);
-                       p->p_stat = SSLEEP;
-                       mi_switch();
-                       SCHED_UNLOCK(s);
-
-                       while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
+               while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
                                LIST_REMOVE(dead, p_hash);
                                exit2(dead);
-                       }
                }
+               if ((spc->spc_schedflags & SPCF_SHOULDHALT) &&
+                   !(spc->spc_schedflags & SPCF_HALTED))
+                       atomic_setbits_int(&spc->spc_schedflags, SPCF_HALTED);
 
                splassert(IPL_NONE);
 
                cpuset_add(&sched_idle_cpus, ci);
+
                cpu_idle_enter();
-               while (spc->spc_whichqs == 0) {
-                       if (spc->spc_schedflags & SPCF_SHOULDHALT &&
-                           (spc->spc_schedflags & SPCF_HALTED) == 0) {
-                               cpuset_del(&sched_idle_cpus, ci);
-                               SCHED_LOCK(s);
-                               atomic_setbits_int(&spc->spc_schedflags,
-                                   spc->spc_whichqs ? 0 : SPCF_HALTED);
-                               SCHED_UNLOCK(s);
-                               wakeup(spc);
-                       }
-                       cpu_idle_cycle();
-               }
+               cpu_idle_cycle();
                cpu_idle_leave();
+
                cpuset_del(&sched_idle_cpus, ci);
+
+               SCHED_LOCK(s);
+               p->p_stat = SSLEEP;
+               mi_switch();
+               SCHED_UNLOCK(s);
        }
 }
 
@@ -206,63 +215,35 @@
 void
 sched_init_runqueues(void)
 {
+       RB_INIT(&sched_runqueue);
+       RB_INIT(&sched_expqueue);
 }
 
 void
 setrunqueue(struct proc *p)
 {
-       struct schedstate_percpu *spc;
-       int queue = p->p_priority >> 2;
-
        SCHED_ASSERT_LOCKED();
-       spc = &p->p_cpu->ci_schedstate;
-       spc->spc_nrun++;
-
-       TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
-       spc->spc_whichqs |= (1 << queue);
-       cpuset_add(&sched_queued_cpus, p->p_cpu);
-
-       if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
-               cpu_unidle(p->p_cpu);
+       RB_INSERT(runqueue, &sched_runqueue, p);
 }
 
 void
 remrunqueue(struct proc *p)
 {
-       struct schedstate_percpu *spc;
-       int queue = p->p_priority >> 2;
-
        SCHED_ASSERT_LOCKED();
-       spc = &p->p_cpu->ci_schedstate;
-       spc->spc_nrun--;
-
-       TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
-       if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
-               spc->spc_whichqs &= ~(1 << queue);
-               if (spc->spc_whichqs == 0)
-                       cpuset_del(&sched_queued_cpus, p->p_cpu);
-       }
+       if (!(RB_REMOVE(runqueue, &sched_runqueue, p)))
+               RB_REMOVE(expqueue, &sched_expqueue, p);
 }
 
 struct proc *
 sched_chooseproc(void)
 {
        struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
-       struct proc *p;
-       int queue;
+       struct proc *p = NULL;
+       struct timeval now;
 
        SCHED_ASSERT_LOCKED();
 
        if (spc->spc_schedflags & SPCF_SHOULDHALT) {
-               if (spc->spc_whichqs) {
-                       for (queue = 0; queue < SCHED_NQS; queue++) {
-                               TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
-                                       remrunqueue(p);
-                                       p->p_cpu = sched_choosecpu(p);
-                                       setrunqueue(p);
-                               }
-                       }
-               }
                p = spc->spc_idleproc;
                KASSERT(p);
                KASSERT(p->p_wchan == NULL);
@@ -270,16 +251,30 @@
                return (p);
        }
 
-again:
-       if (spc->spc_whichqs) {
-               queue = ffs(spc->spc_whichqs) - 1;
-               p = TAILQ_FIRST(&spc->spc_qs[queue]);
-               remrunqueue(p);
-               KASSERT(p->p_stat == SRUN);
-       } else if ((p = sched_steal_proc(curcpu())) == NULL) {
-               p = spc->spc_idleproc;
-               if (p == NULL) {
-                        int s;
+       if ((!RB_EMPTY(&sched_runqueue)) || (!RB_EMPTY(&sched_expqueue))) {
+               /*
+                * Move expired processes to the expired runqueue, where they
+                * are sorted by the time their deadline was set instead of
+                * the deadline itself.
+                */
+               microuptime(&now);
+               while (!RB_EMPTY(&sched_runqueue)) {
+                       p = RB_MIN(runqueue, &sched_runqueue);
+                       if (p == NULL)
+                               break;
+                       if (!timercmp(&(p->p_deadline), &now, <))
+                               break;
+                       RB_INSERT(expqueue, &sched_expqueue, p);
+                       RB_REMOVE(runqueue, &sched_runqueue, p);
+               }
+
+               if (!RB_EMPTY(&sched_expqueue)) {
+                       p = RB_MIN(expqueue, &sched_expqueue);
+                       RB_REMOVE(expqueue, &sched_expqueue, p);
+               } else {
+                       p = RB_MIN(runqueue, &sched_runqueue);
+                       remrunqueue(p);
+               }
+       } else {
+               while ((p = spc->spc_idleproc) == NULL) {
+                       int s;
                        /*
                         * We get here if someone decides to switch during
                         * boot before forking kthreads, bleh.
@@ -291,13 +286,15 @@
                        spl0();
                        delay(10);
                        SCHED_LOCK(s);
-                       goto again;
-                }
+               }
                KASSERT(p);
+       }
+
+       if (p) {
                p->p_stat = SRUN;
+               p->p_cpu = curcpu();
        } 
 
-       KASSERT(p->p_wchan == NULL);
        return (p);     
 }
 
@@ -310,30 +307,13 @@
 uint64_t sched_nomigrations;
 
 struct cpu_info *
-sched_choosecpu_fork(struct proc *parent, int flags)
+sched_choosecpu_fork(struct proc *parent)
 {
        struct cpu_info *choice = NULL;
        fixpt_t load, best_load = ~0;
-       int run, best_run = INT_MAX;
        struct cpu_info *ci;
        struct cpuset set;
 
-#if 0
-       /*
-        * XXX
-        * Don't do this until we have a painless way to move the cpu in exec.
-        * Preferably when nuking the old pmap and getting a new one on a
-        * new cpu.
-        */
-       /*
-        * PPWAIT forks are simple. We know that the parent will not
-        * run until we exec and choose another cpu, so we just steal its
-        * cpu.
-        */
-       if (flags & FORK_PPWAIT)
-               return (parent->p_cpu);
-#endif
-
        /*
         * Look at all cpus that are currently idle and have nothing queued.
         * If there are none, pick the one with least queued procs first,
@@ -347,13 +327,10 @@
                cpuset_del(&set, ci);
 
                load = ci->ci_schedstate.spc_ldavg;
-               run = ci->ci_schedstate.spc_nrun;
 
-               if (choice == NULL || run < best_run ||
-                   (run == best_run &&load < best_load)) {
+               if (choice == NULL || load < best_load) {
                        choice = ci;
                        best_load = load;
-                       best_run = run;
                }
        }
 
@@ -364,9 +341,6 @@
 sched_choosecpu(struct proc *p)
 {
        struct cpu_info *choice = NULL;
-       int last_cost = INT_MAX;
-       struct cpu_info *ci;
-       struct cpuset set;
 
        /*
         * If pegged to a cpu, don't allow it to move.
@@ -374,169 +348,19 @@
        if (p->p_flag & P_CPUPEG)
                return (p->p_cpu);
 
-       sched_choose++;
-
-       /*
-        * Look at all cpus that are currently idle and have nothing queued.
-        * If there are none, pick the cheapest of those.
-        * (idle + queued could mean that the cpu is handling an interrupt
-        * at this moment and haven't had time to leave idle yet).
-        */
-       cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
-
        /*
-        * First, just check if our current cpu is in that set, if it is,
-        * this is simple.
-        * Also, our cpu might not be idle, but if it's the current cpu
-        * and it has nothing else queued and we're curproc, take it.
+        * Else, just pretend we forked a new process.
         */
-       if (cpuset_isset(&set, p->p_cpu) ||
-           (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
-           curproc == p)) {
-               sched_wasidle++;
-               return (p->p_cpu);
-       }
-
-       if (cpuset_first(&set) == NULL)
-               cpuset_copy(&set, &sched_all_cpus);
-
-       while ((ci = cpuset_first(&set)) != NULL) {
-               int cost = sched_proc_to_cpu_cost(ci, p);
+       sched_choose++;
 
-               if (choice == NULL || cost < last_cost) {
-                       choice = ci;
-                       last_cost = cost;
-               }
-               cpuset_del(&set, ci);
-       }
+       choice = sched_choosecpu_fork(p);
 
        if (p->p_cpu != choice)
                sched_nmigrations++;
        else
                sched_nomigrations++;
 
-       return (choice);
-}
-
-/*
- * Attempt to steal a proc from some cpu.
- */
-struct proc *
-sched_steal_proc(struct cpu_info *self)
-{
-       struct schedstate_percpu *spc;
-       struct proc *best = NULL;
-       int bestcost = INT_MAX;
-       struct cpu_info *ci;
-       struct cpuset set;
-
-       cpuset_copy(&set, &sched_queued_cpus);
-
-       while ((ci = cpuset_first(&set)) != NULL) {
-               struct proc *p;
-               int queue;
-               int cost;
-
-               cpuset_del(&set, ci);
-
-               spc = &ci->ci_schedstate;
-
-               queue = ffs(spc->spc_whichqs) - 1;
-               TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
-                       if (p->p_flag & P_CPUPEG)
-                               continue;
-
-                       cost = sched_proc_to_cpu_cost(self, p);
-
-                       if (best == NULL || cost < bestcost) {
-                               best = p;
-                               bestcost = cost;
-                       }
-               }
-       }
-       if (best == NULL)
-               return (NULL);
-
-       spc = &best->p_cpu->ci_schedstate;
-       remrunqueue(best);
-       best->p_cpu = self;
-
-       sched_stolen++;
-
-       return (best);
-}
-
-/*
- * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
- */
-static int
-log2(unsigned int i)
-{
-       int ret = 0;
-
-       while (i >>= 1)
-               ret++;
-
-       return (ret);
-}
-
-/*
- * Calculate the cost of moving the proc to this cpu.
- * 
- * What we want is some guesstimate of how much "performance" it will
- * cost us to move the proc here. Not just for caches and TLBs and NUMA
- * memory, but also for the proc itself. A highly loaded cpu might not
- * be the best candidate for this proc since it won't get run.
- *
- * Just total guesstimates for now.
- */
-
-int sched_cost_load = 1;
-int sched_cost_priority = 1;
-int sched_cost_runnable = 3;
-int sched_cost_resident = 1;
-
-int
-sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
-{
-       struct schedstate_percpu *spc;
-       int l2resident = 0;
-       int cost;
-
-       spc = &ci->ci_schedstate;
-
-       cost = 0;
-
-       /*
-        * First, account for the priority of the proc we want to move.
-        * More willing to move, the lower the priority of the destination
-        * and the higher the priority of the proc.
-        */
-       if (!cpuset_isset(&sched_idle_cpus, ci)) {
-               cost += (p->p_priority - spc->spc_curpriority) *
-                   sched_cost_priority;
-               cost += sched_cost_runnable;
-       }
-       if (cpuset_isset(&sched_queued_cpus, ci)) {
-               cost += spc->spc_nrun * sched_cost_runnable;
-       }
-
-       /*
-        * Higher load on the destination means we don't want to go there.
-        */
-       cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
-
-       /*
-        * If the proc is on this cpu already, lower the cost by how much
-        * it has been running and an estimate of its footprint.
-        */
-       if (p->p_cpu == ci && p->p_slptime == 0) {
-               l2resident =
-                   log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
-               cost -= l2resident * sched_cost_resident;
-       }
-
-       return (cost);
+       return (choice);
 }
 
 /*
@@ -560,7 +384,6 @@
 }
 
 #ifdef MULTIPROCESSOR
-
 void
 sched_start_secondary_cpus(void)
 {
@@ -583,6 +406,9 @@
 {
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;
+       int s;
+
+       SCHED_LOCK(s);
 
        /*
         * Make sure we stop the secondary CPUs.
@@ -601,14 +427,17 @@
 
                if (CPU_IS_PRIMARY(ci))
                        continue;
-               while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
+
+               SCHED_UNLOCK(s);
+               while (!(spc->spc_schedflags & SPCF_HALTED)) {
                        sleep_setup(&sls, spc, PZERO, "schedstate");
-                       sleep_finish(&sls,
-                           (spc->spc_schedflags & SPCF_HALTED) == 0);
+                       sleep_setup_timeout(&sls, 10);
+                       sleep_finish(&sls, 1);
                }
+               SCHED_LOCK(s);
        }
+       SCHED_UNLOCK(s);
 }
-
 #endif
 
 /*
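
[Review aside, not part of the diff: the pick logic above boils down to
one global red-black tree sorted by virtual deadline, plus a second
tree (sched_expqueue, sorted by p_deadline_set) that expired processes
are drained into and served from first. A compilable userspace toy of
the deadline tree, using the same sys/tree.h macros (builds on OpenBSD;
the names here are made up):

	#include <sys/tree.h>
	#include <sys/time.h>
	#include <stdio.h>

	struct task {
		RB_ENTRY(task) t_entry;
		struct timeval t_deadline;
		const char *t_name;
	};

	static int
	task_cmp(struct task *a, struct task *b)
	{
		if (a == b)
			return (0);
		/* never 0 for distinct tasks, like sched_cmp_proc() */
		return (timercmp(&a->t_deadline, &b->t_deadline, <) ? -1 : 1);
	}

	RB_HEAD(taskq, task) runq = RB_INITIALIZER(&runq);
	RB_GENERATE_STATIC(taskq, task, t_entry, task_cmp);

	int
	main(void)
	{
		struct task early = { .t_deadline = { 1, 0 }, .t_name = "early" };
		struct task late = { .t_deadline = { 2, 0 }, .t_name = "late" };

		RB_INSERT(taskq, &runq, &early);
		RB_INSERT(taskq, &runq, &late);

		/* the next process to run is simply the tree minimum */
		printf("next: %s\n", RB_MIN(taskq, &runq)->t_name);
		return (0);
	}
]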
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.99
diff -u -r1.99 kern_synch.c
--- kern/kern_synch.c   17 Jan 2012 02:34:18 -0000      1.99
+++ kern/kern_synch.c   17 Jan 2012 16:10:10 -0000
@@ -205,7 +205,7 @@
        p->p_wmesg = wmesg;
        p->p_slptime = 0;
        p->p_priority = prio & PRIMASK;
-       TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
+       TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
 }
 
 void
@@ -342,7 +342,7 @@
 unsleep(struct proc *p)
 {
        if (p->p_wchan) {
-               TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq);
+               TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
                p->p_wchan = NULL;
        }
 }
@@ -361,7 +361,7 @@
        SCHED_LOCK(s);
        qp = &slpque[LOOKUP(ident)];
        for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
-               pnext = TAILQ_NEXT(p, p_runq);
+               pnext = TAILQ_NEXT(p, p_slpq);
 #ifdef DIAGNOSTIC
                if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
                        panic("wakeup: p_stat is %d", (int)p->p_stat);
@@ -369,7 +369,7 @@
                if (p->p_wchan == ident) {
                        --n;
                        p->p_wchan = 0;
-                       TAILQ_REMOVE(qp, p, p_runq);
+                       TAILQ_REMOVE(qp, p, p_slpq);
                        if (p->p_stat == SSLEEP) {
                                /* OPTIMIZED EXPANSION OF setrunnable(p); */
                                if (p->p_slptime > 1)
Index: kern/sched_bsd.c
===================================================================
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.27
diff -u -r1.27 sched_bsd.c
--- kern/sched_bsd.c    7 Jul 2011 18:00:33 -0000       1.27
+++ kern/sched_bsd.c    17 Jan 2012 16:10:10 -0000
@@ -77,37 +77,23 @@
 
        timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
 
-       rrticks_init = hz / 10;
+       rrticks_init = hz / 50;
        schedcpu(&schedcpu_to);
 }
 
 /*
- * Force switch among equal priority processes every 100ms.
+ * Force switch among equal priority processes every 20ms.
  */
 void
 roundrobin(struct cpu_info *ci)
 {
        struct schedstate_percpu *spc = &ci->ci_schedstate;
 
-       spc->spc_rrticks = rrticks_init;
-
        if (ci->ci_curproc != NULL) {
-               if (spc->spc_schedflags & SPCF_SEENRR) {
-                       /*
-                        * The process has already been through a roundrobin
-                        * without switching and may be hogging the CPU.
-                        * Indicate that the process should yield.
-                        */
-                       atomic_setbits_int(&spc->spc_schedflags,
-                           SPCF_SHOULDYIELD);
-               } else {
-                       atomic_setbits_int(&spc->spc_schedflags,
-                           SPCF_SEENRR);
-               }
+               atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
        }
 
-       if (spc->spc_nrun)
-               need_resched(ci);
+       need_resched(ci);
 }
 
 /*
@@ -204,7 +190,6 @@
        struct timeout *to = (struct timeout *)arg;
        fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
        struct proc *p;
-       int s;
        unsigned int newcpu;
        int phz;
 
@@ -233,7 +218,6 @@
                 */
                if (p->p_slptime > 1)
                        continue;
-               SCHED_LOCK(s);
                /*
                 * p_pctcpu is only for ps.
                 */
@@ -250,17 +234,6 @@
                newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
                p->p_estcpu = newcpu;
                resetpriority(p);
-               if (p->p_priority >= PUSER) {
-                       if (p->p_stat == SRUN &&
-                           (p->p_priority / SCHED_PPQ) !=
-                           (p->p_usrpri / SCHED_PPQ)) {
-                               remrunqueue(p);
-                               p->p_priority = p->p_usrpri;
-                               setrunqueue(p);
-                       } else
-                               p->p_priority = p->p_usrpri;
-               }
-               SCHED_UNLOCK(s);
        }
        uvm_meter();
        wakeup(&lbolt);
@@ -278,8 +251,6 @@
        unsigned int newcpu = p->p_estcpu;
        fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 
-       SCHED_ASSERT_LOCKED();
-
        if (p->p_slptime > 5 * loadfac)
                p->p_estcpu = 0;
        else {
@@ -304,6 +275,7 @@
        SCHED_LOCK(s);
        p->p_priority = p->p_usrpri;
        p->p_stat = SRUN;
+       generate_deadline(p, 1);
        setrunqueue(p);
        p->p_stats->p_ru.ru_nvcsw++;
        mi_switch();
@@ -332,6 +304,7 @@
        p->p_priority = p->p_usrpri;
        p->p_stat = SRUN;
        p->p_cpu = sched_choosecpu(p);
+       generate_deadline(p, 0);
        setrunqueue(p);
        p->p_stats->p_ru.ru_nivcsw++;
        mi_switch();
@@ -372,14 +345,7 @@
         * process was running, and add that to its total so far.
         */
        microuptime(&tv);
-       if (timercmp(&tv, &spc->spc_runtime, <)) {
-#if 0
-               printf("uptime is not monotonic! "
-                   "tv=%lu.%06lu, runtime=%lu.%06lu\n",
-                   tv.tv_sec, tv.tv_usec, spc->spc_runtime.tv_sec,
-                   spc->spc_runtime.tv_usec);
-#endif
-       } else {
+       if (timercmp(&tv, &spc->spc_runtime, >=)) {
                timersub(&tv, &spc->spc_runtime, &tv);
                timeradd(&p->p_rtime, &tv, &p->p_rtime);
        }
@@ -403,7 +369,7 @@
         * Process is about to yield the CPU; clear the appropriate
         * scheduling flags.
         */
-       atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
+       atomic_clearbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
 
        nextproc = sched_chooseproc();
 
@@ -435,9 +401,8 @@
         * be running on a new CPU now, so don't use the cache'd
         * schedstate_percpu pointer.
         */
-       KASSERT(p->p_cpu == curcpu());
-
-       microuptime(&p->p_cpu->ci_schedstate.spc_runtime);
+       if (p->p_cpu == curcpu())
+               microuptime(&p->p_cpu->ci_schedstate.spc_runtime);
 
 #ifdef MULTIPROCESSOR
        /*
@@ -515,30 +480,56 @@
        }
        p->p_stat = SRUN;
        p->p_cpu = sched_choosecpu(p);
-       setrunqueue(p);
        if (p->p_slptime > 1)
                updatepri(p);
+       setrunqueue(p);
        p->p_slptime = 0;
        resched_proc(p, p->p_priority);
 }
 
 /*
  * Compute the priority of a process when running in user mode.
- * Arrange to reschedule if the resulting priority is better
- * than that of the current process.
  */
 void
 resetpriority(struct proc *p)
 {
-       unsigned int newpriority;
-
-       SCHED_ASSERT_LOCKED();
+       p->p_usrpri = min((NZERO + p->p_p->ps_nice) << 1, MAXPRI);
+}
 
-       newpriority = PUSER + p->p_estcpu +
-           NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
-       newpriority = min(newpriority, MAXPRI);
-       p->p_usrpri = newpriority;
-       resched_proc(p, p->p_usrpri);
+/*
+ * Generate a new virtual deadline for a process. The deadline is a soft
+ * one and has no purpose besides being used for choosing the next process
+ * to run. Also resets the number of round robin ticks available to the
+ * process if the previous timeslice expired and the process had to be
+ * preempted.
+ */
+void
+generate_deadline(struct proc *p, char voluntary) {
+       /*
+        * For nice values between 0 and 39 inclusive, the offset lies
+        * between 16 and 640 milliseconds on a machine with hz=100
+        * (rrticks_init = hz / 50 = 2). That means that processes with
+        * nice value 0 (i.e. -20 in userland) will be executed at most
+        * 16 milliseconds in the future, while processes with very low
+        * priority will be executed at most 640 milliseconds in the
+        * future. The shift spreads the deadlines out so that the
+        * scheduler does not degenerate to round robin behaviour when
+        * more than just a few processes with high priority are started.
+        *
+        * If the process voluntarily yielded its CPU, we reward it by
+        * halving its deadline offset.
+        */
+       unsigned int offset_msec =
+           ((p->p_p->ps_nice + 1) * rrticks_init) << (voluntary ? 2 : 3);
+       struct timeval offset = {
+               .tv_sec  = offset_msec / 1000,
+               .tv_usec = (offset_msec % 1000) * 1000
+       };
+       struct timeval now;
+       microuptime(&now);
+       bcopy(&now, &(p->p_deadline_set), sizeof(struct timeval));
+
+       timeradd(&now, &offset, &(p->p_deadline));
+       if (!voluntary)
+               p->p_rrticks = rrticks_init;
 }
 
 /*
@@ -559,12 +550,8 @@
 void
 schedclock(struct proc *p)
 {
-       int s;
-
-       SCHED_LOCK(s);
        p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
        resetpriority(p);
        if (p->p_priority >= PUSER)
                p->p_priority = p->p_usrpri;
-       SCHED_UNLOCK(s);
 }
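
[Numbers for generate_deadline(), assuming hz=100 and therefore
rrticks_init = hz / 50 = 2:

	offset_msec = ((ps_nice + 1) * rrticks_init) << (voluntary ? 2 : 3)

	nice -20 (ps_nice  0), preempted:	 1 * 2 << 3 =  16 ms
	nice -20 (ps_nice  0), voluntary:	 1 * 2 << 2 =   8 ms
	nice +19 (ps_nice 39), preempted:	40 * 2 << 3 = 640 ms
	nice +19 (ps_nice 39), voluntary:	40 * 2 << 2 = 320 ms

So a process that burns its full timeslice is pushed at least 16 ms
into the future, the penalty grows linearly with nice, and a voluntary
sleeper gets its next deadline at half that distance.]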
