On 2016/07/06 21:14, Martin Pieuchot wrote:
> Please don't try this diff blindly; it won't make your machine faster.
>
> In recent months I've been looking more closely at our scheduler.
> At p2k16 I showed a handful of developers that when running a
> browser on my x220 with HT enabled (typical desktop usage), the per-
> CPU runqueues were never balanced. You often have no job on one CPU
> and multiple jobs on the others.
>
> Currently, when a CPU doesn't have any job on its runqueue, it tries
> to "steal" a job from another CPU's runqueue. If I look at the stats
> on my machine running a lot of threaded apps (GNOME3, Thunderbird,
> Firefox, Chrome), here's what I get:
>
> # pstat -d ld sched_stolen sched_choose sched_wasidle
> sched_stolen: 1665846
> sched_choose: 3195615
> sched_wasidle: 1309253
>
> Of the ~3.2M jobs dispatched, ~1.7M got stolen. That's roughly 50% of
> the jobs on my machine, and this ratio is stable for my usage.
>
> On my test machine, an Atom with HT, I got the following numbers:
>
> - after boot:
> sched_stolen: 570
> sched_choose: 10450
> sched_wasidle: 8936
>
> - after playing a video on youtube w/ firefox:
> sched_stolen: 2153754
> sched_choose: 10261682
> sched_wasidle: 1525801
>
> - after playing a video on youtube w/ chromium (after reboot):
> sched_stolen: 310000
> sched_choose: 6470258
> sched_wasidle: 934772
>
> What's interesting here is that threaded apps (like firefox) seem to
> trigger more "stealing". It would be interesting to see if/how this
> is related to the yield-busy-wait triggered by librthread's thrsleep()
> usage, as explained some months ago.
>
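A minimal sketch of the yield-busy-wait pattern in question, assuming a
try_lock()-style primitive (this is not the actual librthread code):

	/* try_lock() is a stand-in name, not a real librthread call. */
	while (!try_lock(&lock))
		sched_yield();	/* back on the runqueue, ready to be stolen */

Each failed attempt is a full trip through the scheduler: sched_yield()
puts the thread straight back on a runqueue, where an idle CPU is free
to steal it, so a few contended threads spinning like this can drive
sched_stolen up very quickly.
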
> What's also interesting is that the number of stolen jobs seems to
> be higher if your number of CPUs is higher. Elementary, My Dear Watson?
> I observed that for the same workload, playing an HD video in firefox
> while compiling a kernel with make -j4, I see 50% stolen jobs with
> 4 CPUs and 20% with 2 CPUs. Sadly I don't have a bigger machine to
> test on. How bad can it be?
>
> So I looked at how this situation could be improved. My goal was to
> be able to compile a kernel while watching a video in my browser without
> having my audio stutter. I started by removing the "stealing" logic but
> the situation didn't improve. Then I tried to play with the calculation
> of the cost and failed. Then I decided to completely remove the per-CPU
> runqueues and came up with the diff below...
>
> There are too many things that I still don't understand, so I'm not
> asking for an ok, but I'd appreciate it if people could test this diff
> and report back.
> My goal is currently to get a better understanding of our scheduler to
> hopefully improve it.
>
> By using a single runqueue I prioritise latency over throughput. That
> means your performance might degrade, but at least I can watch my HD
> video while doing a "make -j4".
>
> As a bonus, the diff below also greatly reduces the number of IPIs on my
> systems.
In case anyone is interested, here's a version of this diff against
-current. It helps a lot for me. I'm not watching HD video while doing
"make -j4", just things like trying to move the pointer around the screen
and type into a terminal while a map is loading in a browser.
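
For anyone who wants the gist before wading through the diff: it drops
the per-CPU runqueues, and with them the stealing and cost heuristics,
in favour of one global set of queues. Condensed from the diff below,
with SCHED_LOCK and the pegged-thread bookkeeping stripped out, the
core amounts to this sketch:

	/* One global set of runqueues instead of one set per CPU. */
	TAILQ_HEAD(, proc) sched_qs[SCHED_NQS];
	volatile uint32_t sched_whichqs;	/* bitmask of non-empty queues */

	void
	setrunqueue(struct proc *p)
	{
		int queue = p->p_priority >> 2;

		TAILQ_INSERT_TAIL(&sched_qs[queue], p, p_runq);
		sched_whichqs |= (1 << queue);
	}

	/*
	 * Selection walks down from the highest-priority queue; the
	 * real sched_select() also skips threads pegged to another
	 * CPU, which is why it is O(1) only when nothing is pegged.
	 */
	struct proc *
	sched_select_sketch(void)
	{
		struct proc *p;
		int queue;

		for (queue = 0; queue < SCHED_NQS; queue++) {
			if ((p = TAILQ_FIRST(&sched_qs[queue])) != NULL)
				return (p);
		}
		return (NULL);
	}

Since every CPU dequeues from the same queues there is nothing left to
steal or balance, which is where the latency-over-throughput trade-off,
and the drop in IPIs, comes from.
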
Index: sys/sched.h
===================================================================
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.41
diff -u -p -r1.41 sched.h
--- sys/sched.h 17 Mar 2016 13:18:47 -0000 1.41
+++ sys/sched.h 10 Dec 2016 22:24:15 -0000
@@ -89,9 +89,10 @@
#define SCHED_NQS 32 /* 32 run queues. */
+#ifdef _KERNEL
+
/*
* Per-CPU scheduler state.
- * XXX - expose to userland for now.
*/
struct schedstate_percpu {
struct timespec spc_runtime; /* time curproc started running */
@@ -102,23 +103,16 @@ struct schedstate_percpu {
int spc_rrticks; /* ticks until roundrobin() */
int spc_pscnt; /* prof/stat counter */
int spc_psdiv; /* prof/stat divisor */
+ unsigned int spc_npeg; /* nb. of pegged threads on runqueue */
struct proc *spc_idleproc; /* idle proc for this cpu */
- u_int spc_nrun; /* procs on the run queues */
fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
- TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
- volatile uint32_t spc_whichqs;
-
-#ifdef notyet
- struct proc *spc_reaper; /* dead proc reaper */
-#endif
LIST_HEAD(,proc) spc_deadproc;
volatile int spc_barrier; /* for sched_barrier() */
};
-#ifdef _KERNEL
/* spc_flags */
#define SPCF_SEENRR 0x0001 /* process has seen roundrobin() */
@@ -141,14 +135,13 @@ void roundrobin(struct cpu_info *);
void scheduler_start(void);
void userret(struct proc *p);
+void sched_init(void);
void sched_init_cpu(struct cpu_info *);
void sched_idle(void *);
void sched_exit(struct proc *);
void mi_switch(void);
void cpu_switchto(struct proc *, struct proc *);
struct proc *sched_chooseproc(void);
-struct cpu_info *sched_choosecpu(struct proc *);
-struct cpu_info *sched_choosecpu_fork(struct proc *parent, int);
void cpu_idle_enter(void);
void cpu_idle_cycle(void);
void cpu_idle_leave(void);
@@ -163,11 +156,11 @@ void sched_start_secondary_cpus(void);
void sched_stop_secondary_cpus(void);
#endif
-#define cpu_is_idle(ci) ((ci)->ci_schedstate.spc_whichqs == 0)
-
-void sched_init_runqueues(void);
void setrunqueue(struct proc *);
void remrunqueue(struct proc *);
+
+extern volatile uint32_t sched_whichqs;
+#define sched_qs_empty(ci) (sched_whichqs == 0)
/* Inherit the parent's scheduler history */
#define scheduler_fork_hook(parent, child) do {				\
Index: kern/sched_bsd.c
===================================================================
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.43
diff -u -p -r1.43 sched_bsd.c
--- kern/sched_bsd.c 9 Mar 2016 13:38:50 -0000 1.43
+++ kern/sched_bsd.c 10 Dec 2016 22:24:15 -0000
@@ -105,7 +105,7 @@ roundrobin(struct cpu_info *ci)
}
}
- if (spc->spc_nrun)
+ if (!sched_qs_empty(ci))
need_resched(ci);
}
@@ -300,6 +300,7 @@ yield(void)
SCHED_LOCK(s);
p->p_priority = p->p_usrpri;
p->p_stat = SRUN;
+ KASSERT(p->p_cpu != NULL);
setrunqueue(p);
p->p_ru.ru_nvcsw++;
mi_switch();
@@ -327,7 +328,7 @@ preempt(struct proc *newp)
SCHED_LOCK(s);
p->p_priority = p->p_usrpri;
p->p_stat = SRUN;
- p->p_cpu = sched_choosecpu(p);
+ KASSERT(p->p_cpu != NULL);
setrunqueue(p);
p->p_ru.ru_nivcsw++;
mi_switch();
@@ -418,6 +419,7 @@ mi_switch(void)
}
clear_resched(curcpu());
+ spc->spc_curpriority = p->p_usrpri;
SCHED_ASSERT_LOCKED();
@@ -454,25 +456,15 @@ mi_switch(void)
#endif
}
-static __inline void
+/*
+ * If the last CPU of thread ``p'' is currently running a lower
+ * priority thread, force a reschedule.
+ */
+static inline void
resched_proc(struct proc *p, u_char pri)
{
- struct cpu_info *ci;
+ struct cpu_info *ci = p->p_cpu;
- /*
- * XXXSMP
- * This does not handle the case where its last
- * CPU is running a higher-priority process, but every
- * other CPU is running a lower-priority process. There
- * are ways to handle this situation, but they're not
- * currently very pretty, and we also need to weigh the
- * cost of moving a process from one CPU to another.
- *
- * XXXSMP
- * There is also the issue of locking the other CPU's
- * sched state, which we currently do not do.
- */
- ci = (p->p_cpu != NULL) ? p->p_cpu : curcpu();
if (pri < ci->ci_schedstate.spc_curpriority)
need_resched(ci);
}
@@ -507,7 +499,7 @@ setrunnable(struct proc *p)
break;
}
p->p_stat = SRUN;
- p->p_cpu = sched_choosecpu(p);
+ KASSERT(p->p_cpu != NULL);
setrunqueue(p);
if (p->p_slptime > 1)
updatepri(p);
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.135
diff -u -p -r1.135 kern_synch.c
--- kern/kern_synch.c 13 Sep 2016 08:32:44 -0000 1.135
+++ kern/kern_synch.c 10 Dec 2016 22:24:15 -0000
@@ -300,6 +300,7 @@ sleep_finish(struct sleep_state *sls, in
mi_switch();
} else if (!do_sleep) {
unsleep(p);
+ p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri;
}
#ifdef DIAGNOSTIC
@@ -307,7 +308,6 @@ sleep_finish(struct sleep_state *sls, in
panic("sleep_finish !SONPROC");
#endif
- p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri;
SCHED_UNLOCK(sls->sls_s);
/*
Index: kern/kern_sched.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.43
diff -u -p -r1.43 kern_sched.c
--- kern/kern_sched.c 3 Jun 2016 15:21:23 -0000 1.43
+++ kern/kern_sched.c 10 Dec 2016 22:24:15 -0000
@@ -26,36 +26,37 @@
#include <sys/mutex.h>
#include <sys/task.h>
-#include <uvm/uvm_extern.h>
+TAILQ_HEAD(, proc) sched_qs[SCHED_NQS];
+volatile uint32_t sched_whichqs;
-void sched_kthreads_create(void *);
+#ifdef MULTIPROCESSOR
+struct taskq *sbartq;
+#endif
-int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
-struct proc *sched_steal_proc(struct cpu_info *);
+struct proc *sched_select(struct cpu_info *);
+void sched_kthreads_create(void *);
-/*
- * To help choosing which cpu should run which process we keep track
- * of cpus which are currently idle and which cpus have processes
- * queued.
- */
-struct cpuset sched_idle_cpus;
-struct cpuset sched_queued_cpus;
-struct cpuset sched_all_cpus;
+void
+sched_init(void)
+{
+ struct cpu_info *ci = curcpu();
+ int i;
-/*
- * Some general scheduler counters.
- */
-uint64_t sched_nmigrations; /* Cpu migration counter */
-uint64_t sched_nomigrations; /* Cpu no migration counter */
-uint64_t sched_noidle; /* Times we didn't pick the idle task */
-uint64_t sched_stolen; /* Times we stole proc from other cpus */
-uint64_t sched_choose; /* Times we chose a cpu */
-uint64_t sched_wasidle; /* Times we came out of idle */
+ for (i = 0; i < SCHED_NQS; i++)
+ TAILQ_INIT(&sched_qs[i]);
+ sched_whichqs = 0;
#ifdef MULTIPROCESSOR
-struct taskq *sbartq;
+ sbartq = taskq_create("sbar", 1, IPL_NONE,
+ TASKQ_MPSAFE | TASKQ_CANTSLEEP);
+ if (sbartq == NULL)
+ panic("unable to create sbar taskq");
#endif
+ ci->ci_randseed = (arc4random() & 0x7fffffff) + 1;
+ sched_init_cpu(ci);
+}
+
/*
* A few notes about cpu_switchto that is implemented in MD code.
*
@@ -74,30 +75,18 @@ struct taskq *sbartq;
*/
/*
- * sched_init_cpu is called from main() for the boot cpu, then it's the
- * responsibility of the MD code to call it for all other cpus.
+ * sched_init_cpu is called from sched_init() for the boot cpu, then
+ * it's the responsibility of the MD code to call it for all other cpus.
*/
void
sched_init_cpu(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
- int i;
-
- for (i = 0; i < SCHED_NQS; i++)
- TAILQ_INIT(&spc->spc_qs[i]);
spc->spc_idleproc = NULL;
-
- kthread_create_deferred(sched_kthreads_create, ci);
-
LIST_INIT(&spc->spc_deadproc);
- /*
- * Slight hack here until the cpuset code handles cpu_info
- * structures.
- */
- cpuset_init_cpu(ci);
- cpuset_add(&sched_all_cpus, ci);
+ kthread_create_deferred(sched_kthreads_create, ci);
}
void
@@ -115,10 +104,46 @@ sched_kthreads_create(void *v)
/* Name it as specified. */
snprintf(spc->spc_idleproc->p_comm, sizeof(spc->spc_idleproc->p_comm),
"idle%d", num);
+ /* Always triggers a reschedule when an idle thread is running. */
+ spc->spc_idleproc->p_usrpri = MAXPRI;
num++;
}
+/*
+ * Returns 1 if a CPU can idle, 0 otherwise.
+ */
+static inline int
+can_idle(struct cpu_info *ci)
+{
+#ifdef MULTIPROCESSOR
+ struct schedstate_percpu *spc = &ci->ci_schedstate;
+#endif /* MULTIPROCESSOR */
+
+ /*
+ * As soon as a wakeup() or roundrobin() called need_resched()
+ * for this CPU, it has to go through mi_switch() to clear the
+ * resched flag.
+ *
+ * Yes, it is racy as the thread that triggered the reschedule
+ * might already be executing on another CPU. In this case,
+ * if there's nothing else on the runqueue, this CPU will come
+ * back in its idle loop.
+ */
+ if (want_resched(ci))
+ return (0);
+
+ if (sched_qs_empty(ci))
+ return (1);
+
+#ifdef MULTIPROCESSOR
+ if ((spc->spc_schedflags & SPCF_SHOULDHALT) && (spc->spc_npeg == 0))
+ return (1);
+#endif /* MULTIPROCESSOR */
+
+ return (0);
+}
+
void
sched_idle(void *v)
{
@@ -136,19 +161,17 @@ sched_idle(void *v)
* just go away for a while.
*/
SCHED_LOCK(s);
- cpuset_add(&sched_idle_cpus, ci);
p->p_stat = SSLEEP;
p->p_cpu = ci;
atomic_setbits_int(&p->p_flag, P_CPUPEG);
mi_switch();
- cpuset_del(&sched_idle_cpus, ci);
SCHED_UNLOCK(s);
KASSERT(ci == curcpu());
KASSERT(curproc == spc->spc_idleproc);
while (1) {
- while (!cpu_is_idle(curcpu())) {
+ while (!can_idle(ci)) {
struct proc *dead;
SCHED_LOCK(s);
@@ -164,24 +187,20 @@ sched_idle(void *v)
splassert(IPL_NONE);
- cpuset_add(&sched_idle_cpus, ci);
cpu_idle_enter();
- while (spc->spc_whichqs == 0) {
+ while (!want_resched(ci)) {
#ifdef MULTIPROCESSOR
if (spc->spc_schedflags & SPCF_SHOULDHALT &&
(spc->spc_schedflags & SPCF_HALTED) == 0) {
- cpuset_del(&sched_idle_cpus, ci);
- SCHED_LOCK(s);
+ KASSERT(spc->spc_npeg == 0);
atomic_setbits_int(&spc->spc_schedflags,
- spc->spc_whichqs ? 0 : SPCF_HALTED);
- SCHED_UNLOCK(s);
+ SPCF_HALTED);
wakeup(spc);
}
-#endif
+#endif /* MULTIPROCESSOR */
cpu_idle_cycle();
}
cpu_idle_leave();
- cpuset_del(&sched_idle_cpus, ci);
}
}
@@ -216,100 +235,94 @@ sched_exit(struct proc *p)
SCHED_LOCK(s);
idle = spc->spc_idleproc;
idle->p_stat = SRUN;
+ idle->p_cpu = curcpu();
cpu_switchto(NULL, idle);
panic("cpu_switchto returned");
}
-/*
- * Run queue management.
- */
-void
-sched_init_runqueues(void)
-{
-#ifdef MULTIPROCESSOR
- sbartq = taskq_create("sbar", 1, IPL_NONE,
- TASKQ_MPSAFE | TASKQ_CANTSLEEP);
- if (sbartq == NULL)
- panic("unable to create sbar taskq");
-#endif
-}
-
void
setrunqueue(struct proc *p)
{
- struct schedstate_percpu *spc;
int queue = p->p_priority >> 2;
SCHED_ASSERT_LOCKED();
- spc = &p->p_cpu->ci_schedstate;
- spc->spc_nrun++;
- TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
- spc->spc_whichqs |= (1 << queue);
- cpuset_add(&sched_queued_cpus, p->p_cpu);
+ TAILQ_INSERT_TAIL(&sched_qs[queue], p, p_runq);
+ sched_whichqs |= (1 << queue);
- if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
- cpu_unidle(p->p_cpu);
+ if (p->p_flag & P_CPUPEG)
+ p->p_cpu->ci_schedstate.spc_npeg++;
}
void
remrunqueue(struct proc *p)
{
- struct schedstate_percpu *spc;
int queue = p->p_priority >> 2;
SCHED_ASSERT_LOCKED();
- spc = &p->p_cpu->ci_schedstate;
- spc->spc_nrun--;
- TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
- if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
- spc->spc_whichqs &= ~(1 << queue);
- if (spc->spc_whichqs == 0)
- cpuset_del(&sched_queued_cpus, p->p_cpu);
- }
+ TAILQ_REMOVE(&sched_qs[queue], p, p_runq);
+ if (TAILQ_EMPTY(&sched_qs[queue]))
+ sched_whichqs &= ~(1 << queue);
+
+ if (p->p_flag & P_CPUPEG)
+ p->p_cpu->ci_schedstate.spc_npeg--;
}
+/*
+ * Select the first thread that can run on cpu ``ci'' from the runqueue.
+ *
+ * This is O(1) when there's no pegged thread in the runqueue.
+ */
struct proc *
-sched_chooseproc(void)
+sched_select(struct cpu_info *ci)
{
- struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
+#ifdef MULTIPROCESSOR
+ struct schedstate_percpu *spc = &ci->ci_schedstate;
+#endif /* MULTIPROCESSOR */
struct proc *p;
int queue;
- SCHED_ASSERT_LOCKED();
+ if (sched_qs_empty(ci))
+ return (NULL);
+ for (queue = 0; queue < SCHED_NQS; queue++) {
+ TAILQ_FOREACH(p, &sched_qs[queue], p_runq) {
#ifdef MULTIPROCESSOR
- if (spc->spc_schedflags & SPCF_SHOULDHALT) {
- if (spc->spc_whichqs) {
- for (queue = 0; queue < SCHED_NQS; queue++) {
- while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
- remrunqueue(p);
- p->p_cpu = sched_choosecpu(p);
- setrunqueue(p);
- if (p->p_cpu == curcpu()) {
- KASSERT(p->p_flag & P_CPUPEG);
- goto again;
- }
- }
- }
+ /* Never run a thread pegged to another CPU. */
+ if ((p->p_flag & P_CPUPEG) && p->p_cpu != ci)
+ continue;
+
+ /* If it should halt, only run pegged threads. */
+ if ((spc->spc_schedflags & SPCF_SHOULDHALT) &&
+ (p->p_flag & P_CPUPEG) == 0)
+ continue;
+#endif /* MULTIPROCESSOR */
+
+ return (p);
}
- p = spc->spc_idleproc;
- KASSERT(p);
- KASSERT(p->p_wchan == NULL);
- p->p_stat = SRUN;
- return (p);
}
-#endif
+
+ return (NULL);
+}
+
+struct proc *
+sched_chooseproc(void)
+{
+ struct cpu_info *ci = curcpu();
+ struct proc *p = NULL;
+
+ SCHED_ASSERT_LOCKED();
again:
- if (spc->spc_whichqs) {
- queue = ffs(spc->spc_whichqs) - 1;
- p = TAILQ_FIRST(&spc->spc_qs[queue]);
+ p = sched_select(ci);
+
+ if (p != NULL) {
remrunqueue(p);
- sched_noidle++;
KASSERT(p->p_stat == SRUN);
- } else if ((p = sched_steal_proc(curcpu())) == NULL) {
+ } else {
+ struct schedstate_percpu *spc = &ci->ci_schedstate;
+
p = spc->spc_idleproc;
if (p == NULL) {
int s;
@@ -328,263 +341,11 @@ again:
}
KASSERT(p);
p->p_stat = SRUN;
- }
-
- KASSERT(p->p_wchan == NULL);
- return (p);
-}
-
-struct cpu_info *
-sched_choosecpu_fork(struct proc *parent, int flags)
-{
-#ifdef MULTIPROCESSOR
- struct cpu_info *choice = NULL;
- fixpt_t load, best_load = ~0;
- int run, best_run = INT_MAX;
- struct cpu_info *ci;
- struct cpuset set;
-
-#if 0
- /*
- * XXX
- * Don't do this until we have a painless way to move the cpu in exec.
- * Preferably when nuking the old pmap and getting a new one on a
- * new cpu.
- */
- /*
- * PPWAIT forks are simple. We know that the parent will not
- * run until we exec and choose another cpu, so we just steal its
- * cpu.
- */
- if (flags & FORK_PPWAIT)
- return (parent->p_cpu);
-#endif
-
- /*
- * Look at all cpus that are currently idle and have nothing queued.
- * If there are none, pick the one with least queued procs first,
- * then the one with lowest load average.
- */
- cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
- cpuset_intersection(&set, &set, &sched_all_cpus);
- if (cpuset_first(&set) == NULL)
- cpuset_copy(&set, &sched_all_cpus);
-
- while ((ci = cpuset_first(&set)) != NULL) {
- cpuset_del(&set, ci);
-
- load = ci->ci_schedstate.spc_ldavg;
- run = ci->ci_schedstate.spc_nrun;
-
- if (choice == NULL || run < best_run ||
- (run == best_run &&load < best_load)) {
- choice = ci;
- best_load = load;
- best_run = run;
- }
- }
-
- return (choice);
-#else
- return (curcpu());
-#endif
-}
-
-struct cpu_info *
-sched_choosecpu(struct proc *p)
-{
-#ifdef MULTIPROCESSOR
- struct cpu_info *choice = NULL;
- int last_cost = INT_MAX;
- struct cpu_info *ci;
- struct cpuset set;
-
- /*
- * If pegged to a cpu, don't allow it to move.
- */
- if (p->p_flag & P_CPUPEG)
- return (p->p_cpu);
-
- sched_choose++;
-
- /*
- * Look at all cpus that are currently idle and have nothing queued.
- * If there are none, pick the cheapest of those.
- * (idle + queued could mean that the cpu is handling an interrupt
- * at this moment and haven't had time to leave idle yet).
- */
- cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
- cpuset_intersection(&set, &set, &sched_all_cpus);
-
- /*
- * First, just check if our current cpu is in that set, if it is,
- * this is simple.
- * Also, our cpu might not be idle, but if it's the current cpu
- * and it has nothing else queued and we're curproc, take it.
- */
- if (cpuset_isset(&set, p->p_cpu) ||
- (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
- (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
- curproc == p)) {
- sched_wasidle++;
- return (p->p_cpu);
}
- if (cpuset_first(&set) == NULL)
- cpuset_copy(&set, &sched_all_cpus);
-
- while ((ci = cpuset_first(&set)) != NULL) {
- int cost = sched_proc_to_cpu_cost(ci, p);
-
- if (choice == NULL || cost < last_cost) {
- choice = ci;
- last_cost = cost;
- }
- cpuset_del(&set, ci);
- }
-
- if (p->p_cpu != choice)
- sched_nmigrations++;
- else
- sched_nomigrations++;
-
- return (choice);
-#else
- return (curcpu());
-#endif
-}
-
-/*
- * Attempt to steal a proc from some cpu.
- */
-struct proc *
-sched_steal_proc(struct cpu_info *self)
-{
- struct proc *best = NULL;
-#ifdef MULTIPROCESSOR
- struct schedstate_percpu *spc;
- int bestcost = INT_MAX;
- struct cpu_info *ci;
- struct cpuset set;
-
- KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);
-
- cpuset_copy(&set, &sched_queued_cpus);
-
- while ((ci = cpuset_first(&set)) != NULL) {
- struct proc *p;
- int queue;
- int cost;
-
- cpuset_del(&set, ci);
-
- spc = &ci->ci_schedstate;
-
- queue = ffs(spc->spc_whichqs) - 1;
- TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
- if (p->p_flag & P_CPUPEG)
- continue;
-
- cost = sched_proc_to_cpu_cost(self, p);
-
- if (best == NULL || cost < bestcost) {
- best = p;
- bestcost = cost;
- }
- }
- }
- if (best == NULL)
- return (NULL);
-
- spc = &best->p_cpu->ci_schedstate;
- remrunqueue(best);
- best->p_cpu = self;
-
- sched_stolen++;
-#endif
- return (best);
-}
-
-#ifdef MULTIPROCESSOR
-/*
- * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
- */
-static int
-log2(unsigned int i)
-{
- int ret = 0;
-
- while (i >>= 1)
- ret++;
-
- return (ret);
-}
-
-/*
- * Calculate the cost of moving the proc to this cpu.
- *
- * What we want is some guesstimate of how much "performance" it will
- * cost us to move the proc here. Not just for caches and TLBs and NUMA
- * memory, but also for the proc itself. A highly loaded cpu might not
- * be the best candidate for this proc since it won't get run.
- *
- * Just total guesstimates for now.
- */
-
-int sched_cost_load = 1;
-int sched_cost_priority = 1;
-int sched_cost_runnable = 3;
-int sched_cost_resident = 1;
-#endif
-
-int
-sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
-{
- int cost = 0;
-#ifdef MULTIPROCESSOR
- struct schedstate_percpu *spc;
- int l2resident = 0;
-
- spc = &ci->ci_schedstate;
-
- /*
- * First, account for the priority of the proc we want to move.
- * More willing to move, the lower the priority of the destination
- * and the higher the priority of the proc.
- */
- if (!cpuset_isset(&sched_idle_cpus, ci)) {
- cost += (p->p_priority - spc->spc_curpriority) *
- sched_cost_priority;
- cost += sched_cost_runnable;
- }
- if (cpuset_isset(&sched_queued_cpus, ci))
- cost += spc->spc_nrun * sched_cost_runnable;
-
- /*
- * Try to avoid the primary cpu as it handles hardware interrupts.
- *
- * XXX Needs to be revisited when we distribute interrupts
- * over cpus.
- */
- if (CPU_IS_PRIMARY(ci))
- cost += sched_cost_runnable;
-
- /*
- * Higher load on the destination means we don't want to go there.
- */
- cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
-
- /*
- * If the proc is on this cpu already, lower the cost by how much
- * it has been running and an estimate of its footprint.
- */
- if (p->p_cpu == ci && p->p_slptime == 0) {
- l2resident =
- log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
- cost -= l2resident * sched_cost_resident;
- }
-#endif
- return (cost);
+ KASSERT(p->p_wchan == NULL);
+ p->p_cpu = ci;
+ return (p);
}
/*
@@ -620,7 +381,6 @@ sched_start_secondary_cpus(void)
if (CPU_IS_PRIMARY(ci))
continue;
- cpuset_add(&sched_all_cpus, ci);
atomic_clearbits_int(&spc->spc_schedflags,
SPCF_SHOULDHALT | SPCF_HALTED);
}
@@ -640,7 +400,6 @@ sched_stop_secondary_cpus(void)
if (CPU_IS_PRIMARY(ci))
continue;
- cpuset_del(&sched_all_cpus, ci);
atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
}
CPU_INFO_FOREACH(cii, ci) {
@@ -697,14 +456,14 @@ sched_barrier(struct cpu_info *ci)
}
}
-#else
+#else /* MULTIPROCESSOR */
void
sched_barrier(struct cpu_info *ci)
{
}
-#endif
+#endif /* MULTIPROCESSOR */
/*
* Functions to manipulate cpu sets.
Index: kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.192
diff -u -p -r1.192 kern_fork.c
--- kern/kern_fork.c 7 Nov 2016 00:26:32 -0000 1.192
+++ kern/kern_fork.c 10 Dec 2016 22:24:15 -0000
@@ -491,7 +491,7 @@ fork1(struct proc *curp, int flags, void
if ((flags & FORK_IDLE) == 0) {
SCHED_LOCK(s);
p->p_stat = SRUN;
- p->p_cpu = sched_choosecpu_fork(curp, flags);
+ p->p_cpu = curcpu();
setrunqueue(p);
SCHED_UNLOCK(s);
} else
Index: kern/kern_clock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.91
diff -u -p -r1.91 kern_clock.c
--- kern/kern_clock.c 4 Sep 2016 09:22:29 -0000 1.91
+++ kern/kern_clock.c 10 Dec 2016 22:24:15 -0000
@@ -400,7 +400,8 @@ statclock(struct clockframe *frame)
spc->spc_pscnt = psdiv;
if (p != NULL) {
- p->p_cpticks++;
+ if (p != spc->spc_idleproc)
+ p->p_cpticks++;
/*
* If no schedclock is provided, call it here at ~~12-25 Hz;
* ~~16 Hz is best
Index: kern/init_main.c
===================================================================
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.263
diff -u -p -r1.263 init_main.c
--- kern/init_main.c 14 Nov 2016 10:32:46 -0000 1.263
+++ kern/init_main.c 10 Dec 2016 22:24:15 -0000
@@ -333,11 +333,7 @@ main(void *framep)
*/
(void)chgproccnt(0, 1);
- /* Initialize run queues */
- sched_init_runqueues();
sleep_queue_init();
- sched_init_cpu(curcpu());
- p->p_cpu->ci_randseed = (arc4random() & 0x7fffffff) + 1;
/* Initialize timeouts in process context. */
timeout_proc_init();
@@ -347,6 +343,9 @@ main(void *framep)
/* Initialize the interface/address trees */
ifinit();
+
+ /* Initialize the scheduler */
+ sched_init();
/* Lock the kernel on behalf of proc0. */
KERNEL_LOCK();
Index: dev/acpi/acpicpu.c
===================================================================
RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v
retrieving revision 1.78
diff -u -p -r1.78 acpicpu.c
--- dev/acpi/acpicpu.c 18 Sep 2016 23:56:45 -0000 1.78
+++ dev/acpi/acpicpu.c 10 Dec 2016 22:24:15 -0000
@@ -1210,7 +1210,7 @@ acpicpu_idle(void)
#endif
/* something already queued? */
- if (!cpu_is_idle(ci))
+ if (want_resched(ci))
return;
/*
@@ -1226,7 +1226,7 @@ acpicpu_idle(void)
hints = (unsigned)best->address;
microuptime(&start);
atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING);
- if (cpu_is_idle(ci)) {
+ if (!want_resched(ci)) {
/* intel errata AAI65: cflush before monitor */
if (ci->ci_cflushsz != 0) {
membar_sync();
Index: arch/sparc64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/include/cpu.h,v
retrieving revision 1.89
diff -u -p -r1.89 cpu.h
--- arch/sparc64/include/cpu.h 17 Aug 2016 11:09:01 -0000 1.89
+++ arch/sparc64/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -234,8 +234,9 @@ extern void (*cpu_start_clock)(void);
* Preempt the current process if in interrupt from user mode,
* or after the current trap/syscall if in system mode.
*/
-extern void need_resched(struct cpu_info *);
-#define clear_resched(ci) (ci)->ci_want_resched = 0
+void need_resched(struct cpu_info *);
+#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
/*
* This is used during profiling to integrate system time.
Index: arch/sh/sh/trap.c
===================================================================
RCS file: /cvs/src/sys/arch/sh/sh/trap.c,v
retrieving revision 1.36
diff -u -p -r1.36 trap.c
--- arch/sh/sh/trap.c 8 Oct 2016 05:49:09 -0000 1.36
+++ arch/sh/sh/trap.c 10 Dec 2016 22:24:15 -0000
@@ -483,7 +483,7 @@ ast(struct proc *p, struct trapframe *tf
p->p_md.md_astpending = 0;
refreshcreds(p);
uvmexp.softs++;
- mi_ast(p, want_resched);
+ mi_ast(p, want_resched(curcpu()));
userret(p);
}
}
Index: arch/sh/sh/locore_c.c
===================================================================
RCS file: /cvs/src/sys/arch/sh/sh/locore_c.c,v
retrieving revision 1.12
diff -u -p -r1.12 locore_c.c
--- arch/sh/sh/locore_c.c 18 Nov 2014 20:51:01 -0000 1.12
+++ arch/sh/sh/locore_c.c 10 Dec 2016 22:24:15 -0000
@@ -121,7 +121,7 @@
void (*__sh_switch_resume)(struct proc *);
void cpu_switch_prepare(struct proc *, struct proc *);
-int want_resched;
+int cpu_want_resched;
/*
* Prepare context switch from oproc to nproc.
Index: arch/sh/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/sh/include/cpu.h,v
retrieving revision 1.27
diff -u -p -r1.27 cpu.h
--- arch/sh/include/cpu.h 11 Jul 2014 10:53:07 -0000 1.27
+++ arch/sh/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -105,17 +105,19 @@ struct clockframe {
#define PROC_PC(p) ((p)->p_md.md_regs->tf_spc)
#define PROC_STACK(p) ((p)->p_md.md_regs->tf_r15)
+extern int cpu_want_resched; /* need_resched() was called */
/*
* Preempt the current process if in interrupt from user mode,
* or after the current trap/syscall if in system mode.
*/
#define need_resched(ci)						\
do { \
- want_resched = 1; \
+ cpu_want_resched = 1; \
if (curproc != NULL) \
- aston(curproc); \
+ aston(curproc); \
} while (/*CONSTCOND*/0)
-#define clear_resched(ci) want_resched = 0
+#define clear_resched(ci) cpu_want_resched = 0
+#define want_resched(ci) (cpu_want_resched)
/*
* Give a profiling tick to the current process when the user profiling
@@ -131,8 +133,6 @@ do {						\
#define signotify(p) aston(p)
#define aston(p) ((p)->p_md.md_astpending = 1)
-
-extern int want_resched; /* need_resched() was called */
/*
* We need a machine-independent name for this.
Index: arch/powerpc/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/powerpc/include/cpu.h,v
retrieving revision 1.63
diff -u -p -r1.63 cpu.h
--- arch/powerpc/include/cpu.h 7 May 2016 22:46:54 -0000 1.63
+++ arch/powerpc/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -181,7 +181,8 @@ do {						\
if (ci->ci_curproc != NULL) \
aston(ci->ci_curproc); \
} while (0)
-#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
#define need_proftick(p) aston(p)
Index: arch/mips64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/mips64/include/cpu.h,v
retrieving revision 1.111
diff -u -p -r1.111 cpu.h
--- arch/mips64/include/cpu.h 14 Aug 2016 08:23:52 -0000 1.111
+++ arch/mips64/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -305,6 +305,7 @@ void cp0_calibrate(struct cpu_info *);
aston((ci)->ci_curproc); \
} while(0)
#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
/*
* Give a profiling tick to the current process when the user profiling
Index: arch/m88k/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/m88k/include/cpu.h,v
retrieving revision 1.64
diff -u -p -r1.64 cpu.h
--- arch/m88k/include/cpu.h 2 Jul 2015 01:33:59 -0000 1.64
+++ arch/m88k/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -274,7 +274,9 @@ struct clockframe {
#define PROC_PC(p) PC_REGS((struct reg *)((p)->p_md.md_tf))
#define PROC_STACK(p) ((p)->p_md.md_tf->tf_sp)
+void need_resched(struct cpu_info *);
#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
/*
* Give a profiling tick to the current process when the user profiling
@@ -283,7 +285,6 @@ struct clockframe {
*/
#define need_proftick(p) aston(p)
-void need_resched(struct cpu_info *);
void signotify(struct proc *);
void softipi(void);
Index: arch/i386/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/i386/include/cpu.h,v
retrieving revision 1.150
diff -u -p -r1.150 cpu.h
--- arch/i386/include/cpu.h 21 Oct 2016 06:20:58 -0000 1.150
+++ arch/i386/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -286,14 +286,14 @@ void cpu_unidle(struct cpu_info *);
#define curpcb curcpu()->ci_curpcb
-#define want_resched (curcpu()->ci_want_resched)
-
/*
* Preempt the current process if in interrupt from user mode,
* or after the current trap/syscall if in system mode.
*/
-extern void need_resched(struct cpu_info *);
-#define clear_resched(ci) (ci)->ci_want_resched = 0
+void need_resched(struct cpu_info *);
+#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
+
#define CLKF_USERMODE(frame) USERMODE((frame)->if_cs, (frame)->if_eflags)
#define CLKF_PC(frame) ((frame)->if_eip)
Index: arch/i386/i386/trap.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/trap.c,v
retrieving revision 1.126
diff -u -p -r1.126 trap.c
--- arch/i386/i386/trap.c 8 Oct 2016 05:49:08 -0000 1.126
+++ arch/i386/i386/trap.c 10 Dec 2016 22:24:15 -0000
@@ -528,7 +528,7 @@ ast(struct trapframe *frame)
p->p_md.md_regs = frame;
refreshcreds(p);
uvmexp.softs++;
- mi_ast(p, want_resched);
+ mi_ast(p, want_resched(curcpu()));
userret(p);
}
Index: arch/i386/i386/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/cpu.c,v
retrieving revision 1.80
diff -u -p -r1.80 cpu.c
--- arch/i386/i386/cpu.c 21 Oct 2016 06:20:58 -0000 1.80
+++ arch/i386/i386/cpu.c 10 Dec 2016 22:24:15 -0000
@@ -780,7 +780,7 @@ cpu_idle_mwait_cycle(void)
panic("idle with interrupts blocked!");
/* something already queued? */
- if (!cpu_is_idle(ci))
+ if (want_resched(ci))
return;
/*
@@ -794,7 +794,7 @@ cpu_idle_mwait_cycle(void)
* the check in sched_idle() and here.
*/
atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY);
- if (cpu_is_idle(ci)) {
+ if (!want_resched(ci)) {
monitor(&ci->ci_mwait, 0, 0);
if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING)
mwait(0, 0);
Index: arch/hppa/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/hppa/include/cpu.h,v
retrieving revision 1.89
diff -u -p -r1.89 cpu.h
--- arch/hppa/include/cpu.h 10 May 2016 14:52:03 -0000 1.89
+++ arch/hppa/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -244,8 +244,13 @@ void cpu_unidle(struct cpu_info *);
#define cpu_unidle(ci)
#endif
-extern void need_resched(struct cpu_info *);
-#define clear_resched(ci) (ci)->ci_want_resched = 0
+/*
+ * Preempt the current process if in interrupt from user mode,
+ * or after the current trap/syscall if in system mode.
+ */
+void need_resched(struct cpu_info *);
+#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
#endif
Index: arch/arm/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/arm/include/cpu.h,v
retrieving revision 1.41
diff -u -p -r1.41 cpu.h
--- arch/arm/include/cpu.h 4 Apr 2016 09:13:44 -0000 1.41
+++ arch/arm/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -267,9 +267,10 @@ extern int astpending;
* Preempt the current process if in interrupt from user mode,
* or after the current trap/syscall if in system mode.
*/
-extern int want_resched; /* resched() was called */
-#define need_resched(ci) (want_resched = 1, setsoftast())
-#define clear_resched(ci) want_resched = 0
+extern int cpu_want_resched; /* need_resched() was called */
+#define need_resched(ci) (cpu_want_resched = 1, setsoftast())
+#define clear_resched(ci) cpu_want_resched = 0
+#define want_resched(ci) (cpu_want_resched)
/*
* Give a profiling tick to the current process when the user profiling
Index: arch/arm/arm/ast.c
===================================================================
RCS file: /cvs/src/sys/arch/arm/arm/ast.c,v
retrieving revision 1.14
diff -u -p -r1.14 ast.c
--- arch/arm/arm/ast.c 18 Nov 2014 20:51:01 -0000 1.14
+++ arch/arm/arm/ast.c 10 Dec 2016 22:24:15 -0000
@@ -65,7 +65,7 @@
*/
void ast(struct trapframe *);
-int want_resched;
+int cpu_want_resched;
extern int astpending;
/*
@@ -91,7 +91,7 @@ ast(struct trapframe *tf)
#endif
uvmexp.softs++;
- mi_ast(p, want_resched);
+ mi_ast(p, want_resched(curcpu()));
userret(p);
}
Index: arch/amd64/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v
retrieving revision 1.106
diff -u -p -r1.106 cpu.h
--- arch/amd64/include/cpu.h 13 Oct 2016 19:36:25 -0000 1.106
+++ arch/amd64/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -220,12 +220,13 @@ extern struct cpu_info *cpu_info_list;
#define CPU_INFO_UNIT(ci) ((ci)->ci_dev ? (ci)->ci_dev->dv_unit : 0)
-/*
+/*
* Preempt the current process if in interrupt from user mode,
* or after the current trap/syscall if in system mode.
*/
-extern void need_resched(struct cpu_info *);
-#define clear_resched(ci) (ci)->ci_want_resched = 0
+void need_resched(struct cpu_info *);
+#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
#if defined(MULTIPROCESSOR)
Index: arch/amd64/amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.102
diff -u -p -r1.102 cpu.c
--- arch/amd64/amd64/cpu.c 28 Jul 2016 21:57:57 -0000 1.102
+++ arch/amd64/amd64/cpu.c 10 Dec 2016 22:24:15 -0000
@@ -253,7 +253,7 @@ cpu_idle_mwait_cycle(void)
panic("idle with interrupts blocked!");
/* something already queued? */
- if (!cpu_is_idle(ci))
+ if (want_resched(ci))
return;
/*
@@ -267,7 +267,7 @@ cpu_idle_mwait_cycle(void)
* the check in sched_idle() and here.
*/
atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY);
- if (cpu_is_idle(ci)) {
+ if (!want_resched(ci)) {
monitor(&ci->ci_mwait, 0, 0);
if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING)
mwait(0, 0);
Index: arch/alpha/include/cpu.h
===================================================================
RCS file: /cvs/src/sys/arch/alpha/include/cpu.h,v
retrieving revision 1.57
diff -u -p -r1.57 cpu.h
--- arch/alpha/include/cpu.h 30 Mar 2016 15:39:46 -0000 1.57
+++ arch/alpha/include/cpu.h 10 Dec 2016 22:24:15 -0000
@@ -301,7 +301,7 @@ do {						\
if ((ci)->ci_curproc != NULL) \
aston((ci)->ci_curproc); \
} while (/*CONSTCOND*/0)
-#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define clear_resched(ci) (ci)->ci_want_resched = 0
+#define want_resched(ci) ((ci)->ci_want_resched)
/*
* Give a profiling tick to the current process when the user profiling