Module Name:    src
Committed By:   ad
Date:           Sat Nov 23 19:42:52 UTC 2019

Modified Files:
        src/sys/compat/linux/common: linux_sched.c
        src/sys/kern: kern_exec.c kern_fork.c kern_idle.c kern_kthread.c
            kern_lwp.c kern_runq.c kern_sleepq.c kern_softint.c kern_synch.c
            sched_4bsd.c sys_aio.c sys_lwp.c
        src/sys/rump/librump/rumpkern: scheduler.c
        src/sys/sys: cpu.h lwp.h sched.h

Log Message:
Minor scheduler cleanup:

- Adapt to cpu_need_resched() changes. Avoid lost & duplicate IPIs and ASTs.
  sched_resched_cpu() and sched_resched_lwp() contain the logic for this; a
  simplified sketch of the idea follows after this list.
- Changes for LSIDL to make the locking scheme match the intended design.
- Reduce lock contention and false sharing further.
- Numerous small bugfixes, including some corrections for SCHED_FIFO/RT.
- Use setrunnable() in more places, and merge cut & pasted code.
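The heart of the IPI/AST handling mentioned above is a compare-and-swap loop
on ci_want_resched (see the kern_runq.c hunk further down).  As a rough
illustration only, here is a minimal userland sketch of that coalescing idea,
written with C11 atomics rather than the kernel's atomic_cas_uint(); the
RESCHED_* values follow the new sys/cpu.h flags in this commit, while
fake_cpu_t, send_notification() and the omission of the local-CPU shortcut
are hypothetical simplifications, not part of the committed code:

#include <stdatomic.h>
#include <stdio.h>

/* Mirrors the new flag values from the sys/cpu.h hunk in this commit. */
#define RESCHED_REMOTE          0x01    /* request is for a remote CPU */
#define RESCHED_IDLE            0x02    /* idle LWP observed */
#define RESCHED_UPREEMPT        0x04    /* immediate user ctx switch */
#define RESCHED_KPREEMPT        0x08    /* immediate kernel ctx switch */

/* Hypothetical stand-in for struct cpu_info; only the resched word. */
typedef struct {
        atomic_uint     ci_want_resched;
} fake_cpu_t;

/* Hypothetical stand-in for the IPI/AST delivery done by cpu_need_resched(). */
static void
send_notification(fake_cpu_t *ci, unsigned int f)
{
        (void)ci;
        printf("notify: flags 0x%x\n", f);
}

/*
 * Coalesce a resched request, in the spirit of sched_resched_cpu() in the
 * patch: the request flags are merged into ci_want_resched with a CAS loop,
 * and only the caller that installs a new or stronger request delivers the
 * notification, so IPIs/ASTs are neither lost nor duplicated.  Relies on
 * RESCHED_KPREEMPT being numerically greater than RESCHED_UPREEMPT, as the
 * cpu.h comment requires.
 */
static void
resched_cpu(fake_cpu_t *ci, unsigned int f)
{
        unsigned int o = 0;     /* optimistically assume nothing pending */

        for (;;) {
                if (atomic_compare_exchange_strong(&ci->ci_want_resched,
                    &o, o | f)) {
                        /* We installed or escalated the request: notify. */
                        send_notification(ci, f);
                        return;
                }
                /* CAS failed; 'o' now holds the flags already pending. */
                if ((o & (RESCHED_KPREEMPT | RESCHED_UPREEMPT)) >=
                    (f & (RESCHED_KPREEMPT | RESCHED_UPREEMPT))) {
                        /* Equal or stronger request already in flight. */
                        return;
                }
                /* A weaker request is pending: retry, escalating it. */
        }
}

int
main(void)
{
        fake_cpu_t cpu = { .ci_want_resched = 0 };

        resched_cpu(&cpu, RESCHED_REMOTE | RESCHED_UPREEMPT);  /* notifies */
        resched_cpu(&cpu, RESCHED_REMOTE | RESCHED_UPREEMPT);  /* coalesced */
        resched_cpu(&cpu, RESCHED_REMOTE | RESCHED_KPREEMPT);  /* escalates */
        return 0;
}

In this sketch the second call is absorbed because an equal-strength request
is already pending, while the third escalates to RESCHED_KPREEMPT and so
notifies again, matching the (n & ...) >= (f & ...) test in the real
sched_resched_cpu() below.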


To generate a diff of this commit:
cvs rdiff -u -r1.72 -r1.73 src/sys/compat/linux/common/linux_sched.c
cvs rdiff -u -r1.483 -r1.484 src/sys/kern/kern_exec.c
cvs rdiff -u -r1.215 -r1.216 src/sys/kern/kern_fork.c
cvs rdiff -u -r1.25 -r1.26 src/sys/kern/kern_idle.c
cvs rdiff -u -r1.43 -r1.44 src/sys/kern/kern_kthread.c
cvs rdiff -u -r1.211 -r1.212 src/sys/kern/kern_lwp.c
cvs rdiff -u -r1.47 -r1.48 src/sys/kern/kern_runq.c
cvs rdiff -u -r1.52 -r1.53 src/sys/kern/kern_sleepq.c
cvs rdiff -u -r1.49 -r1.50 src/sys/kern/kern_softint.c
cvs rdiff -u -r1.325 -r1.326 src/sys/kern/kern_synch.c
cvs rdiff -u -r1.35 -r1.36 src/sys/kern/sched_4bsd.c
cvs rdiff -u -r1.44 -r1.45 src/sys/kern/sys_aio.c
cvs rdiff -u -r1.70 -r1.71 src/sys/kern/sys_lwp.c
cvs rdiff -u -r1.44 -r1.45 src/sys/rump/librump/rumpkern/scheduler.c
cvs rdiff -u -r1.43 -r1.44 src/sys/sys/cpu.h
cvs rdiff -u -r1.189 -r1.190 src/sys/sys/lwp.h
cvs rdiff -u -r1.76 -r1.77 src/sys/sys/sched.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/compat/linux/common/linux_sched.c
diff -u src/sys/compat/linux/common/linux_sched.c:1.72 src/sys/compat/linux/common/linux_sched.c:1.73
--- src/sys/compat/linux/common/linux_sched.c:1.72	Thu Oct  3 22:16:53 2019
+++ src/sys/compat/linux/common/linux_sched.c	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,7 @@
-/*	$NetBSD: linux_sched.c,v 1.72 2019/10/03 22:16:53 kamil Exp $	*/
+/*	$NetBSD: linux_sched.c,v 1.73 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
- * Copyright (c) 1999 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2019 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.72 2019/10/03 22:16:53 kamil Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.73 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/mount.h>
@@ -180,7 +180,6 @@ linux_clone_nptl(struct lwp *l, const st
 	struct lwp *l2;
 	struct linux_emuldata *led;
 	void *parent_tidptr, *tls, *child_tidptr;
-	struct schedstate_percpu *spc;
 	vaddr_t uaddr;
 	lwpid_t lid;
 	int flags, tnprocs, error;
@@ -248,31 +247,8 @@ linux_clone_nptl(struct lwp *l, const st
 		}
 	}
 
-	/*
-	 * Set the new LWP running, unless the process is stopping,
-	 * then the LWP is created stopped.
-	 */
-	mutex_enter(p->p_lock);
-	lwp_lock(l2);
-	spc = &l2->l_cpu->ci_schedstate;
-	if ((l->l_flag & (LW_WREBOOT | LW_DBGSUSPEND | LW_WSUSPEND | LW_WEXIT)) == 0) {
-	    	if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
-			KASSERT(l2->l_wchan == NULL);
-	    		l2->l_stat = LSSTOP;
-			p->p_nrlwps--;
-			lwp_unlock_to(l2, spc->spc_lwplock);
-		} else {
-			KASSERT(lwp_locked(l2, spc->spc_mutex));
-			l2->l_stat = LSRUN;
-			sched_enqueue(l2, false);
-			lwp_unlock(l2);
-		}
-	} else {
-		l2->l_stat = LSSUSPENDED;
-		p->p_nrlwps--;
-		lwp_unlock_to(l2, spc->spc_lwplock);
-	}
-	mutex_exit(p->p_lock);
+	/* Set the new LWP running. */
+	lwp_start(l2, 0);
 
 	retval[0] = lid;
 	retval[1] = 0;

Index: src/sys/kern/kern_exec.c
diff -u src/sys/kern/kern_exec.c:1.483 src/sys/kern/kern_exec.c:1.484
--- src/sys/kern/kern_exec.c:1.483	Sat Oct 12 10:55:23 2019
+++ src/sys/kern/kern_exec.c	Sat Nov 23 19:42:52 2019
@@ -1,9 +1,12 @@
-/*	$NetBSD: kern_exec.c,v 1.483 2019/10/12 10:55:23 kamil Exp $	*/
+/*	$NetBSD: kern_exec.c,v 1.484 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
- * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -59,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.483 2019/10/12 10:55:23 kamil Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.484 2019/11/23 19:42:52 ad Exp $");
 
 #include "opt_exec.h"
 #include "opt_execfmt.h"
@@ -2650,11 +2653,11 @@ do_posix_spawn(struct lwp *l1, pid_t *pi
 
 	lwp_lock(l2);
 	KASSERT(p2->p_nrlwps == 1);
+	KASSERT(l2->l_stat == LSIDL);
 	p2->p_nrlwps = 1;
 	p2->p_stat = SACTIVE;
-	l2->l_stat = LSRUN;
-	sched_enqueue(l2, false);
-	lwp_unlock(l2);
+	setrunnable(l2);
+	/* LWP now unlocked */
 
 	mutex_exit(p2->p_lock);
 	mutex_exit(proc_lock);

Index: src/sys/kern/kern_fork.c
diff -u src/sys/kern/kern_fork.c:1.215 src/sys/kern/kern_fork.c:1.216
--- src/sys/kern/kern_fork.c:1.215	Sat Oct 12 10:55:23 2019
+++ src/sys/kern/kern_fork.c	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,8 @@
-/*	$NetBSD: kern_fork.c,v 1.215 2019/10/12 10:55:23 kamil Exp $	*/
+/*	$NetBSD: kern_fork.c,v 1.216 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
- * Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008, 2019
+ *     The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -67,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.215 2019/10/12 10:55:23 kamil Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.216 2019/11/23 19:42:52 ad Exp $");
 
 #include "opt_ktrace.h"
 #include "opt_dtrace.h"
@@ -561,21 +562,20 @@ fork1(struct lwp *l1, int flags, int exi
 	p2->p_acflag = AFORK;
 	lwp_lock(l2);
 	KASSERT(p2->p_nrlwps == 1);
+	KASSERT(l2->l_stat == LSIDL);
 	if (p2->p_sflag & PS_STOPFORK) {
-		struct schedstate_percpu *spc = &l2->l_cpu->ci_schedstate;
 		p2->p_nrlwps = 0;
 		p2->p_stat = SSTOP;
 		p2->p_waited = 0;
 		p1->p_nstopchild++;
 		l2->l_stat = LSSTOP;
 		KASSERT(l2->l_wchan == NULL);
-		lwp_unlock_to(l2, spc->spc_lwplock);
+		lwp_unlock(l2);
 	} else {
 		p2->p_nrlwps = 1;
 		p2->p_stat = SACTIVE;
-		l2->l_stat = LSRUN;
-		sched_enqueue(l2, false);
-		lwp_unlock(l2);
+		setrunnable(l2);
+		/* LWP now unlocked */
 	}
 
 	/*

Index: src/sys/kern/kern_idle.c
diff -u src/sys/kern/kern_idle.c:1.25 src/sys/kern/kern_idle.c:1.26
--- src/sys/kern/kern_idle.c:1.25	Sun Jan 29 22:55:40 2012
+++ src/sys/kern/kern_idle.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_idle.c,v 1.25 2012/01/29 22:55:40 rmind Exp $	*/
+/*	$NetBSD: kern_idle.c,v 1.26 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
  * Copyright (c)2002, 2006, 2007 YAMAMOTO Takashi,
@@ -28,7 +28,7 @@
 
 #include <sys/cdefs.h>
 
-__KERNEL_RCSID(0, "$NetBSD: kern_idle.c,v 1.25 2012/01/29 22:55:40 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_idle.c,v 1.26 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/cpu.h>
@@ -50,26 +50,26 @@ idle_loop(void *dummy)
 	struct lwp *l = curlwp;
 
 	kcpuset_atomic_set(kcpuset_running, cpu_index(ci));
+	spc = &ci->ci_schedstate;
 	ci->ci_data.cpu_onproc = l;
 
 	/* Update start time for this thread. */
 	lwp_lock(l);
+	KASSERT(lwp_locked(l, spc->spc_lwplock));
 	binuptime(&l->l_stime);
+	spc->spc_flags |= SPCF_RUNNING;
+	l->l_stat = LSONPROC;
+	l->l_pflag |= LP_RUNNING;
 	lwp_unlock(l);
 
 	/*
 	 * Use spl0() here to ensure that we have the correct interrupt
 	 * priority.  This may be the first thread running on the CPU,
-	 * in which case we took a dirtbag route to get here.
+	 * in which case we took an odd route to get here.
 	 */
-	spc = &ci->ci_schedstate;
-	(void)splsched();
-	spc->spc_flags |= SPCF_RUNNING;
 	spl0();
-
 	KERNEL_UNLOCK_ALL(l, NULL);
-	l->l_stat = LSONPROC;
-	l->l_pflag |= LP_RUNNING;
+
 	for (;;) {
 		LOCKDEBUG_BARRIER(NULL, 0);
 		KASSERT((l->l_flag & LW_IDLE) != 0);
@@ -113,7 +113,6 @@ create_idle_lwp(struct cpu_info *ci)
 	lwp_lock(l);
 	l->l_flag |= LW_IDLE;
 	lwp_unlock(l);
-	l->l_cpu = ci;
 	ci->ci_data.cpu_idlelwp = l;
 
 	return error;

Index: src/sys/kern/kern_kthread.c
diff -u src/sys/kern/kern_kthread.c:1.43 src/sys/kern/kern_kthread.c:1.44
--- src/sys/kern/kern_kthread.c:1.43	Tue Jan  9 22:58:45 2018
+++ src/sys/kern/kern_kthread.c	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,7 @@
-/*	$NetBSD: kern_kthread.c,v 1.43 2018/01/09 22:58:45 pgoyette Exp $	*/
+/*	$NetBSD: kern_kthread.c,v 1.44 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
- * Copyright (c) 1998, 1999, 2007, 2009 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 1999, 2007, 2009, 2019 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_kthread.c,v 1.43 2018/01/09 22:58:45 pgoyette Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_kthread.c,v 1.44 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -108,10 +108,10 @@ kthread_create(pri_t pri, int flag, stru
 	}
 	mutex_enter(proc0.p_lock);
 	lwp_lock(l);
-	l->l_priority = pri;
+	lwp_changepri(l, pri);
 	if (ci != NULL) {
 		if (ci != l->l_cpu) {
-			lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
+			lwp_unlock_to(l, ci->ci_schedstate.spc_lwplock);
 			lwp_lock(l);
 		}
 		l->l_pflag |= LP_BOUND;
@@ -133,15 +133,12 @@ kthread_create(pri_t pri, int flag, stru
 	 * Set the new LWP running, unless the caller has requested
 	 * otherwise.
 	 */
+	KASSERT(l->l_stat == LSIDL);
 	if ((flag & KTHREAD_IDLE) == 0) {
-		l->l_stat = LSRUN;
-		sched_enqueue(l, false);
-		lwp_unlock(l);
+		setrunnable(l);
+		/* LWP now unlocked */
 	} else {
-		if (ci != NULL)
-			lwp_unlock_to(l, ci->ci_schedstate.spc_lwplock);
-		else
-			lwp_unlock(l);
+		lwp_unlock(l);
 	}
 	mutex_exit(proc0.p_lock);
 

Index: src/sys/kern/kern_lwp.c
diff -u src/sys/kern/kern_lwp.c:1.211 src/sys/kern/kern_lwp.c:1.212
--- src/sys/kern/kern_lwp.c:1.211	Thu Nov 21 19:47:21 2019
+++ src/sys/kern/kern_lwp.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_lwp.c,v 1.211 2019/11/21 19:47:21 ad Exp $	*/
+/*	$NetBSD: kern_lwp.c,v 1.212 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2019 The NetBSD Foundation, Inc.
@@ -161,22 +161,23 @@
  *
  *	States and their associated locks:
  *
- *	LSONPROC, LSZOMB:
+ *	LSIDL, LSONPROC, LSZOMB, LSSUPENDED:
  *
- *		Always covered by spc_lwplock, which protects running LWPs.
- *		This is a per-CPU lock and matches lwp::l_cpu.
+ *		Always covered by spc_lwplock, which protects LWPs not
+ *		associated with any other sync object.  This is a per-CPU
+ *		lock and matches lwp::l_cpu.
  *
- *	LSIDL, LSRUN:
+ *	LSRUN:
  *
  *		Always covered by spc_mutex, which protects the run queues.
  *		This is a per-CPU lock and matches lwp::l_cpu.
  *
  *	LSSLEEP:
  *
- *		Covered by a lock associated with the sleep queue that the
- *		LWP resides on.  Matches lwp::l_sleepq::sq_mutex.
+ *		Covered by a lock associated with the sleep queue (sometimes
+ *		a turnstile sleep queue) that the LWP resides on.
  *
- *	LSSTOP, LSSUSPENDED:
+ *	LSSTOP:
  *
  *		If the LWP was previously sleeping (l_wchan != NULL), then
  *		l_mutex references the sleep queue lock.  If the LWP was
@@ -185,10 +186,7 @@
  *
  *	The lock order is as follows:
  *
- *		spc::spc_lwplock ->
- *		    sleeptab::st_mutex ->
- *			tschain_t::tc_mutex ->
- *			    spc::spc_mutex
+ *		sleepq -> turnstile -> spc_lwplock -> spc_mutex
  *
  *	Each process has an scheduler state lock (proc::p_lock), and a
  *	number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
@@ -199,7 +197,7 @@
  *		LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
  *
  *	(But not always for kernel threads.  There are some special cases
- *	as mentioned above.  See kern_softint.c.)
+ *	as mentioned above: soft interrupts, and the idle loops.)
  *
  *	Note that an LWP is considered running or likely to run soon if in
  *	one of the following states.  This affects the value of p_nrlwps:
@@ -211,7 +209,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.211 2019/11/21 19:47:21 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.212 2019/11/23 19:42:52 ad Exp $");
 
 #include "opt_ddb.h"
 #include "opt_lockdebug.h"
@@ -841,7 +839,7 @@ lwp_create(lwp_t *l1, proc_t *p2, vaddr_
 	l2->l_inheritedprio = -1;
 	l2->l_protectprio = -1;
 	l2->l_auxprio = -1;
-	l2->l_flag = 0;
+	l2->l_flag = (l1->l_flag & (LW_WEXIT | LW_WREBOOT | LW_WCORE));
 	l2->l_pflag = LP_MPSAFE;
 	TAILQ_INIT(&l2->l_ld_locks);
 	l2->l_psrefs = 0;
@@ -874,7 +872,7 @@ lwp_create(lwp_t *l1, proc_t *p2, vaddr_
 	}
 
 	kpreempt_disable();
-	l2->l_mutex = l1->l_cpu->ci_schedstate.spc_mutex;
+	l2->l_mutex = l1->l_cpu->ci_schedstate.spc_lwplock;
 	l2->l_cpu = l1->l_cpu;
 	kpreempt_enable();
 
@@ -984,6 +982,35 @@ lwp_create(lwp_t *l1, proc_t *p2, vaddr_
 }
 
 /*
+ * Set a new LWP running.  If the process is stopping, then the LWP is
+ * created stopped.
+ */
+void
+lwp_start(lwp_t *l, int flags)
+{
+	proc_t *p = l->l_proc;
+
+	mutex_enter(p->p_lock);
+	lwp_lock(l);
+	KASSERT(l->l_stat == LSIDL);
+	if ((flags & LWP_SUSPENDED) != 0) {
+		/* It'll suspend itself in lwp_userret(). */
+		l->l_flag |= LW_WSUSPEND;
+	}
+	if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
+		KASSERT(l->l_wchan == NULL);
+	    	l->l_stat = LSSTOP;
+		p->p_nrlwps--;
+		lwp_unlock(l);
+	} else {
+		l->l_cpu = curcpu();
+		setrunnable(l);
+		/* LWP now unlocked */
+	}
+	mutex_exit(p->p_lock);
+}
+
+/*
  * Called by MD code when a new LWP begins execution.  Must be called
  * with the previous LWP locked (so at splsched), or if there is no
  * previous LWP, at splsched.
@@ -1345,13 +1372,10 @@ lwp_migrate(lwp_t *l, struct cpu_info *t
 	case LSRUN:
 		l->l_target_cpu = tci;
 		break;
-	case LSIDL:
-		l->l_cpu = tci;
-		lwp_unlock_to(l, tspc->spc_mutex);
-		return;
 	case LSSLEEP:
 		l->l_cpu = tci;
 		break;
+	case LSIDL:
 	case LSSTOP:
 	case LSSUSPENDED:
 		l->l_cpu = tci;
@@ -1363,8 +1387,8 @@ lwp_migrate(lwp_t *l, struct cpu_info *t
 	case LSONPROC:
 		l->l_target_cpu = tci;
 		spc_lock(l->l_cpu);
-		cpu_need_resched(l->l_cpu, RESCHED_KPREEMPT);
-		spc_unlock(l->l_cpu);
+		sched_resched_cpu(l->l_cpu, PRI_USER_RT, true);
+		/* spc now unlocked */
 		break;
 	}
 	lwp_unlock(l);

Index: src/sys/kern/kern_runq.c
diff -u src/sys/kern/kern_runq.c:1.47 src/sys/kern/kern_runq.c:1.48
--- src/sys/kern/kern_runq.c:1.47	Thu Jun  1 02:45:13 2017
+++ src/sys/kern/kern_runq.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,33 @@
-/*	$NetBSD: kern_runq.c,v 1.47 2017/06/01 02:45:13 chs Exp $	*/
+/*	$NetBSD: kern_runq.c,v 1.48 2019/11/23 19:42:52 ad Exp $	*/
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
@@ -27,7 +56,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.47 2017/06/01 02:45:13 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.48 2019/11/23 19:42:52 ad Exp $");
 
 #include "opt_dtrace.h"
 
@@ -101,7 +130,6 @@ static void	sched_balance(void *);
 /*
  * Preemption control.
  */
-int		sched_upreempt_pri = 0;
 #ifdef __HAVE_PREEMPTION
 # ifdef DEBUG
 int		sched_kpreempt_pri = 0;
@@ -209,27 +237,24 @@ sched_getrq(runqueue_t *ci_rq, const pri
 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
 }
 
+/*
+ * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
+ * l_cpu.
+ */
 void
-sched_enqueue(struct lwp *l, bool swtch)
+sched_enqueue(struct lwp *l)
 {
 	runqueue_t *ci_rq;
 	struct schedstate_percpu *spc;
 	TAILQ_HEAD(, lwp) *q_head;
 	const pri_t eprio = lwp_eprio(l);
 	struct cpu_info *ci;
-	int type;
 
 	ci = l->l_cpu;
 	spc = &ci->ci_schedstate;
 	ci_rq = spc->spc_sched_info;
 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
 
-	/* Update the last run time on switch */
-	if (__predict_true(swtch == true))
-		l->l_rticksum += (hardclock_ticks - l->l_rticks);
-	else if (l->l_rticks == 0)
-		l->l_rticks = hardclock_ticks;
-
 	/* Enqueue the thread */
 	q_head = sched_getrq(ci_rq, eprio);
 	if (TAILQ_EMPTY(q_head)) {
@@ -242,7 +267,12 @@ sched_enqueue(struct lwp *l, bool swtch)
 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
 		ci_rq->r_bitmap[i] |= q;
 	}
-	TAILQ_INSERT_TAIL(q_head, l, l_runq);
+	/* Preempted SCHED_RR and SCHED_FIFO LWPs go to the queue head. */
+	if (l->l_class != SCHED_OTHER && (l->l_pflag & LP_PREEMPTING) != 0) {
+		TAILQ_INSERT_HEAD(q_head, l, l_runq);
+	} else {
+		TAILQ_INSERT_TAIL(q_head, l, l_runq);
+	}
 	ci_rq->r_count++;
 	if ((l->l_pflag & LP_BOUND) == 0)
 		ci_rq->r_mcount++;
@@ -255,23 +285,12 @@ sched_enqueue(struct lwp *l, bool swtch)
 		spc->spc_maxpriority = eprio;
 
 	sched_newts(l);
-
-	/*
-	 * Wake the chosen CPU or cause a preemption if the newly
-	 * enqueued thread has higher priority.  Don't cause a 
-	 * preemption if the thread is yielding (swtch).
-	 */
-	if (!swtch && eprio > spc->spc_curpriority) {
-		if (eprio >= sched_kpreempt_pri)
-			type = RESCHED_KPREEMPT;
-		else if (eprio >= sched_upreempt_pri)
-			type = RESCHED_IMMED;
-		else
-			type = RESCHED_LAZY;
-		cpu_need_resched(ci, type);
-	}
 }
 
+/*
+ * Remove and LWP from the run queue it's on.  The LWP must be in state
+ * LSRUN.
+ */
 void
 sched_dequeue(struct lwp *l)
 {
@@ -329,6 +348,121 @@ sched_dequeue(struct lwp *l)
 }
 
 /*
+ * Cause a preemption on the given CPU, if the priority "pri" is higher
+ * priority than the running LWP.  If "unlock" is specified, and ideally it
+ * will be for concurrency reasons, spc_mutex will be dropped before return.
+ */
+void
+sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
+{
+	struct schedstate_percpu *spc;
+	u_int o, n, f;
+	lwp_t *l;
+
+	spc = &ci->ci_schedstate;
+
+	KASSERT(mutex_owned(spc->spc_mutex));
+
+	/*
+	 * If the priority level we're evaluating wouldn't cause a new LWP
+	 * to be run on the CPU, then we have nothing to do.
+	 */
+	if (pri <= spc->spc_curpriority) {
+		if (__predict_true(unlock)) {
+			spc_unlock(ci);
+		}
+		return;
+	}
+
+	/*
+	 * Figure out what kind of preemption we should do.
+	 */	
+	l = ci->ci_data.cpu_onproc;
+	if ((l->l_flag & LW_IDLE) != 0) {
+		f = RESCHED_IDLE | RESCHED_UPREEMPT;
+	} else if ((l->l_pflag & LP_INTR) != 0) {
+		/* We can't currently preempt interrupt LWPs - should do. */
+		if (__predict_true(unlock)) {
+			spc_unlock(ci);
+		}
+		return;
+	} else if (pri >= sched_kpreempt_pri) {
+#ifdef __HAVE_PREEMPTION
+		f = RESCHED_KPREEMPT;
+#else
+		/* Leave door open for test: set kpreempt_pri with sysctl. */
+		f = RESCHED_UPREEMPT;
+#endif
+		/*
+		 * l_dopreempt must be set with the CPU locked to sync with
+		 * mi_switch().  It must also be set with an atomic to sync
+		 * with kpreempt().
+		 */
+		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
+	} else {
+		f = RESCHED_UPREEMPT;
+	}
+	if (ci != curcpu()) {
+		f |= RESCHED_REMOTE;
+	}
+
+	/*
+	 * Things start as soon as we touch ci_want_resched: x86 for example
+	 * has an instruction that monitors the memory cell it's in.  We
+	 * want to drop the schedstate lock in advance, otherwise the remote
+	 * CPU can awaken and immediately block on the lock.
+	 */
+	if (__predict_true(unlock)) {
+		spc_unlock(ci);
+	}
+
+	/*
+	 * The caller will always have a second scheduler lock held: either
+	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
+	 * keeps preemption disabled, which among other things ensures all
+	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
+	 */
+ 	KASSERT(kpreempt_disabled());
+
+	for (o = 0;; o = n) {
+		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
+		if (__predict_true(o == n)) {
+			/*
+			 * We're the first.  If we're in process context on
+			 * the same CPU, we can avoid the visit to trap().
+			 */
+			if (l != curlwp || cpu_intr_p()) {
+				cpu_need_resched(ci, l, f);
+			}
+			break;
+		}
+		if (__predict_true(
+		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
+		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
+			/* Already in progress, nothing to do. */
+			break;
+		}
+	}
+}
+
+/*
+ * Cause a preemption on the given CPU, if the priority of LWP "l" in state
+ * LSRUN, is higher priority than the running LWP.  If "unlock" is
+ * specified, and ideally it will be for concurrency reasons, spc_mutex will
+ * be dropped before return.
+ */
+void
+sched_resched_lwp(struct lwp *l, bool unlock)
+{
+	struct cpu_info *ci = l->l_cpu;
+
+	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
+	KASSERT(l->l_stat == LSRUN);
+
+	sched_resched_cpu(ci, lwp_eprio(l), unlock);
+}
+
+/*
  * Migration and balancing.
  */
 
@@ -385,6 +519,7 @@ sched_takecpu(struct lwp *l)
 
 	spc = &ci->ci_schedstate;
 	ci_rq = spc->spc_sched_info;
+	eprio = lwp_eprio(l);
 
 	/* Make sure that thread is in appropriate processor-set */
 	if (__predict_true(spc->spc_psid == l->l_psid)) {
@@ -393,15 +528,22 @@ sched_takecpu(struct lwp *l)
 			ci_rq->r_ev_stay.ev_count++;
 			return ci;
 		}
+		/*
+		 * New LWPs must start on the same CPU as the parent (l_cpu
+		 * was inherited when the LWP was created).  Doing otherwise
+		 * is bad for performance and repeatability, and agitates
+		 * buggy programs.  Also, we want the child to have a good
+		 * chance of reusing the VM context from the parent.
+		 */
+		if (l->l_stat == LSIDL) {
+			ci_rq->r_ev_stay.ev_count++;
+			return ci;
+		}		 
 		/* Stay if thread is cache-hot */
-		eprio = lwp_eprio(l);
-		if (__predict_true(l->l_stat != LSIDL) &&
-		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
+		if (lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
 			ci_rq->r_ev_stay.ev_count++;
 			return ci;
 		}
-	} else {
-		eprio = lwp_eprio(l);
 	}
 
 	/* Run on current CPU if priority of thread is higher */
@@ -507,7 +649,7 @@ sched_catchlwp(struct cpu_info *ci)
 		l->l_cpu = curci;
 		ci_rq->r_ev_pull.ev_count++;
 		lwp_unlock_to(l, curspc->spc_mutex);
-		sched_enqueue(l, false);
+		sched_enqueue(l);
 		return l;
 	}
 	spc_unlock(ci);
@@ -569,7 +711,7 @@ sched_idle(void)
 {
 	struct cpu_info *ci = curcpu(), *tci = NULL;
 	struct schedstate_percpu *spc, *tspc;
-	runqueue_t *ci_rq;
+	runqueue_t *ci_rq, *tci_rq;
 	bool dlock = false;
 
 	/* Check if there is a migrating LWP */
@@ -631,8 +773,11 @@ sched_idle(void)
 		sched_dequeue(l);
 		l->l_cpu = tci;
 		lwp_setlock(l, tspc->spc_mutex);
-		sched_enqueue(l, false);
-		break;
+		sched_enqueue(l);
+		sched_resched_lwp(l, true);
+		/* tci now unlocked */
+		spc_unlock(ci);
+		goto no_migration;
 	}
 	if (dlock == true) {
 		KASSERT(tci != NULL);
@@ -653,9 +798,13 @@ no_migration:
 	tspc = &tci->ci_schedstate;
 	if (ci == tci || spc->spc_psid != tspc->spc_psid)
 		return;
-	spc_dlock(ci, tci);
-	(void)sched_catchlwp(tci);
-	spc_unlock(ci);
+	/* Don't hit the locks unless there's something to do. */
+	tci_rq = tci->ci_schedstate.spc_sched_info;
+	if (tci_rq->r_mcount >= min_catch) {
+		spc_dlock(ci, tci);
+		(void)sched_catchlwp(tci);
+		spc_unlock(ci);
+	}
 }
 
 #else
@@ -746,6 +895,10 @@ sched_nextlwp(void)
 	runqueue_t *ci_rq;
 	struct lwp *l;
 
+	/* Update the last run time on switch */
+	l = curlwp;
+	l->l_rticksum += (hardclock_ticks - l->l_rticks);
+
 	/* Return to idle LWP if there is a migrating thread */
 	spc = &ci->ci_schedstate;
 	if (__predict_false(spc->spc_migrating != NULL))
@@ -873,12 +1026,6 @@ SYSCTL_SETUP(sysctl_sched_setup, "sysctl
 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
 		NULL, 0, &sched_kpreempt_pri, 0,
 		CTL_CREATE, CTL_EOL);
-	sysctl_createv(clog, 0, &node, NULL,
-		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
-		CTLTYPE_INT, "upreempt_pri",
-		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
-		NULL, 0, &sched_upreempt_pri, 0,
-		CTL_CREATE, CTL_EOL);
 }
 
 /*

Index: src/sys/kern/kern_sleepq.c
diff -u src/sys/kern/kern_sleepq.c:1.52 src/sys/kern/kern_sleepq.c:1.53
--- src/sys/kern/kern_sleepq.c:1.52	Thu Nov 21 18:56:55 2019
+++ src/sys/kern/kern_sleepq.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_sleepq.c,v 1.52 2019/11/21 18:56:55 ad Exp $	*/
+/*	$NetBSD: kern_sleepq.c,v 1.53 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
  * Copyright (c) 2006, 2007, 2008, 2009, 2019 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.52 2019/11/21 18:56:55 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.53 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -156,8 +156,9 @@ sleepq_remove(sleepq_t *sq, lwp_t *l)
 	sched_setrunnable(l);
 	l->l_stat = LSRUN;
 	l->l_slptime = 0;
-	sched_enqueue(l, false);
-	spc_unlock(ci);
+	sched_enqueue(l);
+	sched_resched_lwp(l, true);
+	/* LWP & SPC now unlocked, but we still hold sleep queue lock. */
 }
 
 /*

Index: src/sys/kern/kern_softint.c
diff -u src/sys/kern/kern_softint.c:1.49 src/sys/kern/kern_softint.c:1.50
--- src/sys/kern/kern_softint.c:1.49	Thu Nov 21 17:50:49 2019
+++ src/sys/kern/kern_softint.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_softint.c,v 1.49 2019/11/21 17:50:49 ad Exp $	*/
+/*	$NetBSD: kern_softint.c,v 1.50 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
  * Copyright (c) 2007, 2008, 2019 The NetBSD Foundation, Inc.
@@ -170,7 +170,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_softint.c,v 1.49 2019/11/21 17:50:49 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_softint.c,v 1.50 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/proc.h>
@@ -661,19 +661,20 @@ schednetisr(int isr)
 void
 softint_init_md(lwp_t *l, u_int level, uintptr_t *machdep)
 {
+	struct proc *p;
 	softint_t *si;
 
 	*machdep = (1 << level);
 	si = l->l_private;
+	p = l->l_proc;
 
-	lwp_lock(l);
-	lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
+	mutex_enter(p->p_lock);
 	lwp_lock(l);
 	/* Cheat and make the KASSERT in softint_thread() happy. */
 	si->si_active = 1;
-	l->l_stat = LSRUN;
-	sched_enqueue(l, false);
-	lwp_unlock(l);
+	setrunnable(l);
+	/* LWP now unlocked */
+	mutex_exit(p->p_lock);
 }
 
 /*
@@ -692,10 +693,10 @@ softint_trigger(uintptr_t machdep)
 	ci = l->l_cpu;
 	ci->ci_data.cpu_softints |= machdep;
 	if (l == ci->ci_data.cpu_idlelwp) {
-		cpu_need_resched(ci, 0);
+		atomic_or_uint(&ci->ci_want_resched, RESCHED_UPREEMPT);
 	} else {
 		/* MI equivalent of aston() */
-		cpu_signotify(l);
+		lwp_need_userret(l);
 	}
 }
 

Index: src/sys/kern/kern_synch.c
diff -u src/sys/kern/kern_synch.c:1.325 src/sys/kern/kern_synch.c:1.326
--- src/sys/kern/kern_synch.c:1.325	Thu Nov 21 20:51:05 2019
+++ src/sys/kern/kern_synch.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_synch.c,v 1.325 2019/11/21 20:51:05 ad Exp $	*/
+/*	$NetBSD: kern_synch.c,v 1.326 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.325 2019/11/21 20:51:05 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.326 2019/11/23 19:42:52 ad Exp $");
 
 #include "opt_kstack.h"
 #include "opt_dtrace.h"
@@ -104,7 +104,6 @@ dtrace_vtime_switch_func_t      dtrace_v
 static void	sched_unsleep(struct lwp *, bool);
 static void	sched_changepri(struct lwp *, pri_t);
 static void	sched_lendpri(struct lwp *, pri_t);
-static void	resched_cpu(struct lwp *);
 
 syncobj_t sleep_syncobj = {
 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
@@ -303,10 +302,10 @@ preempt(void)
  *
  * Character addresses for lockstat only.
  */
-static char	in_critical_section;
+static char	kpreempt_is_disabled;
 static char	kernel_lock_held;
-static char	is_softint;
-static char	cpu_kpreempt_enter_fail;
+static char	is_softint_lwp;
+static char	spl_is_raised;
 
 bool
 kpreempt(uintptr_t where)
@@ -338,13 +337,13 @@ kpreempt(uintptr_t where)
 			if ((dop & DOPREEMPT_COUNTED) == 0) {
 				kpreempt_ev_crit.ev_count++;
 			}
-			failed = (uintptr_t)&in_critical_section;
+			failed = (uintptr_t)&kpreempt_is_disabled;
 			break;
 		}
 		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
 			/* Can't preempt soft interrupts yet. */
 			atomic_swap_uint(&l->l_dopreempt, 0);
-			failed = (uintptr_t)&is_softint;
+			failed = (uintptr_t)&is_softint_lwp;
 			break;
 		}
 		s = splsched();
@@ -365,7 +364,7 @@ kpreempt(uintptr_t where)
 			 * interrupt to retry later.
 			 */
 			splx(s);
-			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
+			failed = (uintptr_t)&spl_is_raised;
 			break;
 		}
 		/* Do it! */
@@ -373,6 +372,7 @@ kpreempt(uintptr_t where)
 			kpreempt_ev_immed.ev_count++;
 		}
 		lwp_lock(l);
+		l->l_pflag |= LP_PREEMPTING;
 		mi_switch(l);
 		l->l_nopreempt++;
 		splx(s);
@@ -555,13 +555,6 @@ mi_switch(lwp_t *l)
 	}
 #endif	/* !__HAVE_FAST_SOFTINTS */
 
-	/* Count time spent in current system call */
-	if (!returning) {
-		SYSCALL_TIME_SLEEP(l);
-
-		updatertime(l, &bt);
-	}
-
 	/* Lock the runqueue */
 	KASSERT(l->l_stat != LSRUN);
 	mutex_spin_enter(spc->spc_mutex);
@@ -574,7 +567,7 @@ mi_switch(lwp_t *l)
 		if ((l->l_flag & LW_IDLE) == 0) {
 			l->l_stat = LSRUN;
 			lwp_setlock(l, spc->spc_mutex);
-			sched_enqueue(l, true);
+			sched_enqueue(l);
 			/*
 			 * Handle migration.  Note that "migrating LWP" may
 			 * be reset here, if interrupt/preemption happens
@@ -596,6 +589,11 @@ mi_switch(lwp_t *l)
 
 	/* Items that must be updated with the CPU locked. */
 	if (!returning) {
+		/* Count time spent in current system call */
+		SYSCALL_TIME_SLEEP(l);
+
+		updatertime(l, &bt);
+
 		/* Update the new LWP's start time. */
 		newl->l_stime = bt;
 
@@ -656,9 +654,8 @@ mi_switch(lwp_t *l)
 		l->l_ncsw++;
 		if ((l->l_pflag & LP_PREEMPTING) != 0)
 			l->l_nivcsw++;
-		l->l_pflag &= ~LP_PREEMPTING;
 		KASSERT((l->l_pflag & LP_RUNNING) != 0);
-		l->l_pflag &= ~LP_RUNNING;
+		l->l_pflag &= ~(LP_RUNNING | LP_PREEMPTING);
 
 		/*
 		 * Increase the count of spin-mutexes before the release
@@ -882,6 +879,7 @@ setrunnable(struct lwp *l)
 {
 	struct proc *p = l->l_proc;
 	struct cpu_info *ci;
+	kmutex_t *oldlock;
 
 	KASSERT((l->l_flag & LW_IDLE) == 0);
 	KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
@@ -900,6 +898,7 @@ setrunnable(struct lwp *l)
 		p->p_nrlwps++;
 		break;
 	case LSSUSPENDED:
+		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
 		l->l_flag &= ~LW_WSUSPEND;
 		p->p_nrlwps++;
 		cv_broadcast(&p->p_lwpcv);
@@ -907,6 +906,9 @@ setrunnable(struct lwp *l)
 	case LSSLEEP:
 		KASSERT(l->l_wchan != NULL);
 		break;
+	case LSIDL:
+		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
+		break;
 	default:
 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
 	}
@@ -939,14 +941,14 @@ setrunnable(struct lwp *l)
 	ci = sched_takecpu(l);
 	l->l_cpu = ci;
 	spc_lock(ci);
-	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
+	oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
 	sched_setrunnable(l);
 	l->l_stat = LSRUN;
 	l->l_slptime = 0;
-
-	sched_enqueue(l, false);
-	resched_cpu(l);
-	lwp_unlock(l);
+	sched_enqueue(l);
+	sched_resched_lwp(l, true);
+	/* SPC & LWP now unlocked. */
+	mutex_spin_exit(oldlock);
 }
 
 /*
@@ -1012,13 +1014,19 @@ suspendsched(void)
 
 	/*
 	 * Kick all CPUs to make them preempt any LWPs running in user mode. 
-	 * They'll trap into the kernel and suspend themselves in userret().
+	 * They'll trap into the kernel and suspend themselves in userret(). 
+	 *
+	 * Unusually, we don't hold any other scheduler object locked, which
+	 * would keep preemption off for sched_resched_cpu(), so disable it
+	 * explicitly.
 	 */
+	kpreempt_disable();
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		spc_lock(ci);
-		cpu_need_resched(ci, RESCHED_IMMED);
-		spc_unlock(ci);
+		sched_resched_cpu(ci, PRI_KERNEL, true);
+		/* spc now unlocked */
 	}
+	kpreempt_enable();
 }
 
 /*
@@ -1037,49 +1045,64 @@ sched_unsleep(struct lwp *l, bool cleanu
 }
 
 static void
-resched_cpu(struct lwp *l)
-{
-	struct cpu_info *ci = l->l_cpu;
-
-	KASSERT(lwp_locked(l, NULL));
-	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
-		cpu_need_resched(ci, 0);
-}
-
-static void
 sched_changepri(struct lwp *l, pri_t pri)
 {
+	struct schedstate_percpu *spc;
+	struct cpu_info *ci;
 
 	KASSERT(lwp_locked(l, NULL));
 
+	ci = l->l_cpu;
+	spc = &ci->ci_schedstate;
+
 	if (l->l_stat == LSRUN) {
-		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
+		KASSERT(lwp_locked(l, spc->spc_mutex));
 		sched_dequeue(l);
 		l->l_priority = pri;
-		sched_enqueue(l, false);
+		sched_enqueue(l);
+		sched_resched_lwp(l, false);
+	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
+		/* On priority drop, only evict realtime LWPs. */
+		KASSERT(lwp_locked(l, spc->spc_lwplock));
+		l->l_priority = pri;
+		spc_lock(ci);
+		sched_resched_cpu(ci, spc->spc_maxpriority, true);
+		/* spc now unlocked */
 	} else {
 		l->l_priority = pri;
 	}
-	resched_cpu(l);
 }
 
 static void
 sched_lendpri(struct lwp *l, pri_t pri)
 {
+	struct schedstate_percpu *spc;
+	struct cpu_info *ci;
 
 	KASSERT(lwp_locked(l, NULL));
 
+	ci = l->l_cpu;
+	spc = &ci->ci_schedstate;
+
 	if (l->l_stat == LSRUN) {
-		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
+		KASSERT(lwp_locked(l, spc->spc_mutex));
 		sched_dequeue(l);
 		l->l_inheritedprio = pri;
 		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
-		sched_enqueue(l, false);
+		sched_enqueue(l);
+		sched_resched_lwp(l, false);
+	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
+		/* On priority drop, only evict realtime LWPs. */
+		KASSERT(lwp_locked(l, spc->spc_lwplock));
+		l->l_inheritedprio = pri;
+		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
+		spc_lock(ci);
+		sched_resched_cpu(ci, spc->spc_maxpriority, true);
+		/* spc now unlocked */
 	} else {
 		l->l_inheritedprio = pri;
 		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
 	}
-	resched_cpu(l);
 }
 
 struct lwp *

Index: src/sys/kern/sched_4bsd.c
diff -u src/sys/kern/sched_4bsd.c:1.35 src/sys/kern/sched_4bsd.c:1.36
--- src/sys/kern/sched_4bsd.c:1.35	Mon Sep  3 16:29:35 2018
+++ src/sys/kern/sched_4bsd.c	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,8 @@
-/*	$NetBSD: sched_4bsd.c,v 1.35 2018/09/03 16:29:35 riastradh Exp $	*/
+/*	$NetBSD: sched_4bsd.c,v 1.36 2019/11/23 19:42:52 ad Exp $	*/
 
 /*
- * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019
+ *     The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -68,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.35 2018/09/03 16:29:35 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.36 2019/11/23 19:42:52 ad Exp $");
 
 #include "opt_ddb.h"
 #include "opt_lockdebug.h"
@@ -96,9 +97,6 @@ static int rrticks __read_mostly;
 /*
  * Force switch among equal priority processes every 100ms.
  * Called from hardclock every hz/10 == rrticks hardclock ticks.
- *
- * There's no need to lock anywhere in this routine, as it's
- * CPU-local and runs at IPL_SCHED (called from clock interrupt).
  */
 /* ARGSUSED */
 void
@@ -110,20 +108,27 @@ sched_tick(struct cpu_info *ci)
 	spc->spc_ticks = rrticks;
 
 	if (CURCPU_IDLE_P()) {
-		cpu_need_resched(ci, 0);
+		atomic_or_uint(&ci->ci_want_resched,
+		    RESCHED_IDLE | RESCHED_UPREEMPT);
 		return;
 	}
 	l = ci->ci_data.cpu_onproc;
 	if (l == NULL) {
 		return;
 	}
+	/*
+	 * Can only be spc_lwplock or a turnstile lock at this point
+	 * (if we interrupted priority inheritance trylock dance).
+	 */
+	KASSERT(l->l_mutex != spc->spc_mutex);
 	switch (l->l_class) {
 	case SCHED_FIFO:
 		/* No timeslicing for FIFO jobs. */
 		break;
 	case SCHED_RR:
 		/* Force it into mi_switch() to look for other jobs to run. */
-		cpu_need_resched(ci, RESCHED_KPREEMPT);
+		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
+		cpu_need_resched(ci, l, RESCHED_KPREEMPT);
 		break;
 	default:
 		if (spc->spc_flags & SPCF_SHOULDYIELD) {
@@ -132,7 +137,8 @@ sched_tick(struct cpu_info *ci)
 			 * due to buggy or inefficient code.  Force a
 			 * kernel preemption.
 			 */
-			cpu_need_resched(ci, RESCHED_KPREEMPT);
+			atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
+			cpu_need_resched(ci, l, RESCHED_KPREEMPT);
 		} else if (spc->spc_flags & SPCF_SEENRR) {
 			/*
 			 * The process has already been through a roundrobin
@@ -140,7 +146,7 @@ sched_tick(struct cpu_info *ci)
 			 * Indicate that the process should yield.
 			 */
 			spc->spc_flags |= SPCF_SHOULDYIELD;
-			cpu_need_resched(ci, 0);
+			cpu_need_resched(ci, l, RESCHED_UPREEMPT);
 		} else {
 			spc->spc_flags |= SPCF_SEENRR;
 		}

Index: src/sys/kern/sys_aio.c
diff -u src/sys/kern/sys_aio.c:1.44 src/sys/kern/sys_aio.c:1.45
--- src/sys/kern/sys_aio.c:1.44	Sun Feb 10 17:13:33 2019
+++ src/sys/kern/sys_aio.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: sys_aio.c,v 1.44 2019/02/10 17:13:33 christos Exp $	*/
+/*	$NetBSD: sys_aio.c,v 1.45 2019/11/23 19:42:52 ad Exp $	*/
 
 /*
  * Copyright (c) 2007 Mindaugas Rasiukevicius <rmind at NetBSD org>
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_aio.c,v 1.44 2019/02/10 17:13:33 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_aio.c,v 1.45 2019/11/23 19:42:52 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ddb.h"
@@ -229,10 +229,9 @@ aio_procinit(struct proc *p)
 	/* Complete the initialization of thread, and run it */
 	aio->aio_worker = l;
 	lwp_lock(l);
-	l->l_stat = LSRUN;
-	l->l_priority = MAXPRI_USER;
-	sched_enqueue(l, false);
-	lwp_unlock(l);
+	lwp_changepri(l, MAXPRI_USER);
+	setrunnable(l);
+	/* LWP now unlocked */
 	mutex_exit(p->p_lock);
 
 	return 0;

Index: src/sys/kern/sys_lwp.c
diff -u src/sys/kern/sys_lwp.c:1.70 src/sys/kern/sys_lwp.c:1.71
--- src/sys/kern/sys_lwp.c:1.70	Mon Sep 30 21:13:33 2019
+++ src/sys/kern/sys_lwp.c	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,7 @@
-/*	$NetBSD: sys_lwp.c,v 1.70 2019/09/30 21:13:33 kamil Exp $	*/
+/*	$NetBSD: sys_lwp.c,v 1.71 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
- * Copyright (c) 2001, 2006, 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2001, 2006, 2007, 2008, 2019 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_lwp.c,v 1.70 2019/09/30 21:13:33 kamil Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_lwp.c,v 1.71 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -102,7 +102,6 @@ do_lwp_create(lwp_t *l, void *arg, u_lon
 {
 	struct proc *p = l->l_proc;
 	struct lwp *l2;
-	struct schedstate_percpu *spc;
 	vaddr_t uaddr;
 	int error;
 
@@ -120,35 +119,7 @@ do_lwp_create(lwp_t *l, void *arg, u_lon
 	}
 
 	*new_lwp = l2->l_lid;
-
-	/*
-	 * Set the new LWP running, unless the caller has requested that
-	 * it be created in suspended state.  If the process is stopping,
-	 * then the LWP is created stopped.
-	 */
-	mutex_enter(p->p_lock);
-	lwp_lock(l2);
-	spc = &l2->l_cpu->ci_schedstate;
-	if ((flags & LWP_SUSPENDED) == 0 &&
-	    (l->l_flag & (LW_WREBOOT | LW_WSUSPEND | LW_WEXIT)) == 0) {
-	    	if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
-			KASSERT(l2->l_wchan == NULL);
-	    		l2->l_stat = LSSTOP;
-			p->p_nrlwps--;
-			lwp_unlock_to(l2, spc->spc_lwplock);
-		} else {
-			KASSERT(lwp_locked(l2, spc->spc_mutex));
-			l2->l_stat = LSRUN;
-			sched_enqueue(l2, false);
-			lwp_unlock(l2);
-		}
-	} else {
-		l2->l_stat = LSSUSPENDED;
-		p->p_nrlwps--;
-		lwp_unlock_to(l2, spc->spc_lwplock);
-	}
-	mutex_exit(p->p_lock);
-
+	lwp_start(l2, flags);
 	return 0;
 }
 

Index: src/sys/rump/librump/rumpkern/scheduler.c
diff -u src/sys/rump/librump/rumpkern/scheduler.c:1.44 src/sys/rump/librump/rumpkern/scheduler.c:1.45
--- src/sys/rump/librump/rumpkern/scheduler.c:1.44	Fri Feb 19 18:38:37 2016
+++ src/sys/rump/librump/rumpkern/scheduler.c	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*      $NetBSD: scheduler.c,v 1.44 2016/02/19 18:38:37 pooka Exp $	*/
+/*      $NetBSD: scheduler.c,v 1.45 2019/11/23 19:42:52 ad Exp $	*/
 
 /*
  * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.44 2016/02/19 18:38:37 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.45 2019/11/23 19:42:52 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/atomic.h>
@@ -572,15 +572,32 @@ sched_nice(struct proc *p, int level)
 }
 
 void
-sched_enqueue(struct lwp *l, bool swtch)
+setrunnable(struct lwp *l)
+{
+
+	sched_enqueue(l);
+}
+
+void
+sched_enqueue(struct lwp *l)
 {
 
-	if (swtch)
-		panic("sched_enqueue with switcheroo");
 	rump_thread_allow(l);
 }
 
 void
+sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
+{
+
+}
+
+void
+sched_resched_lwp(struct lwp *l, bool unlock)
+{
+
+}
+
+void
 sched_dequeue(struct lwp *l)
 {
 

Index: src/sys/sys/cpu.h
diff -u src/sys/sys/cpu.h:1.43 src/sys/sys/cpu.h:1.44
--- src/sys/sys/cpu.h:1.43	Thu Apr 19 21:19:07 2018
+++ src/sys/sys/cpu.h	Sat Nov 23 19:42:52 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.43 2018/04/19 21:19:07 christos Exp $	*/
+/*	$NetBSD: cpu.h,v 1.44 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
  * Copyright (c) 2007 YAMAMOTO Takashi,
@@ -50,17 +50,8 @@ void cpu_idle(void);
 #endif
 #endif
 
-/*
- * cpu_need_resched() must always be called with the target CPU
- * locked (via spc_lock() or another route), unless called locally.
- * If called locally, the caller need only be at IPL_SCHED.
- */
 #ifndef cpu_need_resched
-void cpu_need_resched(struct cpu_info *, int);
-#endif
-
-#ifndef cpu_did_resched
-#define	cpu_did_resched(l)	/* nothing */
+void cpu_need_resched(struct cpu_info *, struct lwp *, int);
 #endif
 
 /*
@@ -140,9 +131,13 @@ int cpu_ucode_md_open(firmware_handle_t 
 #endif
 #endif	/* !_LOCORE */
 
-/* flags for cpu_need_resched */
-#define	RESCHED_LAZY		0x01	/* request a ctx switch */
-#define	RESCHED_IMMED		0x02	/* request an immediate ctx switch */
-#define	RESCHED_KPREEMPT	0x04	/* request in-kernel preemption */
+/*
+ * Flags for cpu_need_resched.  RESCHED_KERNEL must be greater than
+ * RESCHED_USER; see sched_resched_cpu().
+ */
+#define	RESCHED_REMOTE		0x01	/* request is for a remote CPU */
+#define	RESCHED_IDLE		0x02	/* idle LWP observed */
+#define	RESCHED_UPREEMPT	0x04	/* immediate user ctx switch */
+#define	RESCHED_KPREEMPT	0x08	/* immediate kernel ctx switch */
 
 #endif	/* !_SYS_CPU_H_ */

Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.189 src/sys/sys/lwp.h:1.190
--- src/sys/sys/lwp.h:1.189	Thu Nov 21 19:47:21 2019
+++ src/sys/sys/lwp.h	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,7 @@
-/*	$NetBSD: lwp.h,v 1.189 2019/11/21 19:47:21 ad Exp $	*/
+/*	$NetBSD: lwp.h,v 1.190 2019/11/23 19:42:52 ad Exp $	*/
 
 /*
- * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010
+ * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019
  *    The NetBSD Foundation, Inc.
  * All rights reserved.
  *
@@ -344,6 +344,7 @@ void	lwp_exit(lwp_t *);
 void	lwp_exit_switchaway(lwp_t *) __dead;
 int	lwp_suspend(lwp_t *, lwp_t *);
 int	lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *);
+void	lwp_start(lwp_t *, int);
 void	lwp_update_creds(lwp_t *);
 void	lwp_migrate(lwp_t *, struct cpu_info *);
 lwp_t *	lwp_find2(pid_t, lwpid_t);

Index: src/sys/sys/sched.h
diff -u src/sys/sys/sched.h:1.76 src/sys/sys/sched.h:1.77
--- src/sys/sys/sched.h:1.76	Sun Jul  3 14:24:59 2016
+++ src/sys/sys/sched.h	Sat Nov 23 19:42:52 2019
@@ -1,7 +1,8 @@
-/*	$NetBSD: sched.h,v 1.76 2016/07/03 14:24:59 christos Exp $	*/
+/*	$NetBSD: sched.h,v 1.77 2019/11/23 19:42:52 ad Exp $	*/
 
 /*-
- * Copyright (c) 1999, 2000, 2001, 2002, 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2007, 2008, 2019
+ *    The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -228,7 +229,9 @@ void		sched_pstats_hook(struct lwp *, in
 /* Runqueue-related functions */
 bool		sched_curcpu_runnable_p(void);
 void		sched_dequeue(struct lwp *);
-void		sched_enqueue(struct lwp *, bool);
+void		sched_enqueue(struct lwp *);
+void		sched_resched_cpu(struct cpu_info *, pri_t, bool);
+void		sched_resched_lwp(struct lwp *, bool);
 struct lwp *	sched_nextlwp(void);
 void		sched_oncpu(struct lwp *);
 void		sched_newts(struct lwp *);
