On Wed, Nov 04, 2020 at 09:13:22AM -0300, Martin Pieuchot wrote:
> The diff below removes the recursive attribute of the SCHED_LOCK() by
> turning it into an IPL_NONE mutex.  I'm not intending to commit it
> yet as it raises multiple questions, see below.
> 
> This work was started by art@ more than a decade ago and I'm
> willing to finish it, as I believe it's the easiest way to reduce
> the scope of this lock.  Having a global mutex is the first step
> toward per-runqueue and per-sleepqueue mutexes.
> 
> This is also a way to avoid lock ordering problems exposed by the recent
> races in single_thread_set().
> 
> About the diff:
> 
>  The diff below includes an (ugly) refactoring of rw_exit() to avoid a
>  recursion on the SCHED_LOCK().  In this case the lock is used to protect
>  the global sleepqueue and is grabbed in sleep_setup().
> 
>  The same pattern can be observed in single_thread_check().  However, in
>  this case the lock is used to protect different fields, so there's no
>  "recursive access" to the same data structure.
> 
>  assertwaitok() has been moved down in mi_switch() which isn't ideal.
> 
>  It becomes obvious that the per-CPU and per-thread accounting fields
>  updated in mi_switch() won't need a separate mutex as proposed last
>  year and that splitting this global mutex will be enough.
> 
> It's unclear to me if/how WITNESS should be modified to handle this lock
> change.
> 
> This has been tested on sparc64 and amd64, but I'm not convinced that
> exposed all the recursions.  So if you want to give it a go and manage
> to break it, that is more than welcome.
> 
> Comments?  Questions?

I think that's a good direction.  This then allows us to add a few
smaller mutexes and move some users to those.

In the meantime some things have changed a little.  I had to fix one
reject, which was easy because only the context around the hunk had
changed slightly.

I then found one schedlock recursion:

single_thread_check() takes the sched lock and is reached through
sleep_signal_check().  That call happens twice, once with the sched
lock held and once without.  My 'quick' fix is to introduce a locked
variant, sleep_signal_check_locked(), which calls
single_thread_check_locked().  There might be a nicer way to do this
that I haven't found yet, but this diff seems to work fine on my arm64
machine with amdgpu.
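
To make the shape of the problem easier to see outside the kernel, here
is a minimal userland sketch of the locked/unlocked split (a pthread
mutex stands in for the sched mutex; check() and check_locked() are
made-up names mirroring sleep_signal_check() and its _locked variant):

/*
 * Minimal sketch of the locked/unlocked split, assuming a
 * non-recursive lock.  check() is the entry point for callers that
 * do not hold the lock; check_locked() is for callers that do.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int
check_locked(void)
{
	/* Caller already holds 'lock'; do the work without re-locking. */
	return (0);
}

static int
check(void)
{
	int error;

	/* Unlocked entry point: take the lock around the real work. */
	pthread_mutex_lock(&lock);
	error = check_locked();
	pthread_mutex_unlock(&lock);

	return (error);
}

int
main(void)
{
	/* A path that does not hold the lock uses the wrapper... */
	printf("unlocked path: %d\n", check());

	/* ...a path that already holds it calls the _locked variant. */
	pthread_mutex_lock(&lock);
	printf("locked path: %d\n", check_locked());
	pthread_mutex_unlock(&lock);

	return (0);
}

With a recursive mplock the inner lock call would simply nest; with a
plain mutex it would deadlock, hence the split.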

Build times increase a little, which might be because mutexes are not
'fair' the way mplocks are.  If we are looking for 'instant
gratification' we might have to do two steps at once (mplock -> mtx
plus moving the contention onto multiple mtxs).  In any case, I believe
this is a good step to take and I'll have a look at reducing the
contention.
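
For context on the 'fairness' remark: my understanding is that
__mp_lock hands out tickets, so waiters get the lock in FIFO order,
while mtx_enter() just races on the owner word, so a busy CPU can win
repeatedly while another starves.  A rough userland sketch of the two
acquisition loops, purely for illustration (not the kernel code, and
all names here are made up):

/*
 * Ticket lock vs. plain cas-based mutex: why one is "fair" and the
 * other is not.  Illustration only.
 */
#include <stdatomic.h>
#include <stdint.h>

struct ticket_lock {
	atomic_uint next;	/* ticket handed to the next waiter */
	atomic_uint serving;	/* ticket currently allowed to enter */
};

void
ticket_enter(struct ticket_lock *tl)
{
	/* Waiters get the lock in the order they took a ticket (FIFO). */
	unsigned int t = atomic_fetch_add(&tl->next, 1);

	while (atomic_load(&tl->serving) != t)
		;	/* spin */
}

void
ticket_leave(struct ticket_lock *tl)
{
	atomic_fetch_add(&tl->serving, 1);
}

struct simple_mtx {
	atomic_uintptr_t owner;		/* 0 means unlocked */
};

void
mtx_enter_sketch(struct simple_mtx *m, uintptr_t self)
{
	uintptr_t expected = 0;

	/*
	 * Everyone races on the same word; whoever wins the cas gets
	 * the lock, so an unlucky CPU can lose the race repeatedly.
	 */
	while (!atomic_compare_exchange_weak(&m->owner, &expected, self))
		expected = 0;
}

void
mtx_leave_sketch(struct simple_mtx *m)
{
	atomic_store(&m->owner, 0);
}

Whether that fully explains the build-time difference I don't know,
but it matches the 'not fair' behaviour: under contention the mutex
can serve the same CPU back to back.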

Patrick

diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 1e51f71301a..4e4e1fc2a27 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -671,7 +671,7 @@ void
 proc_trampoline_mp(void)
 {
        SCHED_ASSERT_LOCKED();
-       __mp_unlock(&sched_lock);
+       mtx_leave(&sched_lock);
        spl0();
        SCHED_ASSERT_UNLOCKED();
        KERNEL_ASSERT_UNLOCKED();
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 5cc55bb256a..2206ac53321 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -97,9 +97,6 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
        if (mpl == &kernel_lock)
                mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
                    LO_SLEEPABLE | (LO_CLASS_KERNEL_LOCK << LO_CLASSSHIFT);
-       else if (mpl == &sched_lock)
-               mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
-                   LO_RECURSABLE | (LO_CLASS_SCHED_LOCK << LO_CLASSSHIFT);
        WITNESS_INIT(&mpl->mpl_lock_obj, type);
 #endif
 }
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index d79b59748e8..2feeff16943 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -129,36 +129,6 @@ rw_enter_write(struct rwlock *rwl)
        }
 }
 
-void
-rw_exit_read(struct rwlock *rwl)
-{
-       unsigned long owner;
-
-       rw_assert_rdlock(rwl);
-       WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
-
-       membar_exit_before_atomic();
-       owner = rwl->rwl_owner;
-       if (__predict_false((owner & RWLOCK_WAIT) ||
-           rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-               rw_do_exit(rwl, 0);
-}
-
-void
-rw_exit_write(struct rwlock *rwl)
-{
-       unsigned long owner;
-
-       rw_assert_wrlock(rwl);
-       WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
-
-       membar_exit_before_atomic();
-       owner = rwl->rwl_owner;
-       if (__predict_false((owner & RWLOCK_WAIT) ||
-           rw_cas(&rwl->rwl_owner, owner, 0)))
-               rw_do_exit(rwl, RWLOCK_WRLOCK);
-}
-
 #ifdef DIAGNOSTIC
 /*
  * Put the diagnostic functions here to keep the main code free
@@ -313,9 +283,10 @@ retry:
 }
 
 void
-rw_exit(struct rwlock *rwl)
+_rw_exit(struct rwlock *rwl, int locked)
 {
        unsigned long wrlock;
+       unsigned long owner, set;
 
        /* Avoid deadlocks after panic or in DDB */
        if (panicstr || db_active)
@@ -329,15 +300,6 @@ rw_exit(struct rwlock *rwl)
        WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
 
        membar_exit_before_atomic();
-       rw_do_exit(rwl, wrlock);
-}
-
-/* membar_exit_before_atomic() has to precede call of this function. */
-void
-rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
-{
-       unsigned long owner, set;
-
        do {
                owner = rwl->rwl_owner;
                if (wrlock)
@@ -348,7 +310,13 @@ rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
        } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
        if (owner & RWLOCK_WAIT)
-               wakeup(rwl);
+               wakeup_n(rwl, -1, locked);
+}
+
+void
+rw_exit(struct rwlock *rwl)
+{
+       _rw_exit(rwl, 0);
 }
 
 int
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 5510108a7d5..7589b698d79 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -339,7 +339,7 @@ again:
                         * This is kind of like a stupid idle loop.
                         */
 #ifdef MULTIPROCESSOR
-                       __mp_unlock(&sched_lock);
+                       mtx_leave(&sched_lock);
 #endif
                        spl0();
                        delay(10);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index f23ffe9c1d6..d867862b18a 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -1969,7 +1969,7 @@ single_thread_check_locked(struct proc *p, int deep, int s)
                                continue;
 
                        if (atomic_dec_int_nv(&pr->ps_singlecount) == 0)
-                               wakeup(&pr->ps_singlecount);
+                               wakeup_n(&pr->ps_singlecount, 1, 1);
 
                        if (pr->ps_flags & PS_SINGLEEXIT) {
                                SCHED_UNLOCK(s);
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index b476a6b4253..73438e8ecdd 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -66,6 +66,7 @@
 #endif
 
 int    sleep_signal_check(void);
+int    sleep_signal_check_locked(int s);
 int    thrsleep(struct proc *, struct sys___thrsleep_args *);
 int    thrsleep_unlock(void *);
 
@@ -308,7 +309,7 @@ rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
 
        sleep_setup(&sls, ident, priority, wmesg, timo);
 
-       rw_exit(rwl);
+       _rw_exit(rwl, 1);
        /* signal may stop the process, release rwlock before that */
        error = sleep_finish(&sls, 1);
 
@@ -410,7 +411,7 @@ sleep_finish(struct sleep_state *sls, int do_sleep)
                 * that case we need to unwind immediately.
                 */
                atomic_setbits_int(&p->p_flag, P_SINTR);
-               if ((error = sleep_signal_check()) != 0) {
+               if ((error = sleep_signal_check_locked(sls->sls_s)) != 0) {
                        p->p_stat = SONPROC;
                        sls->sls_catch = 0;
                        do_sleep = 0;
@@ -473,11 +474,23 @@ sleep_finish(struct sleep_state *sls, int do_sleep)
  */
 int
 sleep_signal_check(void)
+{
+       int err, s;
+
+       SCHED_LOCK(s);
+       err = sleep_signal_check_locked(s);
+       SCHED_UNLOCK(s);
+
+       return err;
+}
+
+int
+sleep_signal_check_locked(int s)
 {
        struct proc *p = curproc;
        int err, sig;
 
-       if ((err = single_thread_check(p, 1)) != 0)
+       if ((err = single_thread_check_locked(p, 1, s)) != 0)
                return err;
        if ((sig = cursig(p)) != 0) {
                if (p->p_p->ps_sigacts->ps_sigintr & sigmask(sig))
@@ -489,11 +502,12 @@ sleep_signal_check(void)
 }
 
 int
-wakeup_proc(struct proc *p, const volatile void *chan)
+_wakeup_proc_locked(struct proc *p, const volatile void *chan)
 {
-       int s, awakened = 0;
+       int awakened = 0;
+
+       SCHED_ASSERT_LOCKED();
 
-       SCHED_LOCK(s);
        if (p->p_wchan != NULL &&
           ((chan == NULL) || (p->p_wchan == chan))) {
                awakened = 1;
@@ -502,6 +516,17 @@ wakeup_proc(struct proc *p, const volatile void *chan)
                else
                        unsleep(p);
        }
+
+       return awakened;
+}
+
+int
+wakeup_proc(struct proc *p, const volatile void *chan)
+{
+       int s, awakened;
+
+       SCHED_LOCK(s);
+       awakened = _wakeup_proc_locked(p, chan);
        SCHED_UNLOCK(s);
 
        return awakened;
@@ -521,7 +546,7 @@ endtsleep(void *arg)
        int s;
 
        SCHED_LOCK(s);
-       if (wakeup_proc(p, NULL))
+       if (_wakeup_proc_locked(p, NULL))
                atomic_setbits_int(&p->p_flag, P_TIMEOUT);
        SCHED_UNLOCK(s);
 }
@@ -545,14 +570,18 @@ unsleep(struct proc *p)
  * Make a number of processes sleeping on the specified identifier runnable.
  */
 void
-wakeup_n(const volatile void *ident, int n)
+wakeup_n(const volatile void *ident, int n, int locked)
 {
        struct slpque *qp;
        struct proc *p;
        struct proc *pnext;
        int s;
 
-       SCHED_LOCK(s);
+       if (!locked)
+               SCHED_LOCK(s);
+       else
+               SCHED_ASSERT_LOCKED();
+
        qp = &slpque[LOOKUP(ident)];
        for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
                pnext = TAILQ_NEXT(p, p_runq);
@@ -569,10 +598,12 @@ wakeup_n(const volatile void *ident, int n)
                if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
                        panic("wakeup: p_stat is %d", (int)p->p_stat);
 #endif
-               if (wakeup_proc(p, ident))
+               if (_wakeup_proc_locked(p, ident))
                        --n;
        }
-       SCHED_UNLOCK(s);
+
+       if (!locked)
+               SCHED_UNLOCK(s);
 }
 
 /*
@@ -581,7 +612,7 @@ wakeup_n(const volatile void *ident, int n)
 void
 wakeup(const volatile void *chan)
 {
-       wakeup_n(chan, -1);
+       wakeup_n(chan, -1, 0);
 }
 
 int
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
index 3b0a0b2cfc7..eea875e47db 100644
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -59,7 +59,7 @@ int   lbolt;                  /* once a second sleep address */
 int    rrticks_init;           /* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
-struct __mp_lock sched_lock;
+struct mutex sched_lock;
 #endif
 
 void                   schedcpu(void *);
@@ -337,10 +337,8 @@ mi_switch(void)
        struct timespec ts;
 #ifdef MULTIPROCESSOR
        int hold_count;
-       int sched_count;
 #endif
 
-       assertwaitok();
        KASSERT(p->p_stat != SONPROC);
 
        SCHED_ASSERT_LOCKED();
@@ -349,7 +347,6 @@ mi_switch(void)
        /*
         * Release the kernel_lock, as we are about to yield the CPU.
         */
-       sched_count = __mp_release_all_but_one(&sched_lock);
        if (_kernel_lock_held())
                hold_count = __mp_release_all(&kernel_lock);
        else
@@ -407,9 +404,10 @@ mi_switch(void)
         * just release it here.
         */
 #ifdef MULTIPROCESSOR
-       __mp_unlock(&sched_lock);
+       mtx_leave(&sched_lock);
 #endif
 
+       assertwaitok();
        SCHED_ASSERT_UNLOCKED();
 
        smr_idle();
@@ -431,7 +429,7 @@ mi_switch(void)
         */
        if (hold_count)
                __mp_acquire_count(&kernel_lock, hold_count);
-       __mp_acquire_count(&sched_lock, sched_count + 1);
+       mtx_enter(&sched_lock);
 #endif
 }
 
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 4dbc097f242..978fd5632cd 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -605,6 +605,7 @@ int single_thread_set(struct proc *, enum single_thread_mode, int);
 int    single_thread_wait(struct process *, int);
 void   single_thread_clear(struct proc *, int);
 int    single_thread_check(struct proc *, int);
+int    single_thread_check_locked(struct proc *, int, int);
 
 void   child_return(void *);
 
diff --git a/sys/sys/rwlock.h b/sys/sys/rwlock.h
index f5a2504f4e8..fbed8783805 100644
--- a/sys/sys/rwlock.h
+++ b/sys/sys/rwlock.h
@@ -40,15 +40,6 @@
  *  atomically test for the lock being 0 (it's not possible to have
  *  owner/read count unset and waiter bits set) and if 0 set the owner to
  *  the proc and RWLOCK_WRLOCK. While not zero, loop into rw_enter_wait.
- *
- * void rw_exit_read(struct rwlock *);
- *  atomically decrement lock by RWLOCK_READ_INCR and unset RWLOCK_WAIT and
- *  RWLOCK_WRWANT remembering the old value of lock and if RWLOCK_WAIT was set,
- *  call rw_exit_waiters with the old contents of the lock.
- *
- * void rw_exit_write(struct rwlock *);
- *  atomically swap the contents of the lock with 0 and if RWLOCK_WAIT was
- *  set, call rw_exit_waiters with the old contents of the lock.
  */
 
 #ifndef _SYS_RWLOCK_H
@@ -149,8 +140,8 @@ void        _rw_init_flags(struct rwlock *, const char *, int,
 
 void   rw_enter_read(struct rwlock *);
 void   rw_enter_write(struct rwlock *);
-void   rw_exit_read(struct rwlock *);
-void   rw_exit_write(struct rwlock *);
+#define rw_exit_read(rwl)      rw_exit(rwl)
+#define rw_exit_write(rwl)     rw_exit(rwl)
 
 #ifdef DIAGNOSTIC
 void   rw_assert_wrlock(struct rwlock *);
@@ -167,6 +158,7 @@ void        rw_assert_unlocked(struct rwlock *);
 int    rw_enter(struct rwlock *, int);
 void   rw_exit(struct rwlock *);
 int    rw_status(struct rwlock *);
+void   _rw_exit(struct rwlock *, int);
 
 static inline int
 rw_read_held(struct rwlock *rwl)
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index e5d461dcf81..b85bba9297f 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -195,37 +195,32 @@ void remrunqueue(struct proc *);
 } while (0)
 
 #if defined(MULTIPROCESSOR)
-#include <sys/lock.h>
+#include <sys/mutex.h>
 
-/*
- * XXX Instead of using struct lock for the kernel lock and thus requiring us
- * XXX to implement simplelocks, causing all sorts of fine-grained locks all
- * XXX over our tree to be activated, the sched_lock is a different kind of
- * XXX lock to avoid introducing locking protocol bugs.
- */
-extern struct __mp_lock sched_lock;
+extern struct mutex sched_lock;
 
 #define        SCHED_ASSERT_LOCKED()                                          \
 do {                                                                   \
        splassert(IPL_SCHED);                                           \
-       KASSERT(__mp_lock_held(&sched_lock, curcpu()));                 \
+       MUTEX_ASSERT_LOCKED(&sched_lock);                               \
 } while (0)
+
 #define        SCHED_ASSERT_UNLOCKED()                                        \
 do {                                                                   \
-       KASSERT(__mp_lock_held(&sched_lock, curcpu()) == 0);            \
+       MUTEX_ASSERT_UNLOCKED(&sched_lock);                             \
 } while (0)
 
-#define        SCHED_LOCK_INIT()       __mp_lock_init(&sched_lock)
+#define        SCHED_LOCK_INIT()       mtx_init(&sched_lock, IPL_NONE)
 
 #define        SCHED_LOCK(s)                                                  \
 do {                                                                   \
        s = splsched();                                                 \
-       __mp_lock(&sched_lock);                                         \
+       mtx_enter(&sched_lock);                                         \
 } while (/* CONSTCOND */ 0)
 
 #define        SCHED_UNLOCK(s)                                                \
 do {                                                                   \
-       __mp_unlock(&sched_lock);                                       \
+       mtx_leave(&sched_lock);                                         \
        splx(s);                                                        \
 } while (/* CONSTCOND */ 0)
 
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index a0ef9a3dd7f..f4a816ff855 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -265,9 +265,9 @@ void        cond_signal(struct cond *);
 
 struct mutex;
 struct rwlock;
-void    wakeup_n(const volatile void *, int);
+void    wakeup_n(const volatile void *, int, int);
 void    wakeup(const volatile void *);
-#define wakeup_one(c) wakeup_n((c), 1)
+#define wakeup_one(c) wakeup_n((c), 1, 0)
 int    tsleep(const volatile void *, int, const char *, int);
 int    tsleep_nsec(const volatile void *, int, const char *, uint64_t);
 int    msleep(const volatile void *, struct mutex *, int,  const char*, int);
