On Wed, Nov 04, 2020 at 09:13:22AM -0300, Martin Pieuchot wrote:
> Diff below removes the recursive attribute of the SCHED_LOCK() by
> turning it into an IPL_NONE mutex. I'm not intending to commit it
> yet as it raises multiple questions, see below.
> 
> This work has been started by art@ more than a decade ago and I'm
> willing to finish it as I believe it's the easiest way to reduce
> the scope of this lock. Having a global mutex is the first step to
> have a per runqueue and per sleepqueue mutex.
> 
> This is also a way to avoid lock ordering problems exposed by the recent
> races in single_thread_set().
> 
> About the diff:
> 
> The diff below includes an (ugly) refactoring of rw_exit() to avoid a
> recursion on the SCHED_LOCK(). In this case the lock is used to protect
> the global sleepqueue and is grabbed in sleep_setup().
> 
> The same pattern can be observed in single_thread_check(). However in
> this case the lock is used to protect different fields so there's no
> "recursive access" to the same data structure.
> 
> assertwaitok() has been moved down in mi_switch() which isn't ideal.
> 
> It becomes obvious that the per-CPU and per-thread accounting fields
> updated in mi_switch() won't need a separate mutex as proposed last
> year and that splitting this global mutex will be enough.
> 
> It's unclear to me if/how WITNESS should be modified to handle this lock
> change.
> 
> This has been tested on sparc64 and amd64. I'm not convinced it exposed
> all the recursions. So if you want to give it a go and can break it, it
> is more than welcome.
> 
> Comments? Questions?
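For concreteness, the rw_exit() recursion described above comes from the
sleep path: sleep_setup() grabs the SCHED_LOCK() to put the thread on the
sleepqueue, rwsleep() then drops the rwlock, and if the rwlock has waiters
rw_exit() ends up in wakeup(), which grabs the SCHED_LOCK() again.  A
simplified sketch of that path (function bodies trimmed down to the
locking calls, not the actual code):

	int
	rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
	    const char *wmesg, int timo)
	{
		...
		sleep_setup(&sls, ident, priority, wmesg, timo); /* takes SCHED_LOCK() */
		rw_exit(rwl);			/* drop the rwlock before sleeping */
		error = sleep_finish(&sls, 1);	/* releases SCHED_LOCK() */
		...
	}

	void
	rw_exit(struct rwlock *rwl)
	{
		...
		if (owner & RWLOCK_WAIT)
			wakeup(rwl);		/* takes SCHED_LOCK() again: recursion */
	}

With the diff, rwsleep() calls _rw_exit(rwl, 1) instead, so the final
wakeup becomes wakeup_n(rwl, -1, 1), which only asserts that the sched
lock is already held instead of taking it a second time.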
I think that's a good direction.  This then allows us to add a few smaller
mutexes and move some users over to those.

In the meantime some things have changed a little.  I had to fix one
reject, which was easy because only the context around the diff had
changed a tiny bit.  I then found one sched lock recursion:
single_thread_check() takes the sched lock and we call it through
sleep_signal_check().  We make that call twice, once with the sched lock
held and once without.  My 'quick' fix is to introduce a locked version,
sleep_signal_check_locked(), which calls single_thread_check_locked().
There might be a nicer way to do that, which I don't know yet, but this
diff seems to work fine on my arm64 machine with amdgpu.  (A condensed
sketch of that wrapper pattern follows after the diff.)

Build times increase a little, which might be because mutexes are not
'fair' like mplocks.  If we are looking for 'instant gratification' we
might have to do two steps at once (mplock -> mtx + move contention to
multiple mtxs).

In any case, I believe this is a good step to take and I'll have a look
at reducing the contention.

Patrick

diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 1e51f71301a..4e4e1fc2a27 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -671,7 +671,7 @@ void
 proc_trampoline_mp(void)
 {
 	SCHED_ASSERT_LOCKED();
-	__mp_unlock(&sched_lock);
+	mtx_leave(&sched_lock);
 	spl0();
 	SCHED_ASSERT_UNLOCKED();
 	KERNEL_ASSERT_UNLOCKED();
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 5cc55bb256a..2206ac53321 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -97,9 +97,6 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
 	if (mpl == &kernel_lock)
 		mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
 		    LO_SLEEPABLE | (LO_CLASS_KERNEL_LOCK << LO_CLASSSHIFT);
-	else if (mpl == &sched_lock)
-		mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
-		    LO_RECURSABLE | (LO_CLASS_SCHED_LOCK << LO_CLASSSHIFT);
 	WITNESS_INIT(&mpl->mpl_lock_obj, type);
 #endif
 }
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index d79b59748e8..2feeff16943 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -129,36 +129,6 @@ rw_enter_write(struct rwlock *rwl)
 	}
 }
 
-void
-rw_exit_read(struct rwlock *rwl)
-{
-	unsigned long owner;
-
-	rw_assert_rdlock(rwl);
-	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
-
-	membar_exit_before_atomic();
-	owner = rwl->rwl_owner;
-	if (__predict_false((owner & RWLOCK_WAIT) ||
-	    rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-		rw_do_exit(rwl, 0);
-}
-
-void
-rw_exit_write(struct rwlock *rwl)
-{
-	unsigned long owner;
-
-	rw_assert_wrlock(rwl);
-	WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
-
-	membar_exit_before_atomic();
-	owner = rwl->rwl_owner;
-	if (__predict_false((owner & RWLOCK_WAIT) ||
-	    rw_cas(&rwl->rwl_owner, owner, 0)))
-		rw_do_exit(rwl, RWLOCK_WRLOCK);
-}
-
 #ifdef DIAGNOSTIC
 /*
  * Put the diagnostic functions here to keep the main code free
@@ -313,9 +283,10 @@ retry:
 }
 
 void
-rw_exit(struct rwlock *rwl)
+_rw_exit(struct rwlock *rwl, int locked)
 {
 	unsigned long wrlock;
+	unsigned long owner, set;
 
 	/* Avoid deadlocks after panic or in DDB */
 	if (panicstr || db_active)
@@ -329,15 +300,6 @@ retry:
 	WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
 
 	membar_exit_before_atomic();
-	rw_do_exit(rwl, wrlock);
-}
-
-/* membar_exit_before_atomic() has to precede call of this function. */
-void
-rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
-{
-	unsigned long owner, set;
-
 	do {
 		owner = rwl->rwl_owner;
 		if (wrlock)
@@ -348,7 +310,13 @@ rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
 	} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
 	if (owner & RWLOCK_WAIT)
-		wakeup(rwl);
+		wakeup_n(rwl, -1, locked);
+}
+
+void
+rw_exit(struct rwlock *rwl)
+{
+	_rw_exit(rwl, 0);
 }
 
 int
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 5510108a7d5..7589b698d79 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -339,7 +339,7 @@ again:
 	 * This is kind of like a stupid idle loop.
 	 */
 #ifdef MULTIPROCESSOR
-	__mp_unlock(&sched_lock);
+	mtx_leave(&sched_lock);
 #endif
 	spl0();
 	delay(10);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index f23ffe9c1d6..d867862b18a 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -1969,7 +1969,7 @@ single_thread_check_locked(struct proc *p, int deep, int s)
 				continue;
 
 			if (atomic_dec_int_nv(&pr->ps_singlecount) == 0)
-				wakeup(&pr->ps_singlecount);
+				wakeup_n(&pr->ps_singlecount, 1, 1);
 
 			if (pr->ps_flags & PS_SINGLEEXIT) {
 				SCHED_UNLOCK(s);
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index b476a6b4253..73438e8ecdd 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -66,6 +66,7 @@
 #endif
 
 int	sleep_signal_check(void);
+int	sleep_signal_check_locked(int s);
 int	thrsleep(struct proc *, struct sys___thrsleep_args *);
 int	thrsleep_unlock(void *);
 
@@ -308,7 +309,7 @@ rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
 
 	sleep_setup(&sls, ident, priority, wmesg, timo);
 
-	rw_exit(rwl);
+	_rw_exit(rwl, 1);
 	/* signal may stop the process, release rwlock before that */
 	error = sleep_finish(&sls, 1);
 
@@ -410,7 +411,7 @@ sleep_finish(struct sleep_state *sls, int do_sleep)
 		 * that case we need to unwind immediately.
 		 */
 		atomic_setbits_int(&p->p_flag, P_SINTR);
-		if ((error = sleep_signal_check()) != 0) {
+		if ((error = sleep_signal_check_locked(sls->sls_s)) != 0) {
 			p->p_stat = SONPROC;
 			sls->sls_catch = 0;
 			do_sleep = 0;
@@ -473,11 +474,23 @@ sleep_finish(struct sleep_state *sls, int do_sleep)
  */
 int
 sleep_signal_check(void)
+{
+	int err, s;
+
+	SCHED_LOCK(s);
+	err = sleep_signal_check_locked(s);
+	SCHED_UNLOCK(s);
+
+	return err;
+}
+
+int
+sleep_signal_check_locked(int s)
 {
 	struct proc *p = curproc;
 	int err, sig;
 
-	if ((err = single_thread_check(p, 1)) != 0)
+	if ((err = single_thread_check_locked(p, 1, s)) != 0)
 		return err;
 	if ((sig = cursig(p)) != 0) {
 		if (p->p_p->ps_sigacts->ps_sigintr & sigmask(sig))
@@ -489,11 +502,12 @@ sleep_signal_check(void)
 }
 
 int
-wakeup_proc(struct proc *p, const volatile void *chan)
+_wakeup_proc_locked(struct proc *p, const volatile void *chan)
 {
-	int s, awakened = 0;
+	int awakened = 0;
+
+	SCHED_ASSERT_LOCKED();
 
-	SCHED_LOCK(s);
 	if (p->p_wchan != NULL &&
 	    ((chan == NULL) || (p->p_wchan == chan))) {
 		awakened = 1;
@@ -502,6 +516,17 @@ wakeup_proc(struct proc *p, const volatile void *chan)
 		else
 			unsleep(p);
 	}
+
+	return awakened;
+}
+
+int
+wakeup_proc(struct proc *p, const volatile void *chan)
+{
+	int s, awakened;
+
+	SCHED_LOCK(s);
+	awakened = _wakeup_proc_locked(p, chan);
 	SCHED_UNLOCK(s);
 
 	return awakened;
@@ -521,7 +546,7 @@ endtsleep(void *arg)
 	int s;
 
 	SCHED_LOCK(s);
-	if (wakeup_proc(p, NULL))
+	if (_wakeup_proc_locked(p, NULL))
 		atomic_setbits_int(&p->p_flag, P_TIMEOUT);
 	SCHED_UNLOCK(s);
 }
@@ -545,14 +570,18 @@ unsleep(struct proc *p)
  * Make a number of processes sleeping on the specified identifier runnable.
 */
 void
-wakeup_n(const volatile void *ident, int n)
+wakeup_n(const volatile void *ident, int n, int locked)
 {
 	struct slpque *qp;
 	struct proc *p;
 	struct proc *pnext;
 	int s;
 
-	SCHED_LOCK(s);
+	if (!locked)
+		SCHED_LOCK(s);
+	else
+		SCHED_ASSERT_LOCKED();
+
 	qp = &slpque[LOOKUP(ident)];
 	for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
 		pnext = TAILQ_NEXT(p, p_runq);
@@ -569,10 +598,12 @@ wakeup_n(const volatile void *ident, int n)
 		if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
 			panic("wakeup: p_stat is %d", (int)p->p_stat);
 #endif
-		if (wakeup_proc(p, ident))
+		if (_wakeup_proc_locked(p, ident))
 			--n;
 	}
-	SCHED_UNLOCK(s);
+
+	if (!locked)
+		SCHED_UNLOCK(s);
 }
 
 /*
@@ -581,7 +612,7 @@
 void
 wakeup(const volatile void *chan)
 {
-	wakeup_n(chan, -1);
+	wakeup_n(chan, -1, 0);
 }
 
 int
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
index 3b0a0b2cfc7..eea875e47db 100644
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -59,7 +59,7 @@ int	lbolt;			/* once a second sleep address */
 int	rrticks_init;		/* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
-struct __mp_lock sched_lock;
+struct mutex sched_lock;
 #endif
 
 void			schedcpu(void *);
@@ -337,10 +337,8 @@ mi_switch(void)
 	struct timespec ts;
 #ifdef MULTIPROCESSOR
 	int hold_count;
-	int sched_count;
 #endif
 
-	assertwaitok();
 	KASSERT(p->p_stat != SONPROC);
 
 	SCHED_ASSERT_LOCKED();
@@ -349,7 +347,6 @@ mi_switch(void)
 	/*
 	 * Release the kernel_lock, as we are about to yield the CPU.
 	 */
-	sched_count = __mp_release_all_but_one(&sched_lock);
 	if (_kernel_lock_held())
 		hold_count = __mp_release_all(&kernel_lock);
 	else
@@ -407,9 +404,10 @@ mi_switch(void)
 	 * just release it here.
 	 */
 #ifdef MULTIPROCESSOR
-	__mp_unlock(&sched_lock);
+	mtx_leave(&sched_lock);
 #endif
 
+	assertwaitok();
 	SCHED_ASSERT_UNLOCKED();
 
 	smr_idle();
@@ -431,7 +429,7 @@ mi_switch(void)
 	 */
 	if (hold_count)
 		__mp_acquire_count(&kernel_lock, hold_count);
-	__mp_acquire_count(&sched_lock, sched_count + 1);
+	mtx_enter(&sched_lock);
 #endif
 }
 
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 4dbc097f242..978fd5632cd 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -605,6 +605,7 @@ int	single_thread_set(struct proc *, enum single_thread_mode, int);
 int	single_thread_wait(struct process *, int);
 void	single_thread_clear(struct proc *, int);
 int	single_thread_check(struct proc *, int);
+int	single_thread_check_locked(struct proc *, int, int);
 
 void	child_return(void *);
 
diff --git a/sys/sys/rwlock.h b/sys/sys/rwlock.h
index f5a2504f4e8..fbed8783805 100644
--- a/sys/sys/rwlock.h
+++ b/sys/sys/rwlock.h
@@ -40,15 +40,6 @@
  * atomically test for the lock being 0 (it's not possible to have
  * owner/read count unset and waiter bits set) and if 0 set the owner to
  * the proc and RWLOCK_WRLOCK. While not zero, loop into rw_enter_wait.
- *
- * void rw_exit_read(struct rwlock *);
- * atomically decrement lock by RWLOCK_READ_INCR and unset RWLOCK_WAIT and
- * RWLOCK_WRWANT remembering the old value of lock and if RWLOCK_WAIT was set,
- * call rw_exit_waiters with the old contents of the lock.
- *
- * void rw_exit_write(struct rwlock *);
- * atomically swap the contents of the lock with 0 and if RWLOCK_WAIT was
- * set, call rw_exit_waiters with the old contents of the lock.
 */
 
 #ifndef _SYS_RWLOCK_H
@@ -149,8 +140,8 @@ void	_rw_init_flags(struct rwlock *, const char *, int,
 
 void	rw_enter_read(struct rwlock *);
 void	rw_enter_write(struct rwlock *);
-void	rw_exit_read(struct rwlock *);
-void	rw_exit_write(struct rwlock *);
+#define rw_exit_read(rwl)	rw_exit(rwl)
+#define rw_exit_write(rwl)	rw_exit(rwl)
 
 #ifdef DIAGNOSTIC
 void	rw_assert_wrlock(struct rwlock *);
@@ -167,6 +158,7 @@ void	rw_assert_unlocked(struct rwlock *);
 int	rw_enter(struct rwlock *, int);
 void	rw_exit(struct rwlock *);
 int	rw_status(struct rwlock *);
+void	_rw_exit(struct rwlock *, int);
 
 static inline int
 rw_read_held(struct rwlock *rwl)
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index e5d461dcf81..b85bba9297f 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -195,37 +195,32 @@ void	remrunqueue(struct proc *);
 } while (0)
 
 #if defined(MULTIPROCESSOR)
-#include <sys/lock.h>
+#include <sys/mutex.h>
 
-/*
- * XXX Instead of using struct lock for the kernel lock and thus requiring us
- * XXX to implement simplelocks, causing all sorts of fine-grained locks all
- * XXX over our tree to be activated, the sched_lock is a different kind of
- * XXX lock to avoid introducing locking protocol bugs.
- */
-extern struct __mp_lock sched_lock;
+extern struct mutex sched_lock;
 
 #define	SCHED_ASSERT_LOCKED()						\
 do {									\
 	splassert(IPL_SCHED);						\
-	KASSERT(__mp_lock_held(&sched_lock, curcpu()));			\
+	MUTEX_ASSERT_LOCKED(&sched_lock);				\
 } while (0)
+
 #define	SCHED_ASSERT_UNLOCKED()						\
 do {									\
-	KASSERT(__mp_lock_held(&sched_lock, curcpu()) == 0);		\
+	MUTEX_ASSERT_UNLOCKED(&sched_lock);				\
 } while (0)
 
-#define	SCHED_LOCK_INIT()	__mp_lock_init(&sched_lock)
+#define	SCHED_LOCK_INIT()	mtx_init(&sched_lock, IPL_NONE)
 
 #define	SCHED_LOCK(s)							\
 do {									\
 	s = splsched();							\
-	__mp_lock(&sched_lock);						\
+	mtx_enter(&sched_lock);						\
 } while (/* CONSTCOND */ 0)
 
 #define	SCHED_UNLOCK(s)							\
 do {									\
-	__mp_unlock(&sched_lock);					\
+	mtx_leave(&sched_lock);						\
 	splx(s);							\
 } while (/* CONSTCOND */ 0)
 
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index a0ef9a3dd7f..f4a816ff855 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -265,9 +265,9 @@ void	cond_signal(struct cond *);
 
 struct mutex;
 struct rwlock;
-void	wakeup_n(const volatile void *, int);
+void	wakeup_n(const volatile void *, int, int);
 void	wakeup(const volatile void *);
-#define wakeup_one(c) wakeup_n((c), 1)
+#define wakeup_one(c) wakeup_n((c), 1, 0)
 int	tsleep(const volatile void *, int, const char *, int);
 int	tsleep_nsec(const volatile void *, int, const char *, uint64_t);
 int	msleep(const volatile void *, struct mutex *, int, const char*, int);
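As mentioned above, the 'quick' fix for the sleep_signal_check() recursion
is the usual locked/unlocked wrapper split.  A condensed sketch of the
shape (same idea as the kern_synch.c hunks above, not additional code to
apply):

	int
	sleep_signal_check(void)
	{
		int err, s;

		SCHED_LOCK(s);
		err = sleep_signal_check_locked(s);	/* does the real work */
		SCHED_UNLOCK(s);

		return err;
	}

sleep_finish(), which already holds the sched lock from sleep_setup(),
calls sleep_signal_check_locked() directly, and single_thread_check() /
single_thread_check_locked() as well as wakeup_proc() /
_wakeup_proc_locked() in the diff follow the same pattern.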