On Mon, Sep 06, 2021 at 09:43:29PM +0200, Patrick Wildt wrote:
> On Fri, Jul 30, 2021 at 07:55:29PM +0200, Alexander Bluhm wrote:
> > On Mon, Jul 26, 2021 at 08:12:39AM -0500, Scott Cheloha wrote:
> > > On Fri, Jun 25, 2021 at 06:09:27PM -0500, Scott Cheloha wrote:
> > > 1 month bump. I really appreciate the tests I've gotten so far, thank
> > > you.
> > 
> > On my Xeon machine it works and all regress tests pass.
> > 
> > But it fails on my old Opteron machine. It hangs after attaching
> > cpu1.
> 
> This seems to be caused by contention on the mutex in i8254's gettick().
> 
> With Scott's diff, delay_func is i8254_delay() on that old AMD machine.
> Its gettick() implementation uses a mutex to protect I/O access to the
> i8254.
> 
> When secondary CPUs come up, they will wait for CPU0 to let them boot up
> further by checking for a flag:
> 
> 	/*
> 	 * We need to wait until we can identify, otherwise dmesg
> 	 * output will be messy.
> 	 */
> 	while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
> 		delay(10);
> 
> Now that machine has 3 secondary cores that are spinning like that. At
> the same time CPU0 waits for the core to come up:
> 
> 	/* wait for it to identify */
> 	for (i = 2000000; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
> 		delay(10);
> 
> That means we have 3-4 cores spinning just to be able to delay(). Our
> mutex implementation isn't fair, which means whoever manages to claim
> the free mutex wins. So if CPU2 and CPU3 are spinning all the time
> while CPU1 identifies and needs delay() and CPU0 waits for CPU1, the
> one that needs to make progress may never get the mutex.
> 
> I changed those delay(10) in cpu_hatch() to CPU_BUSY_CYCLE() and it went
> ahead a bit better instead of hanging forever.
> 
> Then I remembered an idea from years ago: fair kernel mutexes, so
> basically mutexes implemented as a ticket lock, like our kernel lock.
> 
> I did a quick diff, which probably contains a million bugs, but with
> this bluhm's machine boots up well.
> 
> I'm not saying this is the solution, but it might be.
> 
> Patrick
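To recap the idea before the updated diff: a ticket lock hands every
locker a ticket number and serves those numbers in order, so each waiter
is guaranteed to make progress eventually. Roughly like this, as a
minimal userland sketch with C11 atomics (illustration only, not the
kernel code below):

#include <stdatomic.h>

struct ticket_lock {
	atomic_uint users;	/* next ticket number to hand out */
	atomic_uint ticket;	/* ticket number currently being served */
};

/* both counters start at 0 */
#define TICKET_LOCK_INITIALIZER	{ 0, 0 }

static void
ticket_lock(struct ticket_lock *tl)
{
	/* take the next ticket; fetch_add returns the previous value */
	unsigned int t = atomic_fetch_add(&tl->users, 1);

	/* spin until our number is being served */
	while (atomic_load(&tl->ticket) != t)
		;
}

static void
ticket_unlock(struct ticket_lock *tl)
{
	/* hand the lock to the next waiter in line */
	atomic_fetch_add(&tl->ticket, 1);
}

The diff below does the same thing with mtx_users as the ticket
dispenser and mtx_ticket as the "now serving" counter, plus the usual
SPL and WITNESS handling.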
Cleaned the diff up a little, changes since last time:

* Rename the struct members to be the same as mplock.
* Change the code to use ticket/user numbers like mplock. This has one
  obvious downside: if a mutex is not initialized, trying to take it
  will result in a hang. At least that already let me find some
  uninitialized mutexes.
* More consistent use of the 'ci' variable.
* Definitely compiles with/without DIAGNOSTIC.
* Made sure mtx_enter() still has the membar.
* No need for READ_ONCE() when members are volatile.

Apart from being fair, this diff also changes the behaviour while
spinning for a lock. Previously mtx_enter() called mtx_enter_try() in a
loop until it got the lock; mtx_enter_try() does splraise, try lock,
splx. This diff currently spins with the SPL raised, so that's a change
in behaviour. I'm sure I can change the diff to do splraise/splx while
looping, if we prefer that behaviour.

Patrick

diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 5cc55bb256a..1eeb30e0c40 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -248,6 +248,8 @@ __mtx_init(struct mutex *mtx, int wantipl)
 	mtx->mtx_owner = NULL;
 	mtx->mtx_wantipl = wantipl;
 	mtx->mtx_oldipl = IPL_NONE;
+	mtx->mtx_users = 0;
+	mtx->mtx_ticket = 1;
 }
 
 #ifdef MULTIPROCESSOR
@@ -255,15 +257,26 @@ void
 mtx_enter(struct mutex *mtx)
 {
 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
+	struct cpu_info *ci = curcpu();
 #ifdef MP_LOCKDEBUG
 	int nticks = __mp_lock_spinout;
 #endif
+	u_int t;
+	int s;
+
+	/* Avoid deadlocks after panic or in DDB */
+	if (panicstr || db_active)
+		return;
 
 	WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx),
 	    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
 
+	if (mtx->mtx_wantipl != IPL_NONE)
+		s = splraise(mtx->mtx_wantipl);
+
 	spc->spc_spinning++;
-	while (mtx_enter_try(mtx) == 0) {
+	t = atomic_inc_int_nv(&mtx->mtx_users);
+	while (mtx->mtx_ticket != t) {
 		CPU_BUSY_CYCLE();
 
 #ifdef MP_LOCKDEBUG
@@ -275,12 +288,22 @@ mtx_enter(struct mutex *mtx)
 #endif
 	}
 	spc->spc_spinning--;
+
+	membar_enter_after_atomic();
+	mtx->mtx_owner = ci;
+	if (mtx->mtx_wantipl != IPL_NONE)
+		mtx->mtx_oldipl = s;
+#ifdef DIAGNOSTIC
+	ci->ci_mutex_level++;
+#endif
+	WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
 }
 
 int
 mtx_enter_try(struct mutex *mtx)
 {
-	struct cpu_info *owner, *ci = curcpu();
+	struct cpu_info *ci = curcpu();
+	u_int t;
 	int s;
 
 	/* Avoid deadlocks after panic or in DDB */
@@ -290,13 +313,15 @@ mtx_enter_try(struct mutex *mtx)
 	if (mtx->mtx_wantipl != IPL_NONE)
 		s = splraise(mtx->mtx_wantipl);
 
-	owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
 #ifdef DIAGNOSTIC
-	if (__predict_false(owner == ci))
+	if (__predict_false(mtx->mtx_owner == ci))
 		panic("mtx %p: locking against myself", mtx);
 #endif
-	if (owner == NULL) {
+
+	t = mtx->mtx_ticket - 1;
+	if (atomic_cas_uint(&mtx->mtx_users, t, t + 1) == t) {
 		membar_enter_after_atomic();
+		mtx->mtx_owner = ci;
 		if (mtx->mtx_wantipl != IPL_NONE)
 			mtx->mtx_oldipl = s;
 #ifdef DIAGNOSTIC
@@ -369,6 +394,9 @@ mtx_leave(struct mutex *mtx)
 	membar_exit();
 #endif
 	mtx->mtx_owner = NULL;
+#ifdef MULTIPROCESSOR
+	mtx->mtx_ticket++;
+#endif
 	if (mtx->mtx_wantipl != IPL_NONE)
 		splx(s);
 }
diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h
index 22a375a0b96..5204e8b482e 100644
--- a/sys/sys/mutex.h
+++ b/sys/sys/mutex.h
@@ -43,6 +43,8 @@ struct mutex {
 	volatile void *mtx_owner;
 	int mtx_wantipl;
 	int mtx_oldipl;
+	volatile u_int mtx_ticket;
+	u_int mtx_users;
 #ifdef WITNESS
 	struct lock_object mtx_lock_obj;
 #endif
@@ -64,10 +66,10 @@ struct mutex {
 
 #ifdef WITNESS
 #define MUTEX_INITIALIZER_FLAGS(ipl, name, flags) \
-	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, MTX_LO_INITIALIZER(name, flags) }
+	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, 1, 0, MTX_LO_INITIALIZER(name, flags) }
 #else
 #define MUTEX_INITIALIZER_FLAGS(ipl, name, flags) \
-	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE }
+	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, 1, 0 }
 #endif
 
 void __mtx_init(struct mutex *, int);
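To make the counter handling easier to follow, this is how I read the
new fields with the initial values from __mtx_init() (a worked example,
not part of the diff): the mutex is free and uncontended exactly when
mtx_users == mtx_ticket - 1.

	event			mtx_users	mtx_ticket
	__mtx_init()		0		1
	CPU0 mtx_enter()	1		1	t == 1, CPU0 owns the mutex
	CPU1 mtx_enter()	2		1	t == 2, spins
	CPU2 mtx_enter()	3		1	t == 3, spins
	CPU0 mtx_leave()	3		2	CPU1's turn (FIFO)
	CPU1 mtx_leave()	3		3	CPU2's turn
	CPU2 mtx_leave()	3		4	free again

mtx_enter_try() only succeeds in that uncontended case: its CAS bumps
mtx_users from mtx_ticket - 1 to mtx_ticket, so it never queues behind
other waiters and can still fail immediately.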