On Mon, Sep 06, 2021 at 09:43:29PM +0200, Patrick Wildt wrote:
> On Fri, Jul 30, 2021 at 07:55:29PM +0200, Alexander Bluhm wrote:
> > On Mon, Jul 26, 2021 at 08:12:39AM -0500, Scott Cheloha wrote:
> > > On Fri, Jun 25, 2021 at 06:09:27PM -0500, Scott Cheloha wrote:
> > > 1 month bump. I really appreciate the tests I've gotten so far, thank
> > > you.
> > 
> > On my Xeon machine it works and all regress tests pass.
> > 
> > But it fails on my old Opteron machine. It hangs after attaching
> > cpu1.
> 
> This seems to be caused by contention on the mutex in i8254's gettick().
> 
> With Scott's diff, delay_func is i8254_delay() on that old AMD machine.
> Its gettick() implementation uses a mutex to protect I/O access to the
> i8254.
> 
> When secondary CPUs come up, they will wait for CPU0 to let them boot up
> further by checking for a flag:
> 
> 	/*
> 	 * We need to wait until we can identify, otherwise dmesg
> 	 * output will be messy.
> 	 */
> 	while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
> 		delay(10);
> 
> Now that machine has 3 secondary cores that are spinning like that. At
> the same time CPU0 waits for the core to come up:
> 
> 	/* wait for it to identify */
> 	for (i = 2000000; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
> 		delay(10);
> 
> That means we have 3-4 cores spinning just to be able to delay(). Our
> mutex implementation isn't fair, which means whoever manages to claim
> the free mutex wins. So if CPU2 and CPU3 are spinning all the time
> while CPU1 identifies and needs delay() and CPU0 waits for CPU1, the
> one that needs to make progress may never get the mutex.
> 
> I changed those delay(10) in cpu_hatch() to CPU_BUSY_CYCLE() and it went
> ahead a bit better instead of hanging forever.
> 
> Then I remembered an idea from years ago: fair kernel mutexes, so
> basically mutexes implemented as a ticket lock, like our kernel lock.
> 
> I did a quick diff, which probably contains a million bugs, but with
> this bluhm's machine boots up well.
> 
> I'm not saying this is the solution, but it might be.
> 
> Patrick
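To recap the idea before the updated diff: a ticket lock hands every
locker a ticket number and serves those numbers in order, so each waiter
is guaranteed to make progress eventually. Roughly like this, as a
minimal userland sketch with C11 atomics (illustration only, not the
kernel code below):

#include <stdatomic.h>

struct ticket_lock {
	atomic_uint users;	/* next ticket number to hand out */
	atomic_uint ticket;	/* ticket number currently being served */
};

/* both counters start at 0 */
#define TICKET_LOCK_INITIALIZER	{ 0, 0 }

static void
ticket_lock(struct ticket_lock *tl)
{
	/* take the next ticket; fetch_add returns the previous value */
	unsigned int t = atomic_fetch_add(&tl->users, 1);

	/* spin until our number is being served */
	while (atomic_load(&tl->ticket) != t)
		;
}

static void
ticket_unlock(struct ticket_lock *tl)
{
	/* hand the lock to the next waiter in line */
	atomic_fetch_add(&tl->ticket, 1);
}

The diff below does the same thing with mtx_users as the ticket
dispenser and mtx_ticket as the "now serving" counter, plus the usual
SPL and WITNESS handling.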
Cleaned the diff up a little, changes since last time:

* Rename the struct members to be the same as mplock.
* Change the code to use ticket/user numbers like mplock. This has one
  obvious downside: if a mutex is not initialized, trying to take it
  will result in a hang. At least that already let me find some
  uninitialized mutexes.
* More consistent use of the 'ci' variable.
* Definitely compiles with/without DIAGNOSTIC.
* Made sure mtx_enter() still has the membar.
* No need for READ_ONCE() when members are volatile.

Apart from being fair, this diff also changes the behaviour while
spinning for a lock. Previously mtx_enter() called mtx_enter_try() in a
loop until it got the lock; mtx_enter_try() does splraise, try lock,
splx. This diff currently spins with the SPL raised, so that's a change
in behaviour. I'm sure I can change the diff to do splraise/splx while
looping, if we prefer that behaviour.

Patrick

diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 5cc55bb256a..1eeb30e0c40 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -248,6 +248,8 @@ __mtx_init(struct mutex *mtx, int wantipl)
 	mtx->mtx_owner = NULL;
 	mtx->mtx_wantipl = wantipl;
 	mtx->mtx_oldipl = IPL_NONE;
+	mtx->mtx_users = 0;
+	mtx->mtx_ticket = 1;
 }
 
 #ifdef MULTIPROCESSOR
@@ -255,15 +257,26 @@ void
 mtx_enter(struct mutex *mtx)
 {
 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
+	struct cpu_info *ci = curcpu();
 #ifdef MP_LOCKDEBUG
 	int nticks = __mp_lock_spinout;
 #endif
+	u_int t;
+	int s;
+
+	/* Avoid deadlocks after panic or in DDB */
+	if (panicstr || db_active)
+		return;
 
 	WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx),
 	    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
 
+	if (mtx->mtx_wantipl != IPL_NONE)
+		s = splraise(mtx->mtx_wantipl);
+
 	spc->spc_spinning++;
-	while (mtx_enter_try(mtx) == 0) {
+	t = atomic_inc_int_nv(&mtx->mtx_users);
+	while (mtx->mtx_ticket != t) {
 		CPU_BUSY_CYCLE();
 
 #ifdef MP_LOCKDEBUG
@@ -275,12 +288,22 @@ mtx_enter(struct mutex *mtx)
 #endif
 	}
 	spc->spc_spinning--;
+
+	membar_enter_after_atomic();
+	mtx->mtx_owner = ci;
+	if (mtx->mtx_wantipl != IPL_NONE)
+		mtx->mtx_oldipl = s;
+#ifdef DIAGNOSTIC
+	ci->ci_mutex_level++;
+#endif
+	WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
 }
 
 int
 mtx_enter_try(struct mutex *mtx)
 {
-	struct cpu_info *owner, *ci = curcpu();
+	struct cpu_info *ci = curcpu();
+	u_int t;
 	int s;
 
 	/* Avoid deadlocks after panic or in DDB */
@@ -290,13 +313,15 @@ mtx_enter_try(struct mutex *mtx)
 	if (mtx->mtx_wantipl != IPL_NONE)
 		s = splraise(mtx->mtx_wantipl);
 
-	owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
 #ifdef DIAGNOSTIC
-	if (__predict_false(owner == ci))
+	if (__predict_false(mtx->mtx_owner == ci))
 		panic("mtx %p: locking against myself", mtx);
 #endif
-	if (owner == NULL) {
+
+	t = mtx->mtx_ticket - 1;
+	if (atomic_cas_uint(&mtx->mtx_users, t, t + 1) == t) {
 		membar_enter_after_atomic();
+		mtx->mtx_owner = ci;
 		if (mtx->mtx_wantipl != IPL_NONE)
 			mtx->mtx_oldipl = s;
 #ifdef DIAGNOSTIC
@@ -369,6 +394,9 @@ mtx_leave(struct mutex *mtx)
 	membar_exit();
 #endif
 	mtx->mtx_owner = NULL;
+#ifdef MULTIPROCESSOR
+	mtx->mtx_ticket++;
+#endif
 	if (mtx->mtx_wantipl != IPL_NONE)
 		splx(s);
 }
diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h
index 22a375a0b96..5204e8b482e 100644
--- a/sys/sys/mutex.h
+++ b/sys/sys/mutex.h
@@ -43,6 +43,8 @@ struct mutex {
 	volatile void *mtx_owner;
 	int mtx_wantipl;
 	int mtx_oldipl;
+	volatile u_int mtx_ticket;
+	u_int mtx_users;
 #ifdef WITNESS
 	struct lock_object mtx_lock_obj;
 #endif
@@ -64,10 +66,10 @@ struct mutex {
 
 #ifdef WITNESS
 #define MUTEX_INITIALIZER_FLAGS(ipl, name, flags) \
-	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, MTX_LO_INITIALIZER(name, flags) }
+	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, 1, 0, MTX_LO_INITIALIZER(name, flags) }
 #else
 #define MUTEX_INITIALIZER_FLAGS(ipl, name, flags) \
-	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE }
+	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, 1, 0 }
 #endif
 
 void __mtx_init(struct mutex *, int);
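To make the counter handling easier to follow, this is how I read the
new fields with the initial values from __mtx_init() (a worked example,
not part of the diff): the mutex is free and uncontended exactly when
mtx_users == mtx_ticket - 1.

	event			mtx_users	mtx_ticket
	__mtx_init()		0		1
	CPU0 mtx_enter()	1		1	t == 1, CPU0 owns the mutex
	CPU1 mtx_enter()	2		1	t == 2, spins
	CPU2 mtx_enter()	3		1	t == 3, spins
	CPU0 mtx_leave()	3		2	CPU1's turn (FIFO)
	CPU1 mtx_leave()	3		3	CPU2's turn
	CPU2 mtx_leave()	3		4	free again

mtx_enter_try() only succeeds in that uncontended case: its CAS bumps
mtx_users from mtx_ticket - 1 to mtx_ticket, so it never queues behind
other waiters and can still fail immediately.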