On Tue, 18 May 2021 10:22:11 -0500
Tony Asleson <tony.asle...@gmail.com> wrote:

> On Tue, May 18, 2021 at 1:12 AM Peter J. Philipp <p...@delphinusdns.org> 
> wrote:
> >
> > Just to follow up.  Nope that's not why, mine (my consaddr is at 98004000) 
> > is
> > high too, I just booted my loud G5.  So I apologize for the bad idea.
> >
> > But perhaps checking the RAM is a good idea anyhow.
> 
> I'm assuming your G5 is multiprocessor too?
> 
> I'll give this a try later by pulling all but a couple of sticks.  I'm
> also going to see if I can pick up some ECC memory for this system
> cheap on eBay.  I remembered that the model I have supports it.
> 
> I'll post a follow-up on what I find.
> 
> Thanks

Hello, Tony, Peter.

My G5 hung in the same way, and I found the bug in the OpenBSD
kernel.  I got unlucky when the system relinked my kernel in a random
order (KARL).  The reordered kernel then took a page fault while
trying to acquire or release the MP kernel's pmap hash lock, and that
fault hung the boot at the "using parent" line.  The fault can happen
only on G5, where the kernel runs with ppc_nobat and even kernel text
can take a page fault.

The problem might go away if the kernel gets relinked in a different
order.  The easiest way (though I haven't verified this) might be to
boot bsd.rd and use its "upgrade" feature to reinstall base69.tgz.  I
took a different route: I had been compiling my own kernels, so when
my bsd.mp froze, I booted an older bsd.mp that still worked and
compiled a new one.
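
A minimal sketch of that fallback (the file names are only examples,
nothing the installer creates): keep a copy of a kernel that is known
to boot, so a hung kernel never leaves the machine with nothing to
fall back to.

$ cp /bsd.mp /bsd.mp.ok

If a new kernel then hangs, boot the saved copy instead, for example
from Open Firmware (the exact device path varies by machine):

0 > boot hd:,ofwboot /bsd.mp.ok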

My diff, below, tries to fix the problem.  I would like people who
have a macppc with more than one cpu to compile and run a GENERIC.MP
kernel with my diff, and tell me whether it gives them trouble.  The
diff is for OpenBSD-current but should also apply to 6.9.  Something
like:

$ cd /usr
$ cvs ... checkout ... -P src/sys
$ cd /sys
$ patch -p0 < ~/this-email

https://www.openbsd.org/faq/faq5.html#Bld has help with cvs, and
release(8) shows how to build a kernel; because I have 2 cpus, I run
"make -j2 && make install" after "make config".

More info is in my mails to tech, "macppc bsd.mp pmap's hash lock",
archived at https://marc.info/?l=openbsd-tech&m=162035717625663&w=2

--George

Index: sys/arch/powerpc/include/mplock.h
===================================================================
RCS file: /cvs/src/sys/arch/powerpc/include/mplock.h,v
retrieving revision 1.4
diff -u -p -r1.4 mplock.h
--- sys/arch/powerpc/include/mplock.h   15 Apr 2020 08:09:00 -0000      1.4
+++ sys/arch/powerpc/include/mplock.h   16 May 2021 00:51:41 -0000
@@ -30,13 +30,14 @@
 #define __USE_MI_MPLOCK
 
 /*
+ * __ppc_lock exists because pte_spill_r() can't use __mp_lock.
  * Really simple spinlock implementation with recursive capabilities.
  * Correctness is paramount, no fancyness allowed.
  */
 
 struct __ppc_lock {
-       volatile struct cpu_info *mpl_cpu;
-       volatile long           mpl_count;
+       struct cpu_info *volatile       mpl_cpu;
+       long                            mpl_count;
 };
 
 #ifndef _LOCORE
@@ -44,10 +45,6 @@ struct __ppc_lock {
 void __ppc_lock_init(struct __ppc_lock *);
 void __ppc_lock(struct __ppc_lock *);
 void __ppc_unlock(struct __ppc_lock *);
-int __ppc_release_all(struct __ppc_lock *);
-int __ppc_release_all_but_one(struct __ppc_lock *);
-void __ppc_acquire_count(struct __ppc_lock *, int);
-int __ppc_lock_held(struct __ppc_lock *, struct cpu_info *);
 
 #endif
 
Index: sys/arch/powerpc/powerpc/lock_machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/powerpc/powerpc/lock_machdep.c,v
retrieving revision 1.9
diff -u -p -r1.9 lock_machdep.c
--- sys/arch/powerpc/powerpc/lock_machdep.c     15 Apr 2020 08:09:00 -0000      1.9
+++ sys/arch/powerpc/powerpc/lock_machdep.c     16 May 2021 00:51:41 -0000
@@ -1,6 +1,7 @@
 /*     $OpenBSD: lock_machdep.c,v 1.9 2020/04/15 08:09:00 mpi Exp $    */
 
 /*
+ * Copyright (c) 2021 George Koehler <gkoeh...@openbsd.org>
  * Copyright (c) 2007 Artur Grabowski <a...@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
@@ -22,10 +23,21 @@
 #include <sys/atomic.h>
 
 #include <machine/cpu.h>
-#include <machine/psl.h>
 
 #include <ddb/db_output.h>
 
+/*
+ * If __ppc_lock() crosses a page boundary in the kernel text, then it
+ * may cause a page fault (on G5 with ppc_nobat), and pte_spill_r()
+ * would recursively call __ppc_lock().  The lock must be in a valid
+ * state when the page fault happens.  We acquire or release the lock
+ * with a 32-bit atomic write to mpl_cpu, so the lock is always in a
+ * valid state, before or after the write.
+ *
+ * Acquired the lock:  mpl->mpl_cpu == curcpu()
+ * Released the lock:  mpl->mpl_cpu == NULL
+ */
+
 void
 __ppc_lock_init(struct __ppc_lock *lock)
 {
@@ -46,12 +58,12 @@ static __inline void
 __ppc_lock_spin(struct __ppc_lock *mpl)
 {
 #ifndef MP_LOCKDEBUG
-       while (mpl->mpl_count != 0)
+       while (mpl->mpl_cpu != NULL)
                CPU_BUSY_CYCLE();
 #else
        int nticks = __mp_lock_spinout;
 
-       while (mpl->mpl_count != 0 && --nticks > 0)
+       while (mpl->mpl_cpu != NULL && --nticks > 0)
                CPU_BUSY_CYCLE();
 
        if (nticks == 0) {
@@ -65,32 +77,33 @@ void
 __ppc_lock(struct __ppc_lock *mpl)
 {
        /*
-        * Please notice that mpl_count gets incremented twice for the
-        * first lock. This is on purpose. The way we release the lock
-        * in mp_unlock is to decrement the mpl_count and then check if
-        * the lock should be released. Since mpl_count is what we're
-        * spinning on, decrementing it in mpl_unlock to 0 means that
-        * we can't clear mpl_cpu, because we're no longer holding the
-        * lock. In theory mpl_cpu doesn't need to be cleared, but it's
-        * safer to clear it and besides, setting mpl_count to 2 on the
-        * first lock makes most of this code much simpler.
+        * Please notice that mpl_count stays at 0 for the first lock.
+        * A page fault might recursively call __ppc_lock() after we
+        * set mpl_cpu, but before we can increase mpl_count.
+        *
+        * After we acquire the lock, we need a "bc; isync" memory
+        * barrier, but we might not reach the barrier before the next
+        * page fault.  Then the fault's recursive __ppc_lock() must
+        * have a barrier.  membar_enter() is just "isync" and must
+        * come after a conditional branch for holding the lock.
         */
 
        while (1) {
-               int s;
+               struct cpu_info *owner = mpl->mpl_cpu;
+               struct cpu_info *ci = curcpu();
 
-               s = ppc_intr_disable();
-               if (atomic_cas_ulong(&mpl->mpl_count, 0, 1) == 0) {
+               if (owner == NULL) {
+                       /* Try to acquire the lock. */
+                       if (atomic_cas_ptr(&mpl->mpl_cpu, NULL, ci) == NULL) {
+                               membar_enter();
+                               break;
+                       }
+               } else if (owner == ci) {
+                       /* We hold the lock, but might need a barrier. */
                        membar_enter();
-                       mpl->mpl_cpu = curcpu();
-               }
-
-               if (mpl->mpl_cpu == curcpu()) {
                        mpl->mpl_count++;
-                       ppc_intr_enable(s);
                        break;
                }
-               ppc_intr_enable(s);
 
                __ppc_lock_spin(mpl);
        }
@@ -99,8 +112,6 @@ __ppc_lock(struct __ppc_lock *mpl)
 void
 __ppc_unlock(struct __ppc_lock *mpl)
 {
-       int s;
-
 #ifdef MP_LOCKDEBUG
        if (mpl->mpl_cpu != curcpu()) {
                db_printf("__ppc_unlock(%p): not held lock\n", mpl);
@@ -108,63 +119,10 @@ __ppc_unlock(struct __ppc_lock *mpl)
        }
 #endif
 
-       s = ppc_intr_disable();
-       if (--mpl->mpl_count == 1) {
-               mpl->mpl_cpu = NULL;
+       if (mpl->mpl_count == 0) {
+               /* Release the lock. */
                membar_exit();
-               mpl->mpl_count = 0;
-       }
-       ppc_intr_enable(s);
-}
-
-int
-__ppc_release_all(struct __ppc_lock *mpl)
-{
-       int rv = mpl->mpl_count - 1;
-       int s;
-
-#ifdef MP_LOCKDEBUG
-       if (mpl->mpl_cpu != curcpu()) {
-               db_printf("__ppc_release_all(%p): not held lock\n", mpl);
-               db_enter();
-       }
-#endif
-
-       s = ppc_intr_disable();
-       mpl->mpl_cpu = NULL;
-       membar_exit();
-       mpl->mpl_count = 0;
-       ppc_intr_enable(s);
-
-       return (rv);
-}
-
-int
-__ppc_release_all_but_one(struct __ppc_lock *mpl)
-{
-       int rv = mpl->mpl_count - 2;
-
-#ifdef MP_LOCKDEBUG
-       if (mpl->mpl_cpu != curcpu()) {
-               db_printf("__ppc_release_all_but_one(%p): not held lock\n", mpl);
-               db_enter();
-       }
-#endif
-
-       mpl->mpl_count = 2;
-
-       return (rv);
-}
-
-void
-__ppc_acquire_count(struct __ppc_lock *mpl, int count)
-{
-       while (count--)
-               __ppc_lock(mpl);
-}
-
-int
-__ppc_lock_held(struct __ppc_lock *mpl, struct cpu_info *ci)
-{
-       return mpl->mpl_cpu == ci;
+               mpl->mpl_cpu = NULL;
+       } else
+               mpl->mpl_count--;
 }
