Since approximately November 2012, I've had two amd64 systems hang while
spewing "ehci_idone: ex=0xffff800000...... is done!" messages to the
serial console.  The hangs are intermittent.  The system is
unresponsive to the keyboard and doesn't respond to network ping.  A
hardware reset is necessary to regain control.

In order to help troubleshoot, I patched /usr/src/sys/dev/usb/ehci.c
to panic when the aforementioned message had occurred 9 times and then
built a custom kernel with EHCI_DEBUG defined.  In the past day, the
new panic has occurred on the same machine with both an mp and sp
kernel and I have collected basic ddb information as well as crash
dumps.

Will the ddb results from my patch [below] help troubleshoot the hang?
If so, the largish console logs, usbdevs, pcidump and acpidump are
located at <http://arp.thrush.com/openbsd/ehci_idone/x4/>.

NB: ehcidebug=0 in the sp session, while ehcidebug=3 or 2 in the mp session.
Setting ehcidebug=3 seemed to hang but I was able to interrupt ddb, set
ehcidebug=2 and continue the ddb session.

I appreciate any help diagnosing this problem.

Thanks, Bob


Index: dev/usb/ehci.c
===================================================================
RCS file: /pub2/cvsroot/OpenBSD/src/sys/dev/usb/ehci.c,v
retrieving revision 1.134
diff -u -p -w -b -u -r1.134 ehci.c
--- dev/usb/ehci.c      12 Jun 2013 11:42:01 -0000      1.134
+++ dev/usb/ehci.c      12 Jun 2013 12:47:18 -0000
@@ -81,6 +81,8 @@ struct cfdriver ehci_cd = {
 #define DPRINTF(x)     do { if (ehcidebug) printf x; } while(0)
 #define DPRINTFN(n,x)  do { if (ehcidebug>(n)) printf x; } while (0)
 int ehcidebug = 0;
+int ehcicount = 0;
+int ehcicount_max = 10; /* panic - use ddb to gather more info before restarting */
 #define bitmask_snprintf(q,f,b,l) snprintf((b), (l), "%b", (q), (f))
 #else
 #define DPRINTF(x)
@@ -808,12 +810,15 @@ ehci_idone(struct ehci_xfer *ex)
        {
                int s = splhigh();
                if (ex->isdone) {
+                       if ( ++ehcicount >= ehcicount_max ) {
+                               panic("ehci_idone: ex is done!\n");
+                       }
                        splx(s);
 #ifdef EHCI_DEBUG
-                       printf("ehci_idone: ex is done!\n   ");
+                       printf("ehci_idone: ex is done!ehcicount=%d\n   ", ehcicount);
                        ehci_dump_exfer(ex);
 #else
-                       printf("ehci_idone: ex=%p is done!\n", ex);
+                       printf("ehci_idone: ex=%p is done!ehcicount=%d\n", ex, ehcicount);
 #endif
                        return;
                }

Reply via email to