On Tue, 16 Sep 2014 11:56:14 -0400
Alan Stern <st...@rowland.harvard.edu> wrote:

> On Tue, 16 Sep 2014, Joe Lawrence wrote:
> 
> > > Anyway, the log above means that a QH was linked before the HC died,
> > > but then it was never unlinked.  Please add a line at the start of
> > > ehci_endpoint_disable, printing out the value of ep->hcpriv for
> > > comparison with the values listed above.
> > 
> > Will do.
> > 
> > > Also, let's see the contents of *qh and *(qh->hw) for the left-over QH
> > > at *(ehci->async->qh_next.qh).  You can get those right away from your 
> > > crash dump data, right?
> > 
> > Yep, interestingly enough, *(ehci->async->qh_next.qh) appears to be
> > mostly full of zeros:
> 
> > crash> struct ehci_qh 0xffff88083d6b4558         << 
> > *(ehci->async->qh_next.qh)
> > struct ehci_qh {
> >   hw = 0x0,
> >   qh_dma = 0x0, 
> >   qh_next = {
> >     qh = 0x0, 
> >     itd = 0x0, 
> >     sitd = 0x0, 
> >     fstn = 0x0, 
> >     hw_next = 0x0, 
> >     ptr = 0x0
> >   }, 
> >   qtd_list = {
> >     next = 0x0, 
> >     prev = 0x0
> >   }, 
> >   intr_node = {
> >     next = 0x0, 
> >     prev = 0x0
> >   }, 
> >   dummy = 0x0, 
> >   unlink_node = {
> >     next = 0x0, 
> >     prev = 0x0
> >   }, 
> >   unlink_cycle = 0x6, 
> >   qh_state = 0x0, 
> >   xacterrs = 0x0, 
> >   usecs = 0x0, 
> >   gap_uf = 0x0, 
> >   c_usecs = 0x0, 
> >   tt_usecs = 0x0, 
> >   period = 0x0, 
> >   start = 0x0, 
> >   dev = 0x6, 
> >   is_out = 0x0, 
> >   clearing_tt = 0x0, 
> >   dequeue_during_giveback = 0x0, 
> >   exception = 0x0
> > }
> 
> So this QH was destroyed properly, but somehow the link pointer in
> ehci->async continued to point at it.  And it got destroyed without
> passing through single_unlink_async.
> 
> Maybe you should also add a check at the end of ehci_endpoint_disable, 
> just before the done: label.  For example,
> 
>       BUG_ON(ehci->async->qh_next.qh == qh);
> 
> Then if you really want to be thorough, in single_unlink_async, just
> after the "while" loop, print the values of prev, prev->qh_next.qh, qh,
> and qh->qh_next.qh.  And in qh_link_async, just after the wmb(), print
> the values of head->qh_next.qh and qh.  (This could end up generating
> quite a lot of output...)
> 
> Alan Stern
> 

Quick turn around on this one :)

crash> log | grep 0000:2c:00.0

pci 0000:2c:00.0: [8086:1d26] type 00 class 0x0c0320
pci 0000:2c:00.0: reg 0x10: [mem 0x90000000-0x900003ff]
pci 0000:2c:00.0: PME# supported from D0 D3hot D3cold
ehci-pci 0000:2c:00.0: EHCI Host Controller
ehci-pci 0000:2c:00.0: new USB bus registered, assigned bus number 1
ehci-pci 0000:2c:00.0: debug port 2
ehci-pci 0000:2c:00.0: cache line size of 64 is not supported
ehci-pci 0000:2c:00.0: irq 10, io mem 0x90000000
ehci-pci 0000:2c:00.0: USB 2.0 started, EHCI 1.00
usb usb1: SerialNumber: 0000:2c:00.0
ehci-pci 0000:2c:00.0: qh_link_async:1003 ehci = ffff88084ff1b088, 
head->qh_next.qh =           (null), qh = ffff88083eacae50
ehci-pci 0000:2c:00.0: HC died; cleaning up
ehci-pci 0000:2c:00.0: ehci_endpoint_disable:944 ep->hcpriv = ffff88083eacae50, 
ehci(ffff88084ff1b088)->num_async = 1, ehci->async->qh_next.qh = 
ffff88083eacae50

------------[ cut here ]------------
kernel BUG at drivers/usb/host/ehci-hcd.c:1002!
invalid opcode: 0000 [#1] SMP 
...
CPU: 0 PID: 207 Comm: khubd Tainted: PF          O--------------   
3.10.0-123.6.3.el7.bigphysarea_expedient1_usb4.x86_64 #1
Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013
task: ffff880853ff8cb0 ti: ffff880853434000 task.ti: ffff880853434000
RIP: 0010:[<ffffffff814157c6>]  [<ffffffff814157c6>] 
ehci_endpoint_disable+0x1f6/0x200
RSP: 0018:ffff880853435ca0  EFLAGS: 00010046
RAX: ffff8800372cc720 RBX: ffff88083eacae50 RCX: dead000000200200
RDX: ffff88085f800710 RSI: 0000000000000047 RDI: 0000000000000047
RBP: ffff880853435ce0 R08: ffff88085f800710 R09: 0000000100230022
R10: ffff88085f807600 R11: ffffffff81412158 R12: ffff88084ff1ae68
R13: ffff88084ff1b174 R14: ffff880851914a88 R15: 0000000000000246
FS:  0000000000000000(0000) GS:ffff88085fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffc501a2000 CR3: 0000000850402000 CR4: 00000000000407f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
 ffff880800000001 ffff88083eacae50 ffff88084ff1b088 ffff880851914a40
 ffff880851914a88 0000000000000001 ffff880851914a40 00000000ffffff00
 ffff880853435d00 ffffffff813fabf3 ffff880851914a40 ffff880851914a88
Call Trace:
 [<ffffffff813fabf3>] usb_hcd_disable_endpoint+0x33/0x40
 [<ffffffff813fd8b9>] usb_disable_endpoint+0x69/0x90
 [<ffffffff813effbc>] usb_ep0_reinit+0x1c/0x40
 [<ffffffff813f4fba>] hub_port_connect_change+0x33a/0xb50
 [<ffffffff813f5b49>] hub_events+0x379/0xa30
 [<ffffffff813f6245>] hub_thread+0x45/0x1a0
 [<ffffffff81086ac0>] ? wake_up_bit+0x30/0x30
 [<ffffffff813f6200>] ? hub_events+0xa30/0xa30
 [<ffffffff81085aff>] kthread+0xcf/0xe0
 [<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140
 [<ffffffff815f31ec>] ret_from_fork+0x7c/0xb0
 [<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140


Just to review, I pasted the USB related patches to this kernel below.  The
drivers/usb/core/hub.c changes include:

  d8521af "usb: assign default peer ports for root hubs" (maxchild parts)
  c605f3c "usb: hub: take hub->hdev reference when processing from eventlist"

the rest are debugging for the qh_destroy BUG.

-- Joe

diff -Nupr before/drivers/usb/core/hub.c after/drivers/usb/core/hub.c
--- before/drivers/usb/core/hub.c       2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/core/hub.c        2014-09-16 12:35:26.816249313 -0400
@@ -1304,6 +1304,7 @@ static int hub_configure(struct usb_hub
        char *message = "out of memory";
        unsigned unit_load;
        unsigned full_load;
+       unsigned maxchild;
 
        hub->buffer = kmalloc(sizeof(*hub->buffer), GFP_KERNEL);
        if (!hub->buffer) {
@@ -1342,12 +1343,11 @@ static int hub_configure(struct usb_hub
                goto fail;
        }
 
-       hdev->maxchild = hub->descriptor->bNbrPorts;
-       dev_info (hub_dev, "%d port%s detected\n", hdev->maxchild,
-               (hdev->maxchild == 1) ? "" : "s");
+       maxchild = hub->descriptor->bNbrPorts;
+       dev_info (hub_dev, "%d port%s detected\n", maxchild,
+               (maxchild == 1) ? "" : "s");
 
-       hub->ports = kzalloc(hdev->maxchild * sizeof(struct usb_port *),
-                            GFP_KERNEL);
+       hub->ports = kzalloc(maxchild * sizeof(struct usb_port *), GFP_KERNEL);
        if (!hub->ports) {
                ret = -ENOMEM;
                goto fail;
@@ -1368,11 +1368,11 @@ static int hub_configure(struct usb_hub
                int     i;
                char    portstr [USB_MAXCHILDREN + 1];
 
-               for (i = 0; i < hdev->maxchild; i++)
+               for (i = 0; i < maxchild; i++)
                        portstr[i] = hub->descriptor->u.hs.DeviceRemovable
                                    [((i + 1) / 8)] & (1 << ((i + 1) % 8))
                                ? 'F' : 'R';
-               portstr[hdev->maxchild] = 0;
+               portstr[maxchild] = 0;
                dev_dbg(hub_dev, "compound device; port removable status: 
%s\n", portstr);
        } else
                dev_dbg(hub_dev, "standalone hub\n");
@@ -1484,7 +1484,7 @@ static int hub_configure(struct usb_hub
                if (hcd->power_budget > 0)
                        hdev->bus_mA = hcd->power_budget;
                else
-                       hdev->bus_mA = full_load * hdev->maxchild;
+                       hdev->bus_mA = full_load * maxchild;
                if (hdev->bus_mA >= full_load)
                        hub->mA_per_port = full_load;
                else {
@@ -1499,7 +1499,7 @@ static int hub_configure(struct usb_hub
                        hub->descriptor->bHubContrCurrent);
                hub->limited_power = 1;
 
-               if (remaining < hdev->maxchild * unit_load)
+               if (remaining < maxchild * unit_load)
                        dev_warn(hub_dev,
                                        "insufficient power available "
                                        "to use all downstream ports\n");
@@ -1567,15 +1567,17 @@ static int hub_configure(struct usb_hub
        if (hub->has_indicators && blinkenlights)
                hub->indicator [0] = INDICATOR_CYCLE;
 
-       for (i = 0; i < hdev->maxchild; i++) {
+       for (i = 0; i < maxchild; i++) {
                ret = usb_hub_create_port_device(hub, i + 1);
                if (ret < 0) {
                        dev_err(hub->intfdev,
                                "couldn't create port%d device.\n", i + 1);
-                       hdev->maxchild = i;
-                       goto fail_keep_maxchild;
+                       break;
                }
        }
+       hdev->maxchild = i;
+       if (ret < 0)
+               goto fail;
 
        usb_hub_adjust_deviceremovable(hdev, hub->descriptor);
 
@@ -1583,8 +1585,6 @@ static int hub_configure(struct usb_hub
        return 0;
 
 fail:
-       hdev->maxchild = 0;
-fail_keep_maxchild:
        dev_err (hub_dev, "config failed, %s (err %d)\n",
                        message, ret);
        /* hub_disconnect() frees urb and descriptor */
@@ -4650,9 +4650,10 @@ static void hub_events(void)
 
                hub = list_entry(tmp, struct usb_hub, event_list);
                kref_get(&hub->kref);
+               hdev = hub->hdev;
+               usb_get_dev(hdev);
                spin_unlock_irq(&hub_event_lock);
 
-               hdev = hub->hdev;
                hub_dev = hub->intfdev;
                intf = to_usb_interface(hub_dev);
                dev_dbg(hub_dev, "state %d ports %d chg %04x evt %04x\n",
@@ -4864,6 +4865,7 @@ static void hub_events(void)
                usb_autopm_put_interface(intf);
  loop_disconnected:
                usb_unlock_device(hdev);
+               usb_put_dev(hdev);
                kref_put(&hub->kref, hub_release);
 
         } /* end while (1) */
diff -Nupr before/drivers/usb/host/ehci.h after/drivers/usb/host/ehci.h
--- before/drivers/usb/host/ehci.h      2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci.h       2014-09-16 12:35:26.836249471 -0400
@@ -229,6 +229,8 @@ struct ehci_hcd {                   /* one per controlle
        struct dentry           *debug_dir;
 #endif
 
+       int num_async;
+
        /* platform-specific data -- must come last */
        unsigned long           priv[0] __aligned(sizeof(s64));
 };
diff -Nupr before/drivers/usb/host/ehci-hcd.c after/drivers/usb/host/ehci-hcd.c
--- before/drivers/usb/host/ehci-hcd.c  2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-hcd.c   2014-09-16 12:42:24.603336165 -0400
@@ -417,6 +417,7 @@ static void ehci_stop (struct usb_hcd *h
 
        ehci_dbg (ehci, "stop\n");
 
+       ehci_info(ehci, "%s:%d ehci(%p)->num_async = %d, 
ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ehci, ehci->num_async, 
ehci->async->qh_next.qh);
        /* no more interrupts ... */
 
        spin_lock_irq(&ehci->lock);
@@ -940,6 +941,8 @@ ehci_endpoint_disable (struct usb_hcd *h
        /* ASSERT:  any requests/urbs are being unlinked */
        /* ASSERT:  nobody can be submitting urbs for this any more */
 
+       ehci_info(ehci, "%s:%d ep->hcpriv = %p, ehci(%p)->num_async = %d, 
ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ep->hcpriv, ehci, 
ehci->num_async, ehci->async->qh_next.qh);
+
 rescan:
        spin_lock_irqsave (&ehci->lock, flags);
        qh = ep->hcpriv;
@@ -995,6 +998,9 @@ idle_timeout:
                        list_empty (&qh->qtd_list) ? "" : "(has tds)");
                break;
        }
+
+       BUG_ON(ehci->async->qh_next.qh == qh);
+
  done:
        ep->hcpriv = NULL;
        spin_unlock_irqrestore (&ehci->lock, flags);
diff -Nupr before/drivers/usb/host/ehci-q.c after/drivers/usb/host/ehci-q.c
--- before/drivers/usb/host/ehci-q.c    2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-q.c     2014-09-16 12:42:21.553284025 -0400
@@ -959,6 +959,25 @@ static void disable_async(struct ehci_hc
        ehci_poll_ASS(ehci);
 }
 
+static void check_async_ring(struct ehci_hcd *ehci, int add)
+{
+       struct ehci_qh *qh;
+       int n;
+
+       qh = ehci->async->qh_next.qh;
+       n = ehci->num_async += add;
+
+       while (qh && n > 0) {
+               qh = qh->qh_next.qh;
+               --n;
+       }
+       if (qh || n != 0) {
+               ehci_err(ehci, "EHCI async list corrupted: num %d n %d qh %p\n",
+                               ehci->num_async, n, qh);
+               BUG();
+       }
+}
+
 /* move qh (and its qtds) onto async queue; maybe enable queue.  */
 
 static void qh_link_async (struct ehci_hcd *ehci, struct ehci_qh *qh)
@@ -981,6 +1000,8 @@ static void qh_link_async (struct ehci_h
        qh->hw->hw_next = head->hw->hw_next;
        wmb ();
 
+       ehci_info(ehci, "%s:%d ehci = %p, head->qh_next.qh = %p, qh = %p\n", 
__func__, __LINE__, ehci, head->qh_next.qh, qh);
+
        head->qh_next.qh = qh;
        head->hw->hw_next = dma;
 
@@ -990,6 +1011,8 @@ static void qh_link_async (struct ehci_h
        /* qtd completions reported later by interrupt */
 
        enable_async(ehci);
+
+       check_async_ring(ehci, 1);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -1254,10 +1277,14 @@ static void single_unlink_async(struct e
        while (prev->qh_next.qh != qh)
                prev = prev->qh_next.qh;
 
+       ehci_info(ehci, "%s:%d ehci = %p prev = %p, prev->qh_next.qh = %p, qh = 
%p, qh->qh_next.qh = %p\n", __func__, __LINE__, ehci, prev, prev->qh_next.qh, 
qh, qh->qh_next.qh);
+
        prev->hw->hw_next = qh->hw->hw_next;
        prev->qh_next = qh->qh_next;
        if (ehci->qh_scan_next == qh)
                ehci->qh_scan_next = qh->qh_next.qh;
+
+       check_async_ring(ehci, -1);
 }
 
 static void start_iaa_cycle(struct ehci_hcd *ehci)


--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to