All:

I have been working with TCP in gigabit networks (igb driver actually) and have
found a very nasty problem with the way the driver is doing its put back when
it fills the out-bound transmit queue.

Basically it has taken a packet from the head of the ring buffer, and then 
realizes it can't fit it into the transmit queue. So it just re-enqueue's it
into the ring buffer. Whats wrong with that? Well most of the time there
are anywhere from 10-50 packets (maybe more) in that ring buffer when you are
operating at full speed (or trying to). This means you will see 10 duplicate
ACKs, do a fast retransmit and cut your cwnd in half.. not very nice actually.

The patch I have attached makes it so that

1) There are ways to swap back.
2) Use the peek in the ring buffer and only
   dequeue the packet if we put it into the transmit ring
3) If something goes wrong and the transmit frees the packet we dequeue it.
4) If the transmit changed it (defrag etc) then swap out the new mbuf that
   has taken its place.

I have fixed the four intel drivers that had this systemic issue, but there
are still more to fix.

Comments/review .. rotten egg's etc.. would be most welcome before
I commit this..

Jack are you out there?

Thanks

R

Index: dev/e1000/if_em.c
===================================================================
--- dev/e1000/if_em.c   (revision 246323)
+++ dev/e1000/if_em.c   (working copy)
@@ -894,7 +894,7 @@ static int
 em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
 {
        struct adapter  *adapter = txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *next, *dequeued;
         int             err = 0, enq = 0;
 
        if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
@@ -905,22 +905,27 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ri
        }
 
        enq = 0;
-       if (m == NULL) {
-               next = drbr_dequeue(ifp, txr->br);
-       } else if (drbr_needs_enqueue(ifp, txr->br)) {
-               if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+       if (m) {
+               err = drbr_enqueue(ifp, txr->br, m);
+               if (err) {
                        return (err);
-               next = drbr_dequeue(ifp, txr->br);
-       } else
-               next = m;
+               }
+       } 
 
        /* Process the queue */
-       while (next != NULL) {
+       while ((next = drbr_peek(ifp, txr->br)) != NULL) {
+               snext = next;
                if ((err = em_xmit(txr, &next)) != 0) {
-                        if (next != NULL)
-                                err = drbr_enqueue(ifp, txr->br, next);
-                        break;
+                       if (next == NULL) {
+                               dequeued = drbr_dequeue(ifp, txr->br);
+                               KASSERT(dequeued == snext, ("dequeued incorrect 
packet from buf_ring"));
+                       } else if (next != snext) {
+                               drbr_swap(ifp, txr->br, next, snext);
+                       }
+                       break;
                }
+               dequeued = drbr_dequeue(ifp, txr->br);
+               KASSERT(dequeued == snext, ("dequeued incorrect packet from 
buf_ring"));
                enq++;
                ifp->if_obytes += next->m_pkthdr.len;
                if (next->m_flags & M_MCAST)
Index: dev/e1000/if_igb.c
===================================================================
--- dev/e1000/if_igb.c  (revision 246323)
+++ dev/e1000/if_igb.c  (working copy)
@@ -981,7 +981,7 @@ static int
 igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
 {
        struct adapter  *adapter = txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *snext, *dequeued;
         int             err = 0, enq;
 
        IGB_TX_LOCK_ASSERT(txr);
@@ -994,12 +994,21 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_r
        enq = 0;
 
        /* Process the queue */
-       while ((next = drbr_dequeue(ifp, txr->br)) != NULL) {
+       while ((next = drbr_peek(ifp, txr->br)) != NULL) {
+               snext = next;
                if ((err = igb_xmit(txr, &next)) != 0) {
-                       if (next != NULL)
-                               err = drbr_enqueue(ifp, txr->br, next);
+                       if (next == NULL) {
+                               /* It was freed, dequeue it */
+                               dequeued = drbr_dequeue(ifp, txr->br);
+                               KASSERT(dequeued == snext, ("dequeued incorrect 
packet from buf_ring"));
+                       } else if (next != snext) {
+                               /* it was changed -- defrag? pullup? */
+                               drbr_swap(ifp, txr->br, next, snext);
+                       }
                        break;
                }
+               dequeued = drbr_dequeue(ifp, txr->br);
+               KASSERT(dequeued == snext, ("dequeued incorrect packet from 
buf_ring"));
                enq++;
                ifp->if_obytes += next->m_pkthdr.len;
                if (next->m_flags & M_MCAST)
Index: dev/ixgbe/ixgbe.c
===================================================================
--- dev/ixgbe/ixgbe.c   (revision 246323)
+++ dev/ixgbe/ixgbe.c   (working copy)
@@ -821,7 +821,7 @@ static int
 ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
 {
        struct adapter  *adapter = txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *snext, *dequeued;
         int             enqueued, err = 0;
 
        if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
@@ -832,22 +832,27 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx
        }
 
        enqueued = 0;
-       if (m == NULL) {
-               next = drbr_dequeue(ifp, txr->br);
-       } else if (drbr_needs_enqueue(ifp, txr->br)) {
-               if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+       if (m) {
+               err = drbr_enqueue(ifp, txr->br, m);
+               if (err) {
                        return (err);
-               next = drbr_dequeue(ifp, txr->br);
-       } else
-               next = m;
+               }
+       }
 
        /* Process the queue */
-       while (next != NULL) {
+       while ((next = drbr_peek(ifp, txr->br)) != NULL) {
+               snext = next;
                if ((err = ixgbe_xmit(txr, &next)) != 0) {
-                       if (next != NULL)
-                               err = drbr_enqueue(ifp, txr->br, next);
+                       if (next == NULL) {
+                               dequeued = drbr_dequeue(ifp, txr->br);
+                               KASSERT(dequeued == snext, ("dequeued incorrect 
packet from buf_ring"));
+                       } else if (next != snext) {
+                               drbr_swap(ifp, txr->br, next, snext);
+                       }
                        break;
                }
+               dequeued = drbr_dequeue(ifp, txr->br);
+               KASSERT(dequeued == snext, ("dequeued incorrect packet from 
buf_ring"));
                enqueued++;
                /* Send a copy of the frame to the BPF listener */
                ETHER_BPF_MTAP(ifp, next);
Index: dev/ixgbe/ixv.c
===================================================================
--- dev/ixgbe/ixv.c     (revision 246323)
+++ dev/ixgbe/ixv.c     (working copy)
@@ -605,7 +605,7 @@ static int
 ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
 {
        struct adapter  *adapter = txr->adapter;
-        struct mbuf     *next;
+        struct mbuf     *next, *snext, *dequeued;
         int             enqueued, err = 0;
 
        if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
@@ -620,22 +620,26 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_r
                ixv_txeof(txr);
 
        enqueued = 0;
-       if (m == NULL) {
-               next = drbr_dequeue(ifp, txr->br);
-       } else if (drbr_needs_enqueue(ifp, txr->br)) {
-               if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+       if (m) {
+               err = drbr_enqueue(ifp, txr->br, m);
+               if (err) {
                        return (err);
-               next = drbr_dequeue(ifp, txr->br);
-       } else
-               next = m;
-
+               }
+       }
        /* Process the queue */
-       while (next != NULL) {
+       while ((next = drbr_peek(ifp, txr->br)) != NULL) {
+               snext = next;
                if ((err = ixv_xmit(txr, &next)) != 0) {
-                       if (next != NULL)
-                               err = drbr_enqueue(ifp, txr->br, next);
+                       if (next == NULL) {
+                               dequeued = drbr_dequeue(ifp, txr->br);
+                               KASSERT(dequeued == snext, ("dequeued incorrect 
packet from buf_ring"));
+                       } else if (next != snext) {
+                               drbr_swap(ifp, txr->br, next, snext);
+                       }
                        break;
                }
+               dequeued = drbr_dequeue(ifp, txr->br);
+               KASSERT(dequeued == snext, ("dequeued incorrect packet from 
buf_ring"));
                enqueued++;
                ifp->if_obytes += next->m_pkthdr.len;
                if (next->m_flags & M_MCAST)
Index: net/if_var.h
===================================================================
--- net/if_var.h        (revision 246323)
+++ net/if_var.h        (working copy)
@@ -622,6 +622,41 @@ drbr_enqueue(struct ifnet *ifp, struct buf_ring *b
 }
 
 static __inline void
+drbr_swap(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new, struct 
mbuf *prev)
+{
+       /*
+        * The top of the list needs to be swapped 
+        * for this one.
+        */
+#ifdef ALTQ
+       struct mbuf *m;
+       if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+               /* Pull it off and put it back in */
+               IFQ_DEQUEUE(&ifp->if_snd, m);
+               KASSERT(m == prev, ("Swap out failed to find prev mbuf"));
+               IFQ_DRV_DEQUEUE(&ifp->if_snd, new);
+               return;
+       }
+#endif
+       buf_ring_swap(br, new, prev);
+}
+
+static __inline struct mbuf *
+drbr_peek(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+       struct mbuf *m;
+       if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+               /* Pull it off and put it back in */
+               IFQ_DEQUEUE(&ifp->if_snd, m);
+               IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+               return (m);
+       }
+#endif
+       return(buf_ring_peek(br));
+}
+
+static __inline void
 drbr_flush(struct ifnet *ifp, struct buf_ring *br)
 {
        struct mbuf *m;
Index: sys/buf_ring.h
===================================================================
--- sys/buf_ring.h      (revision 246323)
+++ sys/buf_ring.h      (working copy)
@@ -208,6 +208,27 @@ buf_ring_dequeue_sc(struct buf_ring *br)
 }
 
 /*
+ * Used to return a differnt mbuf to the
+ * top of the ring. This can happen if
+ * the driver changed the packets (some deframentation
+ * for example) and then realized the transmit
+ * ring was full. In such a case the old packet
+ * is now freed, but we want the order of the actual
+ * data (being sent in the new packet) to remain
+ * the same.
+ */
+static __inline void
+buf_ring_swap(struct buf_ring *br, void *new, void *old)
+{
+       int ret;
+       if (br->br_cons_head == br->br_prod_tail) 
+               /* Huh? */
+               return;
+       ret = atomic_cmpset_long((uint64_t *)&br->br_ring[br->br_cons_head], 
(uint64_t)old, (uint64_t)new);
+       KASSERT(ret, ("Swap out failed old:%p new:%p ret:%d", old, new, ret));
+}
+
+/*
  * return a pointer to the first entry in the ring
  * without modifying it, or NULL if the ring is empty
  * race-prone if not protected by a lock
-----
Randall Stewart
rand...@lakerest.net




_______________________________________________
freebsd-net@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"

Reply via email to