Author: glebius
Date: Sun May  4 23:25:32 2014
New Revision: 265338
URL: http://svnweb.freebsd.org/changeset/base/265338

Log:
  The FreeBSD-SA-14:08.tcp was a lesson on not doing acrobatics with
  mixing on stack memory and UMA memory in one linked list.
  
  Thus, rewrite TCP reassembly code in terms of memory usage. The
  algorithm remains unchanged.
  
  We actually do not need extra memory to build a reassembly queue.
  Arriving mbufs are always packet header mbufs. So we got the length
  of data as pkthdr.len. We got m_nextpkt for linkage. And we need
  only one pointer to point at the tcphdr, use PH_loc for that.
  
  In tcpcb the t_segq fields becomes mbuf pointer. The t_segqlen
  field now counts not packets, but bytes in the queue. This gives
  us more precision when comparing to socket buffer limits.
  
  Sponsored by: Netflix
  Sponsored by: Nginx, Inc.

Modified:
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_reass.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/mbuf.h

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c        Sun May  4 20:21:41 2014        
(r265337)
+++ head/sys/netinet/tcp_input.c        Sun May  4 23:25:32 2014        
(r265338)
@@ -1639,8 +1639,7 @@ tcp_do_segment(struct mbuf *m, struct tc
            tp->snd_nxt == tp->snd_max &&
            tiwin && tiwin == tp->snd_wnd && 
            ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
-           LIST_EMPTY(&tp->t_segq) &&
-           ((to.to_flags & TOF_TS) == 0 ||
+           tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
             TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
                /*
@@ -2908,8 +2907,7 @@ dodata:                                                   
/* XXX */
                 * immediately when segments are out of order (so
                 * fast retransmit can work).
                 */
-               if (th->th_seq == tp->rcv_nxt &&
-                   LIST_EMPTY(&tp->t_segq) &&
+               if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
                    TCPS_HAVEESTABLISHED(tp->t_state)) {
                        if (DELAY_ACK(tp, tlen))
                                tp->t_flags |= TF_DELACK;

Modified: head/sys/netinet/tcp_reass.c
==============================================================================
--- head/sys/netinet/tcp_reass.c        Sun May  4 20:21:41 2014        
(r265337)
+++ head/sys/netinet/tcp_reass.c        Sun May  4 23:25:32 2014        
(r265338)
@@ -71,19 +71,10 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif /* TCPDEBUG */
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
-static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
-#define        V_tcp_reass_maxseg              VNET(tcp_reass_maxseg)
-SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
-    &VNET_NAME(tcp_reass_maxseg), 0,
-    "Global maximum number of TCP Segments in Reassembly Queue");
-
 static VNET_DEFINE(int, tcp_reass_overflows) = 0;
 #define        V_tcp_reass_overflows           VNET(tcp_reass_overflows)
 SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
@@ -91,78 +82,32 @@ SYSCTL_VNET_INT(_net_inet_tcp_reass, OID
     &VNET_NAME(tcp_reass_overflows), 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
-static VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-#define        V_tcp_reass_zone                VNET(tcp_reass_zone)
-SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
-    &VNET_NAME(tcp_reass_zone),
-    "Global number of TCP Segments currently in Reassembly Queue");
-
-/* Initialize TCP reassembly queue */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
-       /* Set the zone limit and read back the effective value. */
-       V_tcp_reass_maxseg = nmbclusters / 16;
-       V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
-           V_tcp_reass_maxseg);
-}
-
-void
-tcp_reass_init(void)
-{
-
-       V_tcp_reass_maxseg = nmbclusters / 16;
-       TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
-           &V_tcp_reass_maxseg);
-       V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
-           NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-       /* Set the zone limit and read back the effective value. */
-       V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
-           V_tcp_reass_maxseg);
-       EVENTHANDLER_REGISTER(nmbclusters_change,
-           tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-#ifdef VIMAGE
-void
-tcp_reass_destroy(void)
-{
-
-       uma_zdestroy(V_tcp_reass_zone);
-}
-#endif
-
 void
 tcp_reass_flush(struct tcpcb *tp)
 {
-       struct tseg_qent *qe;
+       struct mbuf *m;
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
-       while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
-               LIST_REMOVE(qe, tqe_q);
-               m_freem(qe->tqe_m);
-               uma_zfree(V_tcp_reass_zone, qe);
-               tp->t_segqlen--;
+       while ((m = tp->t_segq) != NULL) {
+               tp->t_segq = m->m_nextpkt;
+               tp->t_segqlen -= m->m_pkthdr.len;
+               m_freem(m);
        }
 
        KASSERT((tp->t_segqlen == 0),
-           ("TCP reass queue %p segment count is %d instead of 0 after flush.",
+           ("TCP reass queue %p length is %d instead of 0 after flush.",
            tp, tp->t_segqlen));
 }
 
+#define        M_TCPHDR(m)     ((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
+
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
-       struct tseg_qent *q;
-       struct tseg_qent *p = NULL;
-       struct tseg_qent *nq;
-       struct tseg_qent *te = NULL;
        struct socket *so = tp->t_inpcb->inp_socket;
-       char *s = NULL;
-       int flags;
-       struct tseg_qent tqs;
+       struct mbuf *mq, *mp;
+       int flags, wakeup;
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
@@ -178,6 +123,10 @@ tcp_reass(struct tcpcb *tp, struct tcphd
        if (th == NULL)
                goto present;
 
+       M_ASSERTPKTHDR(m);
+       KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
+           *tlenp, m->m_pkthdr.len));
+
        /*
         * Limit the number of segments that can be queued to reduce the
         * potential for mbuf exhaustion. For best performance, we want to be
@@ -188,19 +137,17 @@ tcp_reass(struct tcpcb *tp, struct tcphd
         * Always let the missing segment through which caused this queue.
         * NB: Access to the socket buffer is left intentionally unlocked as we
         * can tolerate stale information here.
-        *
-        * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
-        * should work but causes packets to be dropped when they shouldn't.
-        * Investigate why and re-evaluate the below limit after the behaviour
-        * is understood.
         */
        if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
-           tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
+           tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
+               char *s;
+
                V_tcp_reass_overflows++;
                TCPSTAT_INC(tcps_rcvmemdrop);
                m_freem(m);
                *tlenp = 0;
-               if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) 
{
+               if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+                   NULL))) {
                        log(LOG_DEBUG, "%s; %s: queue limit reached, "
                            "segment dropped\n", s, __func__);
                        free(s, M_TCPLOG);
@@ -209,46 +156,13 @@ tcp_reass(struct tcpcb *tp, struct tcphd
        }
 
        /*
-        * Allocate a new queue entry. If we can't, or hit the zone limit
-        * just drop the pkt.
-        *
-        * Use a temporary structure on the stack for the missing segment
-        * when the zone is exhausted. Otherwise we may get stuck.
-        */
-       te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT);
-       if (te == NULL) {
-               if (th->th_seq != tp->rcv_nxt || 
!TCPS_HAVEESTABLISHED(tp->t_state)) {
-                       TCPSTAT_INC(tcps_rcvmemdrop);
-                       m_freem(m);
-                       *tlenp = 0;
-                       if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
-                           NULL))) {
-                               log(LOG_DEBUG, "%s; %s: global zone limit "
-                                   "reached, segment dropped\n", s, __func__);
-                               free(s, M_TCPLOG);
-                       }
-                       return (0);
-               } else {
-                       bzero(&tqs, sizeof(struct tseg_qent));
-                       te = &tqs;
-                       if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
-                           NULL))) {
-                               log(LOG_DEBUG,
-                                   "%s; %s: global zone limit reached, using "
-                                   "stack for missing segment\n", s, __func__);
-                               free(s, M_TCPLOG);
-                       }
-               }
-       }
-       tp->t_segqlen++;
-
-       /*
         * Find a segment which begins after this one does.
         */
-       LIST_FOREACH(q, &tp->t_segq, tqe_q) {
-               if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
+       mp = NULL;
+       for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
+               if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
                        break;
-               p = q;
+               mp = mq;
        }
 
        /*
@@ -256,18 +170,16 @@ tcp_reass(struct tcpcb *tp, struct tcphd
         * our data already.  If so, drop the data from the incoming
         * segment.  If it provides all of our data, drop us.
         */
-       if (p != NULL) {
+       if (mp != NULL) {
                int i;
+
                /* conversion to int (in i) handles seq wraparound */
-               i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
+               i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
                if (i > 0) {
                        if (i >= *tlenp) {
                                TCPSTAT_INC(tcps_rcvduppack);
                                TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
                                m_freem(m);
-                               if (te != &tqs)
-                                       uma_zfree(V_tcp_reass_zone, te);
-                               tp->t_segqlen--;
                                /*
                                 * Try to present any queued data
                                 * at the left window edge to the user.
@@ -289,37 +201,40 @@ tcp_reass(struct tcpcb *tp, struct tcphd
         * While we overlap succeeding segments trim them or,
         * if they are completely covered, dequeue them.
         */
-       while (q) {
-               int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
+       while (mq) {
+               struct mbuf *nq;
+               int i;
+
+               i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
                if (i <= 0)
                        break;
-               if (i < q->tqe_len) {
-                       q->tqe_th->th_seq += i;
-                       q->tqe_len -= i;
-                       m_adj(q->tqe_m, i);
+               if (i < mq->m_pkthdr.len) {
+                       M_TCPHDR(mq)->th_seq += i;
+                       m_adj(mq, i);
+                       tp->t_segqlen -= i;
                        break;
                }
 
-               nq = LIST_NEXT(q, tqe_q);
-               LIST_REMOVE(q, tqe_q);
-               m_freem(q->tqe_m);
-               uma_zfree(V_tcp_reass_zone, q);
-               tp->t_segqlen--;
-               q = nq;
+               nq = mq->m_nextpkt;
+               tp->t_segqlen -= mq->m_pkthdr.len;
+               m_freem(mq);
+               if (mp)
+                       mp->m_nextpkt = nq;
+               else
+                       tp->t_segq = nq;
+               mq = nq;
        }
 
        /* Insert the new segment queue entry into place. */
-       te->tqe_m = m;
-       te->tqe_th = th;
-       te->tqe_len = *tlenp;
-
-       if (p == NULL) {
-               LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+       if (mp) {
+               m->m_nextpkt = mp->m_nextpkt;
+               mp->m_nextpkt = m;
        } else {
-               KASSERT(te != &tqs, ("%s: temporary stack based entry not "
-                   "first element in queue", __func__));
-               LIST_INSERT_AFTER(p, te, tqe_q);
+               m->m_nextpkt = tp->t_segq;
+               tp->t_segq = m ;
        }
+       m->m_pkthdr.pkt_tcphdr = th;
+       tp->t_segqlen += m->m_pkthdr.len;
 
 present:
        /*
@@ -328,25 +243,30 @@ present:
         */
        if (!TCPS_HAVEESTABLISHED(tp->t_state))
                return (0);
-       q = LIST_FIRST(&tp->t_segq);
-       if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
-               return (0);
+
+       flags = 0;
+       wakeup = 0;
        SOCKBUF_LOCK(&so->so_rcv);
-       do {
-               tp->rcv_nxt += q->tqe_len;
-               flags = q->tqe_th->th_flags & TH_FIN;
-               nq = LIST_NEXT(q, tqe_q);
-               LIST_REMOVE(q, tqe_q);
+       while ((mq = tp->t_segq) != NULL &&
+           M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
+               tp->t_segq = mq->m_nextpkt;
+
+               tp->rcv_nxt += mq->m_pkthdr.len;
+               tp->t_segqlen -= mq->m_pkthdr.len;
+               flags = M_TCPHDR(mq)->th_flags & TH_FIN;
+
                if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-                       m_freem(q->tqe_m);
-               else
-                       sbappendstream_locked(&so->so_rcv, q->tqe_m);
-               if (q != &tqs)
-                       uma_zfree(V_tcp_reass_zone, q);
-               tp->t_segqlen--;
-               q = nq;
-       } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
+                       m_freem(mq);
+               else {
+                       mq->m_nextpkt = NULL;
+                       sbappendstream_locked(&so->so_rcv, mq);
+                       wakeup = 1;
+               }
+       }
        ND6_HINT(tp);
-       sorwakeup_locked(so);
+       if (wakeup)
+               sorwakeup_locked(so);
+       else
+               SOCKBUF_UNLOCK(&so->so_rcv);
        return (flags);
 }

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Sun May  4 20:21:41 2014        (r265337)
+++ head/sys/netinet/tcp_subr.c Sun May  4 23:25:32 2014        (r265338)
@@ -375,7 +375,6 @@ tcp_init(void)
        tcp_tw_init();
        syncache_init();
        tcp_hc_init();
-       tcp_reass_init();
 
        TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
        V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
@@ -433,7 +432,6 @@ tcp_destroy(void)
 {
        int error;
 
-       tcp_reass_destroy();
        tcp_hc_destroy();
        syncache_destroy();
        tcp_tw_destroy();

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c       Sun May  4 20:21:41 2014        
(r265337)
+++ head/sys/netinet/tcp_usrreq.c       Sun May  4 23:25:32 2014        
(r265338)
@@ -1949,7 +1949,7 @@ db_print_tcpcb(struct tcpcb *tp, const c
 
        db_print_indent(indent);
        db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
-          LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
+          tp->t_segq, tp->t_segqlen, tp->t_dupacks);
 
        db_print_indent(indent);
        db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h  Sun May  4 20:21:41 2014        (r265337)
+++ head/sys/netinet/tcp_var.h  Sun May  4 23:25:32 2014        (r265338)
@@ -46,15 +46,6 @@ VNET_DECLARE(int, tcp_do_rfc1323);
 
 #endif /* _KERNEL */
 
-/* TCP segment queue entry */
-struct tseg_qent {
-       LIST_ENTRY(tseg_qent) tqe_q;
-       int     tqe_len;                /* TCP segment data length */
-       struct  tcphdr *tqe_th;         /* a pointer to tcp header */
-       struct  mbuf    *tqe_m;         /* mbuf contains packet */
-};
-LIST_HEAD(tsegqe_head, tseg_qent);
-
 struct sackblk {
        tcp_seq start;          /* start seq no. of sack block */
        tcp_seq end;            /* end seq no. */
@@ -100,7 +91,7 @@ do {                                                         
\
  * Organized for 16 byte cacheline efficiency.
  */
 struct tcpcb {
-       struct  tsegqe_head t_segq;     /* segment reassembly queue */
+       struct  mbuf *t_segq;           /* segment reassembly queue */
        void    *t_pspare[2];           /* new reassembly queue */
        int     t_segqlen;              /* segment reassembly queue length */
        int     t_dupacks;              /* consecutive dup acks recd */
@@ -663,11 +654,7 @@ char       *tcp_log_addrs(struct in_conninfo *
 char   *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
            const void *);
 int     tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
-void    tcp_reass_init(void);
 void    tcp_reass_flush(struct tcpcb *);
-#ifdef VIMAGE
-void    tcp_reass_destroy(void);
-#endif
 void    tcp_input(struct mbuf *, int);
 u_long  tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 u_long  tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h Sun May  4 20:21:41 2014        (r265337)
+++ head/sys/sys/mbuf.h Sun May  4 23:25:32 2014        (r265338)
@@ -159,6 +159,7 @@ struct pkthdr {
 #define        tso_segsz       PH_per.sixteen[1]
 #define        csum_phsum      PH_per.sixteen[2]
 #define        csum_data       PH_per.thirtytwo[1]
+#define        pkt_tcphdr      PH_loc.ptr
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to