Hi,

I would be grateful if someone could test the attached patch which
deals with the following problem:

        on all *BSD version, socket buffers contain a list of
        incoming and/or outgoing mbufs. Unfortunately the list only
        has a pointer to the head, meaning that all append operations
        require to scan the full list. The overhead can be very
        bad in some cases (e.g. small UDP packets), and becomes
        worse and worse as the socket buffer size increases (which
        is what one would commonly do when expecting a lot of
        traffic!).

The attached patch implements a tail pointer to the list, so that
you can append in constant time. By default, the code works exactly
as before -- the tail of the list is reached with the usual linear
scan, and the pointer to the tail is only used for comparison
purposes to make sure that it yields the same value.

If you enable the fast behaviour with

        sysctl -w kern.ipc.fastscan=1

then the new code takes over and linear scans are replaced by
dereferences of the tail pointer.

Apart from the obvious benefits of using O(1) instead of O(n)
algorithms, your mileage may vary.

When the socket buffer is almost always empty (fast receivers) then
you have no gain. When the socket buffer is almost always full you
also have no gain, because the decision to drop the packet only
requires a comparison. However, this code can really avoid trashing
in those cases where the queue size oscillates.

I'd like to commit this (or similar) code after proper testing,
so i'd like people to try it out -- I am reasonably confident
about it, and have done a fair amount of testing under heavy
udp load, but want to be sure that there are no side effects.

The attached patch applies to 4.2 (and hopefully to CURRENT
as well).

        cheers
        luigi


Index: sys/socketvar.h
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.46.2.4
diff -u -r1.46.2.4 socketvar.h
--- sys/socketvar.h     2000/11/26 02:30:08     1.46.2.4
+++ sys/socketvar.h     2001/02/08 00:58:31
@@ -93,6 +93,7 @@
                u_long  sb_mbmax;       /* max chars of mbufs to use */
                long    sb_lowat;       /* low water mark */
                struct  mbuf *sb_mb;    /* the mbuf chain */
+               struct  mbuf *sb_mb_tail; /* last pkt in chain (if sb_mb != NULL) */
                struct  selinfo sb_sel; /* process selecting read/write */
                short   sb_flags;       /* flags, see below */
                short   sb_timeo;       /* timeout for read/write */
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.68.2.10
diff -u -r1.68.2.10 uipc_socket.c
--- kern/uipc_socket.c  2000/11/17 19:47:27     1.68.2.10
+++ kern/uipc_socket.c  2001/02/08 01:40:04
@@ -772,6 +772,12 @@
                goto restart;
        }
 dontblock:
+       /* 
+        * On entry here, m points to the first record on the socket buffer.
+        * While we process the initial mbufs containing address and control
+        * info we save a copy of m->m_nextpkt into nextrecord. We do need
+        * to take care of sb_mb_tail until later.
+        */
        if (uio->uio_procp)
                uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
        nextrecord = m->m_nextpkt;
@@ -815,9 +821,17 @@
                        controlp = &(*controlp)->m_next;
                }
        }
+       /*
+        * If m is non-null, we have some data to read. From now on, make
+        * sure to keep sb_mb_tail consistent when working on the last
+        * packet on the chain (nextrecord==NULL) and we change m->m_nextpkt.
+        */
        if (m) {
-               if ((flags & MSG_PEEK) == 0)
+               if ((flags & MSG_PEEK) == 0) {
                        m->m_nextpkt = nextrecord;
+                       if (nextrecord == NULL)
+                               so->so_rcv.sb_mb_tail = m ;
+               }
                type = m->m_type;
                if (type == MT_OOBDATA)
                        flags |= MSG_OOB;
@@ -873,8 +887,11 @@
                                        MFREE(m, so->so_rcv.sb_mb);
                                        m = so->so_rcv.sb_mb;
                                }
-                               if (m)
+                               if (m) {
                                        m->m_nextpkt = nextrecord;
+                                       if (nextrecord == NULL)
+                                               so->so_rcv.sb_mb_tail = m ;
+                               }
                        }
                } else {
                        if (flags & MSG_PEEK)
@@ -931,8 +948,11 @@
                        (void) sbdroprecord(&so->so_rcv);
        }
        if ((flags & MSG_PEEK) == 0) {
-               if (m == 0)
+               if (m == 0) {
                        so->so_rcv.sb_mb = nextrecord;
+                       if (nextrecord == NULL || nextrecord->m_nextpkt == NULL)
+                               so->so_rcv.sb_mb_tail = nextrecord ;
+               }
                if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
                        (*pr->pr_usrreqs->pru_rcvd)(so, flags);
        }
Index: kern/uipc_socket2.c
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.55.2.7
diff -u -r1.55.2.7 uipc_socket2.c
--- kern/uipc_socket2.c 2000/09/07 19:13:37     1.55.2.7
+++ kern/uipc_socket2.c 2001/02/08 01:44:53
@@ -55,6 +55,8 @@
 
 int    maxsockets;
 
+int fastscan; /* XXX see below */
+
 /*
  * Primitive routines for operating on sockets and socket buffers
  */
@@ -477,7 +479,51 @@
  * or sbdroprecord() when the data is acknowledged by the peer.
  */
 
+static struct mbuf *
+sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0);
 /*
+ * sbgettail returns a pointer to the last record of the socketbuffer.
+ * If m0 is non-null, it also appends m0 to the chain.
+ */
+static struct mbuf *
+sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0)
+{
+    struct mbuf *m = sb->sb_mb;
+
+    if (m == NULL)
+       goto done;
+    if (sb->sb_mb_tail == NULL)
+       printf("%s: null tail\n", msg);
+    if (fastscan && sb->sb_mb_tail != NULL) {
+       m = sb->sb_mb_tail ;
+       if (m == NULL)
+           panic ("sbgettail returns NULL");
+       if (m->m_nextpkt == NULL) /* ok ... */
+           goto done;
+       /* otherwise continue scan */
+       printf("%s: sbgettail m_nextpkt != NULL\n", msg);
+    }
+    while (m->m_nextpkt)
+       m = m->m_nextpkt ;
+    if (m != sb->sb_mb_tail) {
+       if (sb->sb_mb_tail != NULL)
+           printf("%s: bad tail 0x%p instead of 0x%p\n",
+               msg, sb->sb_mb_tail, m);
+       sb->sb_mb_tail = m ;
+    }
+done:
+    if (m0) {
+       if (m)
+           m->m_nextpkt = m0;
+       else
+           sb->sb_mb = m0;
+       sb->sb_mb_tail = m0 ;
+       m = m0 ;
+    }
+    return m ;
+}
+
+/*
  * Append mbuf chain m to the last record in the
  * socket buffer sb.  The additional space associated
  * the mbuf chain is recorded in sb.  Empty mbufs are
@@ -492,10 +538,8 @@
 
        if (m == 0)
                return;
-       n = sb->sb_mb;
+       n = sbgettail(sb, "sbappend", NULL);
        if (n) {
-               while (n->m_nextpkt)
-                       n = n->m_nextpkt;
                do {
                        if (n->m_flags & M_EOR) {
                                sbappendrecord(sb, m); /* XXXXXX!!!! */
@@ -545,19 +589,13 @@
 
        if (m0 == 0)
                return;
-       m = sb->sb_mb;
-       if (m)
-               while (m->m_nextpkt)
-                       m = m->m_nextpkt;
        /*
         * Put the first mbuf on the queue.
         * Note this permits zero length records.
         */
        sballoc(sb, m0);
-       if (m)
-               m->m_nextpkt = m0;
-       else
-               sb->sb_mb = m0;
+       m = sbgettail(sb, "sbappendrecord", m0);
+       if (m0->m_nextpkt != NULL) printf("ouch! sbappendrecord nextpkt!=NULL\n");
        m = m0->m_next;
        m0->m_next = 0;
        if (m && (m0->m_flags & M_EOR)) {
@@ -603,6 +641,8 @@
         */
        sballoc(sb, m0);
        m0->m_nextpkt = *mp;
+       if (*mp == NULL) /* m0 is actually the new tail */
+               sb->sb_mb_tail = m0 ;
        *mp = m0;
        m = m0->m_next;
        m0->m_next = 0;
@@ -653,13 +693,7 @@
        m->m_next = control;
        for (n = m; n; n = n->m_next)
                sballoc(sb, n);
-       n = sb->sb_mb;
-       if (n) {
-               while (n->m_nextpkt)
-                       n = n->m_nextpkt;
-               n->m_nextpkt = m;
-       } else
-               sb->sb_mb = m;
+       n = sbgettail(sb, "sbappendaddr", m);
        return (1);
 }
 
@@ -686,13 +720,7 @@
        n->m_next = m0;                 /* concatenate data to control */
        for (m = control; m; m = m->m_next)
                sballoc(sb, m);
-       n = sb->sb_mb;
-       if (n) {
-               while (n->m_nextpkt)
-                       n = n->m_nextpkt;
-               n->m_nextpkt = control;
-       } else
-               sb->sb_mb = control;
+       n = sbgettail(sb, "sbappendcontrol", control);
        return (1);
 }
 
@@ -731,7 +759,7 @@
                if (n)
                        n->m_next = m;
                else
-                       sb->sb_mb = m;
+                       sb->sb_mb = sb->sb_mb_tail = m;
                sballoc(sb, m);
                n = m;
                m->m_flags &= ~M_EOR;
@@ -1003,6 +1031,8 @@
 SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
     &sb_efficiency, 0, "");
 
+SYSCTL_INT(_kern_ipc, OID_AUTO, fastscan, CTLFLAG_RW,
+    &fastscan, 0, "Fast scanning of socket queues for append");
 /*
  * Initialise maxsockets 
  */


To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-net" in the body of the message

Reply via email to