Hi,
I would be grateful if someone could test the attached patch which
deals with the following problem:
on all *BSD version, socket buffers contain a list of
incoming and/or outgoing mbufs. Unfortunately the list only
has a pointer to the head, meaning that all append operations
require to scan the full list. The overhead can be very
bad in some cases (e.g. small UDP packets), and becomes
worse and worse as the socket buffer size increases (which
is what one would commonly do when expecting a lot of
traffic!).
The attached patch implements a tail pointer to the list, so that
you can append in constant time. By default, the code works exactly
as before -- the tail of the list is reached with the usual linear
scan, and the pointer to the tail is only used for comparison
purposes to make sure that it yields the same value.
If you enable the fast behaviour with
sysctl -w kern.ipc.fastscan=1
then the new code takes over and linear scans are replaced by
dereferences of the tail pointer.
Apart from the obvious benefits of using O(1) instead of O(n)
algorithms, your mileage may vary.
When the socket buffer is almost always empty (fast receivers) then
you have no gain. When the socket buffer is almost always full you
also have no gain, because the decision to drop the packet only
requires a comparison. However, this code can really avoid trashing
in those cases where the queue size oscillates.
I'd like to commit this (or similar) code after proper testing,
so i'd like people to try it out -- I am reasonably confident
about it, and have done a fair amount of testing under heavy
udp load, but want to be sure that there are no side effects.
The attached patch applies to 4.2 (and hopefully to CURRENT
as well).
cheers
luigi
Index: sys/socketvar.h
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.46.2.4
diff -u -r1.46.2.4 socketvar.h
--- sys/socketvar.h 2000/11/26 02:30:08 1.46.2.4
+++ sys/socketvar.h 2001/02/08 00:58:31
@@ -93,6 +93,7 @@
u_long sb_mbmax; /* max chars of mbufs to use */
long sb_lowat; /* low water mark */
struct mbuf *sb_mb; /* the mbuf chain */
+ struct mbuf *sb_mb_tail; /* last pkt in chain (if sb_mb != NULL) */
struct selinfo sb_sel; /* process selecting read/write */
short sb_flags; /* flags, see below */
short sb_timeo; /* timeout for read/write */
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.68.2.10
diff -u -r1.68.2.10 uipc_socket.c
--- kern/uipc_socket.c 2000/11/17 19:47:27 1.68.2.10
+++ kern/uipc_socket.c 2001/02/08 01:40:04
@@ -772,6 +772,12 @@
goto restart;
}
dontblock:
+ /*
+ * On entry here, m points to the first record on the socket buffer.
+ * While we process the initial mbufs containing address and control
+ * info we save a copy of m->m_nextpkt into nextrecord. We do need
+ * to take care of sb_mb_tail until later.
+ */
if (uio->uio_procp)
uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
nextrecord = m->m_nextpkt;
@@ -815,9 +821,17 @@
controlp = &(*controlp)->m_next;
}
}
+ /*
+ * If m is non-null, we have some data to read. From now on, make
+ * sure to keep sb_mb_tail consistent when working on the last
+ * packet on the chain (nextrecord==NULL) and we change m->m_nextpkt.
+ */
if (m) {
- if ((flags & MSG_PEEK) == 0)
+ if ((flags & MSG_PEEK) == 0) {
m->m_nextpkt = nextrecord;
+ if (nextrecord == NULL)
+ so->so_rcv.sb_mb_tail = m ;
+ }
type = m->m_type;
if (type == MT_OOBDATA)
flags |= MSG_OOB;
@@ -873,8 +887,11 @@
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
- if (m)
+ if (m) {
m->m_nextpkt = nextrecord;
+ if (nextrecord == NULL)
+ so->so_rcv.sb_mb_tail = m ;
+ }
}
} else {
if (flags & MSG_PEEK)
@@ -931,8 +948,11 @@
(void) sbdroprecord(&so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
- if (m == 0)
+ if (m == 0) {
so->so_rcv.sb_mb = nextrecord;
+ if (nextrecord == NULL || nextrecord->m_nextpkt == NULL)
+ so->so_rcv.sb_mb_tail = nextrecord ;
+ }
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
}
Index: kern/uipc_socket2.c
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.55.2.7
diff -u -r1.55.2.7 uipc_socket2.c
--- kern/uipc_socket2.c 2000/09/07 19:13:37 1.55.2.7
+++ kern/uipc_socket2.c 2001/02/08 01:44:53
@@ -55,6 +55,8 @@
int maxsockets;
+int fastscan; /* XXX see below */
+
/*
* Primitive routines for operating on sockets and socket buffers
*/
@@ -477,7 +479,51 @@
* or sbdroprecord() when the data is acknowledged by the peer.
*/
+static struct mbuf *
+sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0);
/*
+ * sbgettail returns a pointer to the last record of the socketbuffer.
+ * If m0 is non-null, it also appends m0 to the chain.
+ */
+static struct mbuf *
+sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0)
+{
+ struct mbuf *m = sb->sb_mb;
+
+ if (m == NULL)
+ goto done;
+ if (sb->sb_mb_tail == NULL)
+ printf("%s: null tail\n", msg);
+ if (fastscan && sb->sb_mb_tail != NULL) {
+ m = sb->sb_mb_tail ;
+ if (m == NULL)
+ panic ("sbgettail returns NULL");
+ if (m->m_nextpkt == NULL) /* ok ... */
+ goto done;
+ /* otherwise continue scan */
+ printf("%s: sbgettail m_nextpkt != NULL\n", msg);
+ }
+ while (m->m_nextpkt)
+ m = m->m_nextpkt ;
+ if (m != sb->sb_mb_tail) {
+ if (sb->sb_mb_tail != NULL)
+ printf("%s: bad tail 0x%p instead of 0x%p\n",
+ msg, sb->sb_mb_tail, m);
+ sb->sb_mb_tail = m ;
+ }
+done:
+ if (m0) {
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ sb->sb_mb_tail = m0 ;
+ m = m0 ;
+ }
+ return m ;
+}
+
+/*
* Append mbuf chain m to the last record in the
* socket buffer sb. The additional space associated
* the mbuf chain is recorded in sb. Empty mbufs are
@@ -492,10 +538,8 @@
if (m == 0)
return;
- n = sb->sb_mb;
+ n = sbgettail(sb, "sbappend", NULL);
if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
do {
if (n->m_flags & M_EOR) {
sbappendrecord(sb, m); /* XXXXXX!!!! */
@@ -545,19 +589,13 @@
if (m0 == 0)
return;
- m = sb->sb_mb;
- if (m)
- while (m->m_nextpkt)
- m = m->m_nextpkt;
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
- if (m)
- m->m_nextpkt = m0;
- else
- sb->sb_mb = m0;
+ m = sbgettail(sb, "sbappendrecord", m0);
+ if (m0->m_nextpkt != NULL) printf("ouch! sbappendrecord nextpkt!=NULL\n");
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
@@ -603,6 +641,8 @@
*/
sballoc(sb, m0);
m0->m_nextpkt = *mp;
+ if (*mp == NULL) /* m0 is actually the new tail */
+ sb->sb_mb_tail = m0 ;
*mp = m0;
m = m0->m_next;
m0->m_next = 0;
@@ -653,13 +693,7 @@
m->m_next = control;
for (n = m; n; n = n->m_next)
sballoc(sb, n);
- n = sb->sb_mb;
- if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
- n->m_nextpkt = m;
- } else
- sb->sb_mb = m;
+ n = sbgettail(sb, "sbappendaddr", m);
return (1);
}
@@ -686,13 +720,7 @@
n->m_next = m0; /* concatenate data to control */
for (m = control; m; m = m->m_next)
sballoc(sb, m);
- n = sb->sb_mb;
- if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
- n->m_nextpkt = control;
- } else
- sb->sb_mb = control;
+ n = sbgettail(sb, "sbappendcontrol", control);
return (1);
}
@@ -731,7 +759,7 @@
if (n)
n->m_next = m;
else
- sb->sb_mb = m;
+ sb->sb_mb = sb->sb_mb_tail = m;
sballoc(sb, m);
n = m;
m->m_flags &= ~M_EOR;
@@ -1003,6 +1031,8 @@
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
&sb_efficiency, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, fastscan, CTLFLAG_RW,
+ &fastscan, 0, "Fast scanning of socket queues for append");
/*
* Initialise maxsockets
*/
To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-net" in the body of the message