In article <local.mail.freebsd-net/[EMAIL PROTECTED]> you
write:
>
>I would be grateful if someone could test the attached patch which
>deals with the following problem:
>
> on all *BSD version, socket buffers contain a list of
> incoming and/or outgoing mbufs. Unfortunately the list only
> has a pointer to the head, meaning that all append operations
> require to scan the full list. The overhead can be very
> bad in some cases (e.g. small UDP packets), and becomes
> worse and worse as the socket buffer size increases (which
> is what one would commonly do when expecting a lot of
> traffic!).
>
>The attached patch implements a tail pointer to the list, so that
>you can append in constant time. By default, the code works exactly
>as before -- the tail of the list is reached with the usual linear
>scan, and the pointer to the tail is only used for comparison
>purposes to make sure that it yields the same value.
Aside from the obvious style bugs, and debugging cruft, I have a couple
of issues with this patch:
- I dislike having sb_mb_tail only being valid if sb_mb is NULL;
to me, this is non-obvious, and I feel it would be cleaner to
always make it valid
- sbgettail() is misnamed, it should reflect the function that
it is being used for in most cases (appending the new mbuf to
an existing chain)
- I don't think that the fastscan sysctl is appropriate; either
the code works, or it should be debugged further before committing.
You could move the debugging checks under a KASSERT though.
- Also, I believe that there may be a problem with the original
patch, in that the sb_mb_tail is not updated in sbdrop(), in
the case where we drop partial mbufs from the first packet
train.
In that spirit, I offer an amended patch below.
--
Jonathan
Index: kern/uipc_socket.c
===================================================================
RCS file: /ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.68.2.11
diff -u -r1.68.2.11 uipc_socket.c
--- kern/uipc_socket.c 2000/12/22 10:25:21 1.68.2.11
+++ kern/uipc_socket.c 2001/02/09 23:57:14
@@ -778,6 +778,11 @@
goto restart;
}
dontblock:
+ /*
+ * On entry here, m points to the first record on the socket buffer.
+ * While we process the initial mbufs containing address and control
+ * info we save a copy of m->m_nextpkt into nextrecord.
+ */
if (uio->uio_procp)
uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
nextrecord = m->m_nextpkt;
@@ -821,6 +826,18 @@
controlp = &(*controlp)->m_next;
}
}
+ /*
+ * if nextrecord == NULL (this is a single chain) then m_tail
+ * may not be valid here if m was changed earlier.
+ */
+ if (nextrecord == NULL && (flags & MSG_PEEK) == 0)
+ so->so_rcv.sb_mb_tail = m;
+
+ /*
+ * If m is non-null, we have some data to read. From now on, make
+ * sure to keep sb_mb_tail consistent when working on the last
+ * packet on the chain (nextrecord==NULL) and we change m->m_nextpkt.
+ */
if (m) {
if ((flags & MSG_PEEK) == 0)
m->m_nextpkt = nextrecord;
@@ -881,6 +898,8 @@
}
if (m)
m->m_nextpkt = nextrecord;
+ if (nextrecord == NULL)
+ so->so_rcv.sb_mb_tail = m;
}
} else {
if (flags & MSG_PEEK)
@@ -937,8 +956,11 @@
(void) sbdroprecord(&so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
- if (m == 0)
+ if (m == 0) {
so->so_rcv.sb_mb = nextrecord;
+ if (nextrecord == NULL || nextrecord->m_nextpkt == NULL)
+ so->so_rcv.sb_mb_tail = nextrecord;
+ }
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
}
Index: kern/uipc_socket2.c
===================================================================
RCS file: /ncvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.55.2.8
diff -u -r1.55.2.8 uipc_socket2.c
--- kern/uipc_socket2.c 2001/02/04 14:49:45 1.55.2.8
+++ kern/uipc_socket2.c 2001/02/09 23:59:16
@@ -63,6 +65,9 @@
static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+static int sbtailchk(struct sockbuf *sb);
+static __inline void sbappendmbuf(struct sockbuf *sb, struct mbuf *m0);
+
/*
* Procedures to manipulate state flags of socket
* and do appropriate wakeups. Normal sequence from the
@@ -477,6 +482,29 @@
* or sbdroprecord() when the data is acknowledged by the peer.
*/
+static int
+sbtailchk(struct sockbuf *sb)
+{
+ struct mbuf *m = sb->sb_mb;
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+ return (m == sb->sb_mb_tail);
+}
+
+static __inline void
+sbappendmbuf(struct sockbuf *sb, struct mbuf *m0)
+{
+ struct mbuf *m = sb->sb_mb_tail;
+
+ KASSERT(sbtailchk(sb), ("sbappendmbuf: bad sb_mb_tail"));
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ sb->sb_mb_tail = m0;
+}
+
/*
* Append mbuf chain m to the last record in the
* socket buffer sb. The additional space associated
@@ -492,10 +520,9 @@
if (m == 0)
return;
- n = sb->sb_mb;
+ KASSERT(sbtailchk(sb), ("sbappend: bad sb_mb_tail"));
+ n = sb->sb_mb_tail;
if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
do {
if (n->m_flags & M_EOR) {
sbappendrecord(sb, m); /* XXXXXX!!!! */
@@ -545,19 +572,12 @@
if (m0 == 0)
return;
- m = sb->sb_mb;
- if (m)
- while (m->m_nextpkt)
- m = m->m_nextpkt;
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
- if (m)
- m->m_nextpkt = m0;
- else
- sb->sb_mb = m0;
+ sbappendmbuf(sb, m0);
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
@@ -603,6 +623,8 @@
*/
sballoc(sb, m0);
m0->m_nextpkt = *mp;
+ if (*mp == NULL) /* m0 is actually the new tail */
+ sb->sb_mb_tail = m0;
*mp = m0;
m = m0->m_next;
m0->m_next = 0;
@@ -653,13 +675,7 @@
m->m_next = control;
for (n = m; n; n = n->m_next)
sballoc(sb, n);
- n = sb->sb_mb;
- if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
- n->m_nextpkt = m;
- } else
- sb->sb_mb = m;
+ sbappendmbuf(sb, m);
return (1);
}
@@ -686,13 +702,7 @@
n->m_next = m0; /* concatenate data to control */
for (m = control; m; m = m->m_next)
sballoc(sb, m);
- n = sb->sb_mb;
- if (n) {
- while (n->m_nextpkt)
- n = n->m_nextpkt;
- n->m_nextpkt = control;
- } else
- sb->sb_mb = control;
+ sbappendmbuf(sb, control);
return (1);
}
@@ -733,7 +743,7 @@
if (n)
n->m_next = m;
else
- sb->sb_mb = m;
+ sb->sb_mb = sb->sb_mb_tail = m;
sballoc(sb, m);
n = m;
m->m_flags &= ~M_EOR;
@@ -813,6 +823,9 @@
m->m_nextpkt = next;
} else
sb->sb_mb = next;
+ m = sb->sb_mb;
+ if (m == NULL || m->m_nextpkt == NULL)
+ sb->sb_mb_tail = m;
}
/*
@@ -833,6 +846,9 @@
MFREE(m, mn);
m = mn;
} while (m);
+ m = sb->sb_mb;
+ if (m == NULL || m->m_nextpkt == NULL)
+ sb->sb_mb_tail = m;
}
}
Index: sys/socketvar.h
===================================================================
RCS file: /ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.46.2.4
diff -u -r1.46.2.4 socketvar.h
--- sys/socketvar.h 2000/11/26 02:30:08 1.46.2.4
+++ sys/socketvar.h 2001/02/08 17:55:46
@@ -93,6 +93,7 @@
u_long sb_mbmax; /* max chars of mbufs to use */
long sb_lowat; /* low water mark */
struct mbuf *sb_mb; /* the mbuf chain */
+ struct mbuf *sb_mb_tail; /* last pkt in chain */
struct selinfo sb_sel; /* process selecting read/write */
short sb_flags; /* flags, see below */
short sb_timeo; /* timeout for read/write */
To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-net" in the body of the message