The "qemu-slirp-performance" patch contains three improvements to qemu
slirp networking performance.  Booting my virtual machine (which
NFS-mounts its root filesystem from the host) has been accelerated by
8x, from over 5 minutes to 40 seconds.  TCP throughput has been
accelerated from about 2 megabytes/sec to 9 megabytes/sec, in both
directions (measured using a simple python script).  The system is
subjectively more responsive (for activities such as logging in or
running simple python scripts).

The specific problems fixed are:

   - the mss for the slirp-to-vm direction was 512 bytes (now 1460);
   - qemu would block in select() for up to four milliseconds at a
time, even when data was waiting on slirp sockets;
   - slirp was deliberately delaying acks until timer expiration
(TF_DELACK), preventing the vm from opening its send window, in
violation of rfc2581.

These fixes are together in one patch (qemu-slirp-performance.patch).

I have also attached some related patches that fix fairly serious
slirp bugs for IP datagrams larger than 4k.  Before these patches,
large packets can corrupt the heap or get reassembled in some
entertaining but incorrect orders (because ip_off was being sorted as
though it was signed!)  These patches are attached in the order I
apply them.

I hope they are helpful.  If there's anything I can do to make them
more likely to be accepted into the mainline, please let me know.

Thanks,
    -Ken
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/mbuf.c 
qemu-snapshot-2006-03-27_23/slirp/mbuf.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/mbuf.c       2004-04-22 
00:10:47.000000000 +0000
+++ qemu-snapshot-2006-03-27_23/slirp/mbuf.c    2006-04-05 13:03:03.000000000 
+0000
@@ -146,18 +146,19 @@
         struct mbuf *m;
         int size;
 {
+       int datasize;
+
        /* some compiles throw up on gotos.  This one we can fake. */
         if(m->m_size>size) return;
 
         if (m->m_flags & M_EXT) {
-         /* datasize = m->m_data - m->m_ext; */
+         datasize = m->m_data - m->m_ext;
          m->m_ext = (char *)realloc(m->m_ext,size);
 /*             if (m->m_ext == NULL)
  *                     return (struct mbuf *)NULL;
  */            
-         /* m->m_data = m->m_ext + datasize; */
+         m->m_data = m->m_ext + datasize;
         } else {
-         int datasize;
          char *dat;
          datasize = m->m_data - m->m_dat;
          dat = (char *)malloc(size);




diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c 
qemu-snapshot-2006-03-27_23/slirp/ip_input.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c   2004-04-22 
00:10:47.000000000 +0000
+++ qemu-snapshot-2006-03-27_23/slirp/ip_input.c        2006-04-06 
06:02:52.000000000 +0000
@@ -344,8 +344,8 @@
        while (q != (struct ipasfrag *)fp) {
          struct mbuf *t;
          t = dtom(q);
-         m_cat(m, t);
          q = (struct ipasfrag *) q->ipf_next;
+         m_cat(m, t);
        }
 
        /*




diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/ip.h 
qemu-snapshot-2006-03-27_23/slirp/ip.h
--- qemu-snapshot-2006-03-27_23.orig/slirp/ip.h 2004-04-21 17:10:47.000000000 
-0700
+++ qemu-snapshot-2006-03-27_23/slirp/ip.h      2006-04-06 00:28:49.000000000 
-0700
@@ -79,6 +79,11 @@
  * We declare ip_len and ip_off to be short, rather than u_short
  * pragmatically since otherwise unsigned comparisons can result
  * against negative integers quite easily, and fail in subtle ways.
+ *
+ * The only problem with the above theory is that these quantities
+ * are in fact unsigned, and sorting fragments by a signed version
+ * of ip_off doesn't work very well, nor does length checks on
+ * ip packets with a signed version of their length!
  */
 struct ip {
 #ifdef WORDS_BIGENDIAN
@@ -101,6 +106,9 @@
        struct  in_addr ip_src,ip_dst;  /* source and dest address */
 };
 
+#define IP_OFF(ip) (*(u_int16_t *)&((ip)->ip_off))
+#define IP_LEN(ip) (*(u_int16_t *)&((ip)->ip_len))
+
 #define        IP_MAXPACKET    65535           /* maximum packet size */
 
 /*
diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c 
qemu-snapshot-2006-03-27_23/slirp/ip_input.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c   2004-04-21 
17:10:47.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/ip_input.c        2006-04-06 
00:32:19.000000000 -0700
@@ -111,7 +111,7 @@
         * Convert fields to host representation.
         */
        NTOHS(ip->ip_len);
-       if (ip->ip_len < hlen) {
+       if (IP_LEN(ip) < hlen) {
                ipstat.ips_badlen++;
                goto bad;
        }
@@ -124,13 +124,13 @@
         * Trim mbufs if longer than we expect.
         * Drop packet if shorter than we expect.
         */
-       if (m->m_len < ip->ip_len) {
+       if (m->m_len < IP_LEN(ip)) {
                ipstat.ips_tooshort++;
                goto bad;
        }
        /* Should drop packet if mbuf too long? hmmm... */
-       if (m->m_len > ip->ip_len)
-          m_adj(m, ip->ip_len - m->m_len);
+       if (m->m_len > IP_LEN(ip))
+          m_adj(m, IP_LEN(ip) - m->m_len);
 
        /* check ip_ttl for a correct ICMP reply */
        if(ip->ip_ttl==0 || ip->ip_ttl==1) {
@@ -191,7 +191,7 @@
                 * or if this is not the first fragment,
                 * attempt reassembly; if it succeeds, proceed.
                 */
-               if (((struct ipasfrag *)ip)->ipf_mff & 1 || ip->ip_off) {
+               if (((struct ipasfrag *)ip)->ipf_mff & 1 || IP_OFF(ip)) {
                        ipstat.ips_fragments++;
                        ip = ip_reass((struct ipasfrag *)ip, fp);
                        if (ip == 0)
@@ -281,7 +281,7 @@
         */
        for (q = (struct ipasfrag *)fp->ipq_next; q != (struct ipasfrag *)fp;
            q = (struct ipasfrag *)q->ipf_next)
-               if (q->ip_off > ip->ip_off)
+               if (IP_OFF(q) > IP_OFF(ip))
                        break;
 
        /*
@@ -290,10 +290,10 @@
         * segment.  If it provides all of our data, drop us.
         */
        if (q->ipf_prev != (ipasfragp_32)fp) {
-               i = ((struct ipasfrag *)(q->ipf_prev))->ip_off +
-                 ((struct ipasfrag *)(q->ipf_prev))->ip_len - ip->ip_off;
+               i = IP_OFF((struct ipasfrag *)(q->ipf_prev)) +
+                 IP_LEN((struct ipasfrag *)(q->ipf_prev)) - IP_OFF(ip);
                if (i > 0) {
-                       if (i >= ip->ip_len)
+                       if (i >= IP_LEN(ip))
                                goto dropfrag;
                        m_adj(dtom(ip), i);
                        ip->ip_off += i;
@@ -305,9 +305,9 @@
         * While we overlap succeeding segments trim them or,
         * if they are completely covered, dequeue them.
         */
-       while (q != (struct ipasfrag *)fp && ip->ip_off + ip->ip_len > 
q->ip_off) {
-               i = (ip->ip_off + ip->ip_len) - q->ip_off;
-               if (i < q->ip_len) {
+       while (q != (struct ipasfrag *)fp && IP_OFF(ip) + IP_LEN(ip) > 
IP_OFF(q)) {
+               i = (IP_OFF(ip) + IP_LEN(ip)) - IP_OFF(q);
+               if (i < IP_LEN(q)) {
                        q->ip_len -= i;
                        q->ip_off += i;
                        m_adj(dtom(q), i);
@@ -327,9 +327,9 @@
        next = 0;
        for (q = (struct ipasfrag *) fp->ipq_next; q != (struct ipasfrag *)fp;
             q = (struct ipasfrag *) q->ipf_next) {
-               if (q->ip_off != next)
+               if (IP_OFF(q) != next)
                        return (0);
-               next += q->ip_len;
+               next += IP_LEN(q);
        }
        if (((struct ipasfrag *)(q->ipf_prev))->ipf_mff & 1)
                return (0);
diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/udp.c 
qemu-snapshot-2006-03-27_23/slirp/udp.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/udp.c        2006-04-06 
00:24:30.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/udp.c     2006-04-06 00:32:55.000000000 
-0700
@@ -111,12 +111,12 @@
         */
        len = ntohs((u_int16_t)uh->uh_ulen);
 
-       if (ip->ip_len != len) {
-               if (len > ip->ip_len) {
+       if (IP_LEN(ip) != len) {
+               if (len > IP_LEN(ip)) {
                        udpstat.udps_badlen++;
                        goto bad;
                }
-               m_adj(m, len - ip->ip_len);
+               m_adj(m, len - IP_LEN(ip));
                ip->ip_len = len;
        }
        




diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/tcp.h 
qemu-snapshot-2006-03-27_23/slirp/tcp.h
--- qemu-snapshot-2006-03-27_23.orig/slirp/tcp.h        2004-04-21 
17:10:47.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/tcp.h     2006-04-11 15:22:05.000000000 
-0700
@@ -100,8 +100,10 @@
  * With an IP MSS of 576, this is 536,
  * but 512 is probably more convenient.
  * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
+ *
+ * We make this 1460 because we only care about Ethernet in the qemu context.
  */
-#define        TCP_MSS 512
+#define        TCP_MSS 1460
 
 #define        TCP_MAXWIN      65535   /* largest value for (unscaled) window 
*/
 
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/tcp_input.c 
qemu-snapshot-2006-03-27_23/slirp/tcp_input.c
--- qemu-snapshot-2006-03-27_23.orig/slirp/tcp_input.c  2004-10-07 
16:27:35.000000000 -0700
+++ qemu-snapshot-2006-03-27_23/slirp/tcp_input.c       2006-04-11 
15:22:05.000000000 -0700
@@ -580,28 +580,11 @@
                         *      congestion avoidance sender won't send more 
until
                         *      he gets an ACK.
                         * 
-                        * Here are 3 interpretations of what should happen.
-                        * The best (for me) is to delay-ack everything except
-                        * if it's a one-byte packet containing an ESC
-                        * (this means it's an arrow key (or similar) sent using
-                        * Nagel, hence there will be no echo)
-                        * The first of these is the original, the second is the
-                        * middle ground between the other 2
+                        * It is better to not delay acks at all to maximize
+                        * TCP throughput.  See RFC 2581.
                         */ 
-/*                     if (((unsigned)ti->ti_len < tp->t_maxseg)) {
- */                         
-/*                     if (((unsigned)ti->ti_len < tp->t_maxseg && 
- *                          (so->so_iptos & IPTOS_LOWDELAY) == 0) ||
- *                         ((so->so_iptos & IPTOS_LOWDELAY) && 
- *                          ((struct tcpiphdr_2 *)ti)->first_char == 
(char)27)) {
- */
-                       if ((unsigned)ti->ti_len == 1 &&
-                           ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
-                               tp->t_flags |= TF_ACKNOW;
-                               tcp_output(tp);
-                       } else {
-                               tp->t_flags |= TF_DELACK;
-                       }
+                       tp->t_flags |= TF_ACKNOW;
+                       tcp_output(tp);
                        return;
                }
        } /* header prediction */
diff -BurN qemu-snapshot-2006-03-27_23.orig/vl.c 
qemu-snapshot-2006-03-27_23/vl.c
--- qemu-snapshot-2006-03-27_23.orig/vl.c       2006-04-11 15:21:27.000000000 
-0700
+++ qemu-snapshot-2006-03-27_23/vl.c    2006-04-11 15:22:05.000000000 -0700
@@ -4026,7 +4026,7 @@
 void main_loop_wait(int timeout)
 {
     IOHandlerRecord *ioh, *ioh_next;
-    fd_set rfds, wfds;
+    fd_set rfds, wfds, xfds;
     int ret, nfds;
     struct timeval tv;
 
@@ -4041,6 +4041,7 @@
     nfds = -1;
     FD_ZERO(&rfds);
     FD_ZERO(&wfds);
+    FD_ZERO(&xfds);
     for(ioh = first_io_handler; ioh != NULL; ioh = ioh->next) {
         if (ioh->fd_read &&
             (!ioh->fd_read_poll ||
@@ -4062,7 +4063,12 @@
 #else
     tv.tv_usec = timeout * 1000;
 #endif
-    ret = select(nfds + 1, &rfds, &wfds, NULL, &tv);
+#if defined(CONFIG_SLIRP)
+    if (slirp_inited) {
+        slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
+    }
+#endif
+    ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
     if (ret > 0) {
         /* XXX: better handling of removal */
         for(ioh = first_io_handler; ioh != NULL; ioh = ioh_next) {




_______________________________________________
Qemu-devel mailing list
Qemu-devel@nongnu.org
http://lists.nongnu.org/mailman/listinfo/qemu-devel

Reply via email to