The "qemu-slirp-performance" patch contains three improvements to qemu slirp networking performance. Booting my virtual machine (which NFS-mounts its root filesystem from the host) has been accelerated by 8x, from over 5 minutes to 40 seconds. TCP throughput has been accelerated from about 2 megabytes/sec to 9 megabytes/sec, in both directions (measured using a simple Python script). The system is subjectively more responsive (for activities such as logging in or running simple Python scripts).
The specific problems fixed are: - the MSS for the slirp-to-VM direction was 512 bytes (now 1460); - qemu would block in select() for up to four milliseconds at a time, even when data was waiting on slirp sockets; - slirp was deliberately delaying ACKs until timer expiration (TF_DELACK), preventing the VM from opening its send window, in violation of RFC 2581. These fixes are together in one patch (qemu-slirp-performance.patch). I have also attached some related patches that fix fairly serious slirp bugs for IP datagrams larger than 4 KB. Before these patches, large packets can corrupt the heap or get reassembled in some entertaining but incorrect orders (because ip_off was being sorted as though it were signed!). These patches are attached in the order I apply them. I hope they are helpful. If there's anything I can do to make them more likely to be accepted into the mainline, please let me know. Thanks, -Ken
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/mbuf.c qemu-snapshot-2006-03-27_23/slirp/mbuf.c --- qemu-snapshot-2006-03-27_23.orig/slirp/mbuf.c 2004-04-22 00:10:47.000000000 +0000 +++ qemu-snapshot-2006-03-27_23/slirp/mbuf.c 2006-04-05 13:03:03.000000000 +0000 @@ -146,18 +146,19 @@ struct mbuf *m; int size; { + int datasize; + /* some compiles throw up on gotos. This one we can fake. */ if(m->m_size>size) return; if (m->m_flags & M_EXT) { - /* datasize = m->m_data - m->m_ext; */ + datasize = m->m_data - m->m_ext; m->m_ext = (char *)realloc(m->m_ext,size); /* if (m->m_ext == NULL) * return (struct mbuf *)NULL; */ - /* m->m_data = m->m_ext + datasize; */ + m->m_data = m->m_ext + datasize; } else { - int datasize; char *dat; datasize = m->m_data - m->m_dat; dat = (char *)malloc(size);
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c qemu-snapshot-2006-03-27_23/slirp/ip_input.c --- qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c 2004-04-22 00:10:47.000000000 +0000 +++ qemu-snapshot-2006-03-27_23/slirp/ip_input.c 2006-04-06 06:02:52.000000000 +0000 @@ -344,8 +344,8 @@ while (q != (struct ipasfrag *)fp) { struct mbuf *t; t = dtom(q); - m_cat(m, t); q = (struct ipasfrag *) q->ipf_next; + m_cat(m, t); } /*
diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/ip.h qemu-snapshot-2006-03-27_23/slirp/ip.h --- qemu-snapshot-2006-03-27_23.orig/slirp/ip.h 2004-04-21 17:10:47.000000000 -0700 +++ qemu-snapshot-2006-03-27_23/slirp/ip.h 2006-04-06 00:28:49.000000000 -0700 @@ -79,6 +79,11 @@ * We declare ip_len and ip_off to be short, rather than u_short * pragmatically since otherwise unsigned comparisons can result * against negative integers quite easily, and fail in subtle ways. + * + * The only problem with the above theory is that these quantities + * are in fact unsigned, and sorting fragments by a signed version + * of ip_off doesn't work very well, nor does length checks on + * ip packets with a signed version of their length! */ struct ip { #ifdef WORDS_BIGENDIAN @@ -101,6 +106,9 @@ struct in_addr ip_src,ip_dst; /* source and dest address */ }; +#define IP_OFF(ip) (*(u_int16_t *)&((ip)->ip_off)) +#define IP_LEN(ip) (*(u_int16_t *)&((ip)->ip_len)) + #define IP_MAXPACKET 65535 /* maximum packet size */ /* diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c qemu-snapshot-2006-03-27_23/slirp/ip_input.c --- qemu-snapshot-2006-03-27_23.orig/slirp/ip_input.c 2004-04-21 17:10:47.000000000 -0700 +++ qemu-snapshot-2006-03-27_23/slirp/ip_input.c 2006-04-06 00:32:19.000000000 -0700 @@ -111,7 +111,7 @@ * Convert fields to host representation. */ NTOHS(ip->ip_len); - if (ip->ip_len < hlen) { + if (IP_LEN(ip) < hlen) { ipstat.ips_badlen++; goto bad; } @@ -124,13 +124,13 @@ * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ - if (m->m_len < ip->ip_len) { + if (m->m_len < IP_LEN(ip)) { ipstat.ips_tooshort++; goto bad; } /* Should drop packet if mbuf too long? hmmm... 
*/ - if (m->m_len > ip->ip_len) - m_adj(m, ip->ip_len - m->m_len); + if (m->m_len > IP_LEN(ip)) + m_adj(m, IP_LEN(ip) - m->m_len); /* check ip_ttl for a correct ICMP reply */ if(ip->ip_ttl==0 || ip->ip_ttl==1) { @@ -191,7 +191,7 @@ * or if this is not the first fragment, * attempt reassembly; if it succeeds, proceed. */ - if (((struct ipasfrag *)ip)->ipf_mff & 1 || ip->ip_off) { + if (((struct ipasfrag *)ip)->ipf_mff & 1 || IP_OFF(ip)) { ipstat.ips_fragments++; ip = ip_reass((struct ipasfrag *)ip, fp); if (ip == 0) @@ -281,7 +281,7 @@ */ for (q = (struct ipasfrag *)fp->ipq_next; q != (struct ipasfrag *)fp; q = (struct ipasfrag *)q->ipf_next) - if (q->ip_off > ip->ip_off) + if (IP_OFF(q) > IP_OFF(ip)) break; /* @@ -290,10 +290,10 @@ * segment. If it provides all of our data, drop us. */ if (q->ipf_prev != (ipasfragp_32)fp) { - i = ((struct ipasfrag *)(q->ipf_prev))->ip_off + - ((struct ipasfrag *)(q->ipf_prev))->ip_len - ip->ip_off; + i = IP_OFF((struct ipasfrag *)(q->ipf_prev)) + + IP_LEN((struct ipasfrag *)(q->ipf_prev)) - IP_OFF(ip); if (i > 0) { - if (i >= ip->ip_len) + if (i >= IP_LEN(ip)) goto dropfrag; m_adj(dtom(ip), i); ip->ip_off += i; @@ -305,9 +305,9 @@ * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. 
*/ - while (q != (struct ipasfrag *)fp && ip->ip_off + ip->ip_len > q->ip_off) { - i = (ip->ip_off + ip->ip_len) - q->ip_off; - if (i < q->ip_len) { + while (q != (struct ipasfrag *)fp && IP_OFF(ip) + IP_LEN(ip) > IP_OFF(q)) { + i = (IP_OFF(ip) + IP_LEN(ip)) - IP_OFF(q); + if (i < IP_LEN(q)) { q->ip_len -= i; q->ip_off += i; m_adj(dtom(q), i); @@ -327,9 +327,9 @@ next = 0; for (q = (struct ipasfrag *) fp->ipq_next; q != (struct ipasfrag *)fp; q = (struct ipasfrag *) q->ipf_next) { - if (q->ip_off != next) + if (IP_OFF(q) != next) return (0); - next += q->ip_len; + next += IP_LEN(q); } if (((struct ipasfrag *)(q->ipf_prev))->ipf_mff & 1) return (0); diff -burN qemu-snapshot-2006-03-27_23.orig/slirp/udp.c qemu-snapshot-2006-03-27_23/slirp/udp.c --- qemu-snapshot-2006-03-27_23.orig/slirp/udp.c 2006-04-06 00:24:30.000000000 -0700 +++ qemu-snapshot-2006-03-27_23/slirp/udp.c 2006-04-06 00:32:55.000000000 -0700 @@ -111,12 +111,12 @@ */ len = ntohs((u_int16_t)uh->uh_ulen); - if (ip->ip_len != len) { - if (len > ip->ip_len) { + if (IP_LEN(ip) != len) { + if (len > IP_LEN(ip)) { udpstat.udps_badlen++; goto bad; } - m_adj(m, len - ip->ip_len); + m_adj(m, len - IP_LEN(ip)); ip->ip_len = len; }
diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/tcp.h qemu-snapshot-2006-03-27_23/slirp/tcp.h --- qemu-snapshot-2006-03-27_23.orig/slirp/tcp.h 2004-04-21 17:10:47.000000000 -0700 +++ qemu-snapshot-2006-03-27_23/slirp/tcp.h 2006-04-11 15:22:05.000000000 -0700 @@ -100,8 +100,10 @@ * With an IP MSS of 576, this is 536, * but 512 is probably more convenient. * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). + * + * We make this 1460 because we only care about Ethernet in the qemu context. */ -#define TCP_MSS 512 +#define TCP_MSS 1460 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ diff -BurN qemu-snapshot-2006-03-27_23.orig/slirp/tcp_input.c qemu-snapshot-2006-03-27_23/slirp/tcp_input.c --- qemu-snapshot-2006-03-27_23.orig/slirp/tcp_input.c 2004-10-07 16:27:35.000000000 -0700 +++ qemu-snapshot-2006-03-27_23/slirp/tcp_input.c 2006-04-11 15:22:05.000000000 -0700 @@ -580,28 +580,11 @@ * congestion avoidance sender won't send more until * he gets an ACK. * - * Here are 3 interpretations of what should happen. - * The best (for me) is to delay-ack everything except - * if it's a one-byte packet containing an ESC - * (this means it's an arrow key (or similar) sent using - * Nagel, hence there will be no echo) - * The first of these is the original, the second is the - * middle ground between the other 2 + * It is better to not delay acks at all to maximize + * TCP throughput. See RFC 2581. 
*/ -/* if (((unsigned)ti->ti_len < tp->t_maxseg)) { - */ -/* if (((unsigned)ti->ti_len < tp->t_maxseg && - * (so->so_iptos & IPTOS_LOWDELAY) == 0) || - * ((so->so_iptos & IPTOS_LOWDELAY) && - * ((struct tcpiphdr_2 *)ti)->first_char == (char)27)) { - */ - if ((unsigned)ti->ti_len == 1 && - ((struct tcpiphdr_2 *)ti)->first_char == (char)27) { - tp->t_flags |= TF_ACKNOW; - tcp_output(tp); - } else { - tp->t_flags |= TF_DELACK; - } + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); return; } } /* header prediction */ diff -BurN qemu-snapshot-2006-03-27_23.orig/vl.c qemu-snapshot-2006-03-27_23/vl.c --- qemu-snapshot-2006-03-27_23.orig/vl.c 2006-04-11 15:21:27.000000000 -0700 +++ qemu-snapshot-2006-03-27_23/vl.c 2006-04-11 15:22:05.000000000 -0700 @@ -4026,7 +4026,7 @@ void main_loop_wait(int timeout) { IOHandlerRecord *ioh, *ioh_next; - fd_set rfds, wfds; + fd_set rfds, wfds, xfds; int ret, nfds; struct timeval tv; @@ -4041,6 +4041,7 @@ nfds = -1; FD_ZERO(&rfds); FD_ZERO(&wfds); + FD_ZERO(&xfds); for(ioh = first_io_handler; ioh != NULL; ioh = ioh->next) { if (ioh->fd_read && (!ioh->fd_read_poll || @@ -4062,7 +4063,12 @@ #else tv.tv_usec = timeout * 1000; #endif - ret = select(nfds + 1, &rfds, &wfds, NULL, &tv); +#if defined(CONFIG_SLIRP) + if (slirp_inited) { + slirp_select_fill(&nfds, &rfds, &wfds, &xfds); + } +#endif + ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv); if (ret > 0) { /* XXX: better handling of removal */ for(ioh = first_io_handler; ioh != NULL; ioh = ioh_next) {
_______________________________________________ Qemu-devel mailing list Qemu-devel@nongnu.org http://lists.nongnu.org/mailman/listinfo/qemu-devel