On Tue, Jan 22, 2013 at 08:12:15AM +0100, Luigi Rizzo wrote: > reposting a version without changes that implement bounded > queues in net/queue.c > > Hi, > the attached patch implements a qemu backend for the "netmap" API > thus allowing machines to attach to the VALE software switch as > well as netmap-supported cards (links below). > > http://info.iet.unipi.it/~luigi/netmap/ > http://info.iet.unipi.it/~luigi/vale/ > > This is a cleaned up version of code written last summer.
Looks like a clean software approach to low-level packet I/O. I guess the biggest competitor would be a userspace library with NIC drivers using modern IOMMUs to avoid the security/reliability problem that previous userspace approaches suffered. Pretty cool that netmap reuses kernel NIC driver implementations to avoid duplicating all that code. I wonder how/if netmaps handles advanced NIC features like multiqueue and offloads? But I've only read the webpage, not the papers or source code :). > diff --git a/net/Makefile.objs b/net/Makefile.objs > index a08cd14..068253f 100644 > --- a/net/Makefile.objs > +++ b/net/Makefile.objs > @@ -10,3 +10,4 @@ common-obj-$(CONFIG_AIX) += tap-aix.o > common-obj-$(CONFIG_HAIKU) += tap-haiku.o > common-obj-$(CONFIG_SLIRP) += slirp.o > common-obj-$(CONFIG_VDE) += vde.o > +common-obj-$(CONFIG_NETMAP) += qemu-netmap.o Please drop the "qemu-" prefix. > diff --git a/net/net.c b/net/net.c > index cdd9b04..816c987 100644 > --- a/net/net.c > +++ b/net/net.c > @@ -618,6 +618,9 @@ static int (* const > net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])( > [NET_CLIENT_OPTIONS_KIND_BRIDGE] = net_init_bridge, > #endif > [NET_CLIENT_OPTIONS_KIND_HUBPORT] = net_init_hubport, > +#ifdef CONFIG_NETMAP > + [NET_CLIENT_OPTIONS_KIND_NETMAP] = net_init_netmap, > +#endif Please use 4 spaces for indentation and run scripts/checkpatch.pl to scan patches. > }; > > > diff --git a/net/qemu-netmap.c b/net/qemu-netmap.c > new file mode 100644 > index 0000000..79d7c09 > --- /dev/null > +++ b/net/qemu-netmap.c > @@ -0,0 +1,353 @@ > +/* > + * netmap access for qemu > + * > + * Copyright (c) 2012-2013 Luigi Rizzo > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > copy > + * of this software and associated documentation files (the "Software"), to > deal > + * in the Software without restriction, including without limitation the > rights > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > + * copies of the Software, and to permit persons to whom the Software is > + * furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > FROM, > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > + * THE SOFTWARE. > + */ > + > +#include "config-host.h" > + > +/* note paths are different for -head and 1.3 */ Please drop this comment from the patch. > +#define ND(fd, ... ) // debugging > +#define D(format, ...) \ > + do { \ > + struct timeval __xxts; \ > + gettimeofday(&__xxts, NULL); \ > + printf("%03d.%06d %s [%d] " format "\n", \ > + (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ > + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ > + } while (0) > + > +/* rate limited, lps indicates how many per second */ > +#define RD(lps, format, ...) \ > + do { \ > + static int t0, __cnt; \ > + struct timeval __xxts; \ > + gettimeofday(&__xxts, NULL); \ > + if (t0 != __xxts.tv_sec) { \ > + t0 = __xxts.tv_sec; \ > + __cnt = 0; \ > + } \ > + if (__cnt++ < lps) \ > + D(format, ##__VA_ARGS__); \ > + } while (0) Have you seen docs/tracing.txt? It can do fprintf(stderr) but also SystemTap/DTrace and a simple built-in binary tracer. > + > + > + > +/* > + * private netmap device info > + */ > +struct netmap_state { QEMU code should use: typedef struct { ... } NetmapState; See ./HACKING and ./CODING_STYLE. The code usually avoids struct tags. > + int fd; > + int memsize; size_t? > + void *mem; > + struct netmap_if *nifp; > + struct netmap_ring *rx; > + struct netmap_ring *tx; > + char fdname[128]; /* normally /dev/netmap */ PATH_MAX? > + char ifname[128]; /* maybe the nmreq here ? */ IFNAMSIZ? > +}; > + > +struct nm_state { > + NetClientState nc; > + struct netmap_state me; > + unsigned int read_poll; > + unsigned int write_poll; bool for read_poll and write_poll. > +}; > + > +// a fast copy routine only for multiples of 64 bytes, non overlapped. > +static inline void > +pkt_copy(const void *_src, void *_dst, int l) > +{ > + const uint64_t *src = _src; > + uint64_t *dst = _dst; > +#define likely(x) __builtin_expect(!!(x), 1) > +#define unlikely(x) __builtin_expect(!!(x), 0) Already defined in include/qemu/osdep.h. > + if (unlikely(l >= 1024)) { > + bcopy(src, dst, l); > + return; > + } > + for (; l > 0; l -= 64) { > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + *dst++ = *src++; > + } > +} I wonder how different FreeBSD bcopy() is from glibc memcpy() and if the optimization is even a win. The glibc code is probably hand-written assembly that CPU vendors have contributed for specific CPU model families. Did you compare glibc memcpy() against pkt_copy()? > + > + > +/* > + * open a netmap device. We assume there is only one queue > + * (which is the case for the VALE bridge). > + */ > +static int netmap_open(struct netmap_state *me) > +{ > + int fd, l, err; Please use "size_t len" instead of "int l". > + struct nmreq req; > + > + me->fd = fd = open(me->fdname, O_RDWR); > + if (fd < 0) { > + error_report("Unable to open netmap device '%s'", me->fdname); > + return -1; > + } > + bzero(&req, sizeof(req)); > + pstrcpy(req.nr_name, sizeof(req.nr_name), me->ifname); > + req.nr_ringid = 0; > + req.nr_version = NETMAP_API; > + err = ioctl(fd, NIOCGINFO, &req); > + if (err) { > + error_report("cannot get info on %s", me->ifname); > + goto error; > + } > + l = me->memsize = req.nr_memsize; > + err = ioctl(fd, NIOCREGIF, &req); > + if (err) { > + error_report("Unable to register %s", me->ifname); > + goto error; > + } > + > + me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); > + if (me->mem == MAP_FAILED) { > + error_report("Unable to mmap"); > + me->mem = NULL; > + goto error; > + } > + > + me->nifp = NETMAP_IF(me->mem, req.nr_offset); > + me->tx = NETMAP_TXRING(me->nifp, 0); > + me->rx = NETMAP_RXRING(me->nifp, 0); > + return 0; > + > +error: > + close(me->fd); > + return -1; > +} > + > +// XXX do we need the can-send routine ? > +static int netmap_can_send(void *opaque) > +{ > + struct nm_state *s = opaque; > + > + return qemu_can_send_packet(&s->nc); > +} Yes, I think it is a good idea. We only receive packets if our peer can also receive them. (Why we still need queues is another question but at least .read_poll() should be implemented IMO.) > + > +static void netmap_send(void *opaque); > +static void netmap_writable(void *opaque); > + > +/* > + * set the handlers for the device > + */ > +static void netmap_update_fd_handler(struct nm_state *s) > +{ > +#if 1 > + qemu_set_fd_handler2(s->me.fd, > + s->read_poll ? netmap_can_send : NULL, > + s->read_poll ? netmap_send : NULL, > + s->write_poll ? netmap_writable : NULL, > + s); > +#else > + qemu_set_fd_handler(s->me.fd, > + s->read_poll ? netmap_send : NULL, > + s->write_poll ? netmap_writable : NULL, > + s); > +#endif Please drop the #if. > +} > + > +// update the read handler > +static void netmap_read_poll(struct nm_state *s, int enable) bool enable > +{ > + if (s->read_poll != enable) { /* do nothing if not changed */ > + s->read_poll = enable; > + netmap_update_fd_handler(s); > + } > +} > + > +// update the write handler > +static void netmap_write_poll(struct nm_state *s, int enable) bool enable > +{ > + if (s->write_poll != enable) { > + s->write_poll = enable; > + netmap_update_fd_handler(s); > + } > +} > + > +/* > + * the fd_write() callback, invoked if the fd is marked as > + * writable after a poll. Reset the handler and flush any > + * buffered packets. > + */ > +static void netmap_writable(void *opaque) > +{ > + struct nm_state *s = opaque; > + > + netmap_write_poll(s, 0); > + qemu_flush_queued_packets(&s->nc); > +} It feels like the net subsystem is missing an opportunity to extract common code for file descriptor readability/writability. We're basically duplicating the code from tap.c. > + > +/* > + * new data guest --> backend > + */ > +static ssize_t netmap_receive_raw(NetClientState *nc, const uint8_t *buf, > size_t size) > +{ > + struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc); > + struct netmap_ring *ring = s->me.tx; > + > + if (ring) { > + /* request an early notification to avoid running dry */ > + if (ring->avail < ring->num_slots / 2 && s->write_poll == 0) { > + netmap_write_poll(s, 1); > + } > + if (ring->avail == 0) { // cannot write > + return 0; > + } > + uint32_t i = ring->cur; > + uint32_t idx = ring->slot[i].buf_idx; > + uint8_t *dst = (u_char *)NETMAP_BUF(ring, idx); Why cast to u_char instead of uint8_t directly? > + > + ring->slot[i].len = size; > + pkt_copy(buf, dst, size); How does netmap guarantee that the buffer is large enough for this packet? > + ring->cur = NETMAP_RING_NEXT(ring, i); > + ring->avail--; > + } > + return size; > +} > + > +// complete a previous send (backend --> guest), enable the fd_read callback Please use /* */ instead of //. I'm not sure if we strictly enforce it but it's rare to see // in QEMU. > +static void netmap_send(void *opaque) > +{ > + struct nm_state *s = opaque; > + int sent = 0; > + struct netmap_ring *ring = s->me.rx; > + > + /* only check ring->avail, let the packet be queued > + * with qemu_send_packet_async() if needed > + * XXX until we fix the propagation on the bridge we need to stop early > + */ > + while (ring->avail > 0 && qemu_can_send_packet(&s->nc) ) { > + uint32_t i = ring->cur; > + uint32_t idx = ring->slot[i].buf_idx; > + uint8_t *src = (u_char *)NETMAP_BUF(ring, idx); > + int size = ring->slot[i].len; > + > + ring->cur = NETMAP_RING_NEXT(ring, i); > + ring->avail--; > + sent++; sent is unused? > +static void netmap_cleanup(NetClientState *nc) > +{ > + struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc); > + > + qemu_purge_queued_packets(nc); > + > + netmap_read_poll(s, 0); > + netmap_write_poll(s, 0); These could be replaced with a single call to netmap_poll(nc, false). > + close(s->me.fd); Missing munmap? > +int net_init_netmap(const NetClientOptions *opts, const char *name, > NetClientState *peer) > +{ > + const NetdevNetmapOptions *netmap_opts = opts->netmap; > + NetClientState *nc; > + struct netmap_state me; > + struct nm_state *s; > + > + pstrcpy(me.fdname, sizeof(me.fdname), name ? name : "/dev/netmap"); > + /* set default name for the port if not supplied */ > + pstrcpy(me.ifname, sizeof(me.ifname), > + netmap_opts->has_ifname ? netmap_opts->ifname : "vale0"); > + if (netmap_open(&me)) QEMU coding style uses braces, even when the if body is only one line. > + return -1; > + > + /* create the object -- XXX use name or ifname ? */ The name should be neither the filename nor the ifname. It's a separate namespace and the user gets to decide the name.