The tun/tap driver in 2.6.27 contains a new IFF_VNET_HDR
flag which makes every packet read from or written to the
tap fd be preceded by a virtio_net_hdr header.
This allows us to pass larger packets and packets with
partial checksums between the guest and the host, greatly
increasing the achievable bandwidth.
If the tap device has IFF_VNET_HDR enabled, the virtio-net
driver then merely needs to shuffle the headers supplied
by the guest or host to the other side.
We also inform the guest that we can now receive GSO packets
and have it confirm whether it can do likewise. If the guest
can receive GSO packets, we enable GSO on the tun device
using TUNSETOFFLOAD.
Note also that we increase the size of the tap packet buffer
to accommodate the largest possible GSO packet.
Signed-off-by: Mark McLoughlin <[EMAIL PROTECTED]>
---
qemu/hw/virtio-net.c | 86 +++++++++++++++++++++++++++++++++++++++++---------
qemu/net.h | 5 +++
qemu/vl.c | 75 ++++++++++++++++++++++++++++++++++++++-----
3 files changed, 142 insertions(+), 24 deletions(-)
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index 47349ce..9b1298e 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -22,9 +22,18 @@
#define VIRTIO_ID_NET 1
/* The feature bitmap for virtio net */
-#define VIRTIO_NET_F_NO_CSUM 0
-#define VIRTIO_NET_F_MAC 5
-#define VIRTIO_NET_F_GS0 6
+#define VIRTIO_NET_F_CSUM 0 /* Host handles pkts w/ partial csum */
+#define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/
partial csum */
+#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */
+#define VIRTIO_NET_F_GUEST_TSO4 7 /* Guest can handle TSOv4 in. */
+#define VIRTIO_NET_F_GUEST_TSO6 8 /* Guest can handle TSOv6 in. */
+#define VIRTIO_NET_F_GUEST_ECN 9 /* Guest can handle TSO[6] w/ ECN in. */
+#define VIRTIO_NET_F_GUEST_UFO 10 /* Guest can handle UFO in. */
+#define VIRTIO_NET_F_HOST_TSO4 11 /* Host can handle TSOv4 in. */
+#define VIRTIO_NET_F_HOST_TSO6 12 /* Host can handle TSOv6 in. */
+#define VIRTIO_NET_F_HOST_ECN 13 /* Host can handle TSO[6] w/ ECN in. */
+#define VIRTIO_NET_F_HOST_UFO 14 /* Host can handle UFO in. */
#define TX_TIMER_INTERVAL 250000 /* 250 us */
@@ -42,8 +51,6 @@ struct virtio_net_hdr
uint8_t flags;
#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame
#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO)
-/* FIXME: Do we need this? If they said they can handle ECN, do they care? */
-#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN
#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO)
#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP
#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set
@@ -85,7 +92,38 @@ static void virtio_net_update_config(VirtIODevice *vdev,
uint8_t *config)
static uint32_t virtio_net_get_features(VirtIODevice *vdev)
{
- return (1 << VIRTIO_NET_F_MAC);
+ VirtIONet *n = to_virtio_net(vdev);
+ VLANClientState *host = n->vc->vlan->first_client;
+ uint32_t features = (1 << VIRTIO_NET_F_MAC);
+
+ if (tap_has_vnet_hdr(host)) {
+ features |= (1 << VIRTIO_NET_F_CSUM);
+ features |= (1 << VIRTIO_NET_F_GUEST_CSUM);
+ features |= (1 << VIRTIO_NET_F_GUEST_TSO4);
+ features |= (1 << VIRTIO_NET_F_GUEST_TSO6);
+ features |= (1 << VIRTIO_NET_F_GUEST_ECN);
+ features |= (1 << VIRTIO_NET_F_HOST_TSO4);
+ features |= (1 << VIRTIO_NET_F_HOST_TSO6);
+ features |= (1 << VIRTIO_NET_F_HOST_ECN);
+ /* Kernel can't actually handle UFO in software currently. */
+ }
+
+ return features;
+}
+
+static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features)
+{
+ VirtIONet *n = to_virtio_net(vdev);
+ VLANClientState *host = n->vc->vlan->first_client;
+
+ if (!tap_has_vnet_hdr(host) || !host->set_offload)
+ return;
+
+ host->set_offload(host,
+ (features >> VIRTIO_NET_F_GUEST_CSUM) & 1,
+ (features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
+ (features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
+ (features >> VIRTIO_NET_F_GUEST_ECN) & 1);
}
/* RX */
@@ -121,6 +159,7 @@ static void virtio_net_receive(void *opaque, const uint8_t
*buf, int size)
VirtQueueElement elem;
struct virtio_net_hdr *hdr;
int offset, i;
+ int total;
if (virtqueue_pop(n->rx_vq, &elem) == 0)
return;
@@ -134,18 +173,26 @@ static void virtio_net_receive(void *opaque, const
uint8_t *buf, int size)
hdr->flags = 0;
hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
- /* copy in packet. ugh */
offset = 0;
+ total = sizeof(*hdr);
+
+ if (tap_has_vnet_hdr(n->vc->vlan->first_client)) {
+ memcpy(hdr, buf, sizeof(*hdr));
+ offset += total;
+ }
+
+ /* copy in packet. ugh */
i = 1;
while (offset < size && i < elem.in_num) {
int len = MIN(elem.in_sg[i].iov_len, size - offset);
memcpy(elem.in_sg[i].iov_base, buf + offset, len);
offset += len;
+ total += len;
i++;
}
/* signal other side */
- virtqueue_push(n->rx_vq, &elem, sizeof(*hdr) + offset);
+ virtqueue_push(n->rx_vq, &elem, total);
virtio_notify(&n->vdev, n->rx_vq);
}
@@ -153,23 +200,31 @@ static void virtio_net_receive(void *opaque, const
uint8_t *buf, int size)
static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
{
VirtQueueElement elem;
+ int has_vnet_hdr = tap_has_vnet_hdr(n->vc->vlan->first_client);
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
return;
while (virtqueue_pop(vq, &elem)) {
ssize_t len = 0;
+ unsigned int out_num = elem.out_num;
+ struct iovec *out_sg = &elem.out_sg[0];
+
+ if (out_num < 1 || out_sg->iov_len != sizeof(struct virtio_net_hdr)) {
+ fprintf(stderr, "virtio-net header not in first element\n");
+ exit(1);
+ }
- if (elem.out_num < 1 ||
- elem.out_sg[0].iov_len != sizeof(struct virtio_net_hdr)) {
- fprintf(stderr, "virtio-net header not in first element\n");
- exit(1);
+ /* ignore the header if GSO is not supported */
+ if (!has_vnet_hdr) {
+ out_num--;
+ out_sg++;
+ len += sizeof(struct virtio_net_hdr);
}
- /* ignore the header for now */
- len = qemu_sendv_packet(n->vc, &elem.out_sg[1], elem.out_num - 1);
+ len += qemu_sendv_packet(n->vc, out_sg, out_num);
- virtqueue_push(vq, &elem, sizeof(struct virtio_net_hdr) + len);
+ virtqueue_push(vq, &elem, len);
virtio_notify(&n->vdev, vq);
}
}
@@ -249,6 +304,7 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int
devfn)
n->vdev.update_config = virtio_net_update_config;
n->vdev.get_features = virtio_net_get_features;
+ n->vdev.set_features = virtio_net_set_features;
n->rx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_rx);
n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
memcpy(n->mac, nd->macaddr, 6);
diff --git a/qemu/net.h b/qemu/net.h
index afc29ef..ae1a338 100644
--- a/qemu/net.h
+++ b/qemu/net.h
@@ -9,12 +9,15 @@ typedef ssize_t (IOReadvHandler)(void *, const struct iovec
*, int);
typedef struct VLANClientState VLANClientState;
+typedef void (SetOffload)(VLANClientState *, int, int, int, int);
+
struct VLANClientState {
IOReadHandler *fd_read;
IOReadvHandler *fd_readv;
/* Packets may still be sent if this returns zero. It's used to
rate-limit the slirp code. */
IOCanRWHandler *fd_can_read;
+ SetOffload *set_offload;
void *opaque;
struct VLANClientState *next;
struct VLANState *vlan;
@@ -42,6 +45,8 @@ void qemu_handler_true(void *opaque);
void do_info_network(void);
+int tap_has_vnet_hdr(void *opaque);
+
int net_client_init(const char *device, const char *opts);
void net_client_uninit(NICInfo *nd);
diff --git a/qemu/vl.c b/qemu/vl.c
index b39e919..fc53ae0 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -4340,14 +4340,33 @@ void do_info_slirp(void)
#endif /* CONFIG_SLIRP */
-#if !defined(_WIN32)
+#ifdef _WIN32
+
+int tap_has_vnet_hdr(void *opaque)
+{
+ return 0;
+}
+
+#else /* !defined(_WIN32) */
+
+#ifndef IFF_VNET_HDR
+#define TAP_BUFSIZE 4096
+#else
+#include <linux/virtio_net.h>
+#define ETH_HLEN 14
+#define ETH_DATA_LEN 1500
+#define MAX_PACKET_LEN (ETH_HLEN + ETH_DATA_LEN)
+#define MAX_SKB_FRAGS ((65536/TARGET_PAGE_SIZE) + 2)
+#define TAP_BUFSIZE (sizeof(struct virtio_net_hdr) + MAX_PACKET_LEN +
(MAX_SKB_FRAGS*TARGET_PAGE_SIZE))
+#endif
typedef struct TAPState {
VLANClientState *vc;
int fd;
char down_script[1024];
- char buf[4096];
+ char buf[TAP_BUFSIZE];
int size;
+ unsigned int has_vnet_hdr : 1;
} TAPState;
static void tap_receive(void *opaque, const uint8_t *buf, int size)
@@ -4442,9 +4461,40 @@ static void tap_send(void *opaque)
} while (s->size > 0);
}
+int tap_has_vnet_hdr(void *opaque)
+{
+ VLANClientState *vc = opaque;
+ TAPState *s = vc->opaque;
+
+ return s ? s->has_vnet_hdr : 0;
+}
+
+#ifdef TUNSETOFFLOAD
+static void tap_set_offload(VLANClientState *vc, int csum, int tso4, int tso6,
+ int ecn)
+{
+ TAPState *s = vc->opaque;
+ unsigned int offload = 0;
+
+ if (csum) {
+ offload |= TUN_F_CSUM;
+ if (tso4)
+ offload |= TUN_F_TSO4;
+ if (tso6)
+ offload |= TUN_F_TSO6;
+ if ((tso4 || tso6) && ecn)
+ offload |= TUN_F_TSO_ECN;
+ }
+
+ if (ioctl(s->fd, TUNSETOFFLOAD, offload) != 0)
+ fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
+ strerror(errno));
+}
+#endif /* TUNSETOFFLOAD */
+
/* fd support */
-static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
+static TAPState *net_tap_fd_init(VLANState *vlan, int fd, int vnet_hdr)
{
TAPState *s;
@@ -4452,15 +4502,19 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int
fd)
if (!s)
return NULL;
s->fd = fd;
+ s->has_vnet_hdr = vnet_hdr != 0;
s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s);
s->vc->fd_readv = tap_readv;
+#ifdef TUNSETOFFLOAD
+ s->vc->set_offload = tap_set_offload;
+#endif
qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s);
snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd);
return s;
}
#if defined (_BSD) || defined (__FreeBSD_kernel__)
-static int tap_open(char *ifname, int ifname_size)
+static int tap_open(char *ifname, int ifname_size, int *vnet_hdr)
{
int fd;
char *dev;
@@ -4602,7 +4656,7 @@ int tap_alloc(char *dev)
return tap_fd;
}
-static int tap_open(char *ifname, int ifname_size)
+static int tap_open(char *ifname, int ifname_size, int *vnet_hdr)
{
char dev[10]="";
int fd;
@@ -4615,7 +4669,7 @@ static int tap_open(char *ifname, int ifname_size)
return fd;
}
#else
-static int tap_open(char *ifname, int ifname_size)
+static int tap_open(char *ifname, int ifname_size, int *vnet_hdr)
{
struct ifreq ifr;
int fd, ret;
@@ -4684,13 +4738,15 @@ static int net_tap_init(VLANState *vlan, const char
*ifname1,
{
TAPState *s;
int fd;
+ int vnet_hdr;
char ifname[128];
if (ifname1 != NULL)
pstrcpy(ifname, sizeof(ifname), ifname1);
else
ifname[0] = '\0';
- TFR(fd = tap_open(ifname, sizeof(ifname)));
+ vnet_hdr = 0;
+ TFR(fd = tap_open(ifname, sizeof(ifname), &vnet_hdr));
if (fd < 0)
return -1;
@@ -4700,9 +4756,10 @@ static int net_tap_init(VLANState *vlan, const char
*ifname1,
if (launch_script(setup_script, ifname, fd))
return -1;
}
- s = net_tap_fd_init(vlan, fd);
+ s = net_tap_fd_init(vlan, fd, vnet_hdr);
if (!s)
return -1;
+
snprintf(s->vc->info_str, sizeof(s->vc->info_str),
"tap: ifname=%s setup_script=%s", ifname, setup_script);
if (down_script && strcmp(down_script, "no"))
@@ -5364,7 +5421,7 @@ int net_client_init(const char *device, const char *p)
fd = strtol(buf, NULL, 0);
fcntl(fd, F_SETFL, O_NONBLOCK);
ret = -1;
- if (net_tap_fd_init(vlan, fd))
+ if (net_tap_fd_init(vlan, fd, 0))
ret = 0;
} else {
if (get_param_value(ifname, sizeof(ifname), "ifname", p) <= 0) {
--
1.5.4.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html