aleksandr.fedorov_itglobal.com created this revision.
aleksandr.fedorov_itglobal.com added reviewers: bhyve, jhb, rgrimes, krion, 
v.maffione_gmail.com.
Herald added a subscriber: bcran.

REVISION SUMMARY
  The current implementation of the virtio-net backend cannot transfer packets 
larger than the netmap(4) buffer size (2048 bytes), or larger than the maximum 
guest descriptor size in the tap(4) case. The reason is that there is no support 
for mergeable buffers (VIRTIO_NET_F_MRG_RXBUF in the virtio specification). See 
PR 215737. This significantly limits TCP throughput.
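  
  For reference, when VIRTIO_NET_F_MRG_RXBUF is negotiated the receive header 
carries a buffer count that tells the guest how many descriptor chains a single 
packet occupies (bhyve's struct virtio_net_rxhdr stores it in vrh_bufs). A rough 
sketch of the layout as defined by the virtio specification:
  
    struct virtio_net_hdr_mrg_rxbuf {
            struct virtio_net_hdr hdr;  /* flags, GSO and checksum fields */
            uint16_t num_buffers;       /* descriptor chains used by this packet */
    };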
  
  This patch adds support for mergeable buffers using netmap's ability to chain 
its own buffers (see NS_MOREFRAG in netmap(4)). The same approach is used by 
QEMU (virtio-net with the netmap backend).
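  
  On the receive side the idea, in minimal form, is to walk the netmap RX slots 
and sum their lengths until a slot without NS_MOREFRAG ends the packet (this 
mirrors the netmap_next_pkt_len() helper added below):
  
    len = 0;
    for (i = ring->cur; i != ring->tail; i = nm_ring_next(ring, i)) {
            len += ring->slot[i].len;
            if (!(ring->slot[i].flags & NS_MOREFRAG))
                    break;  /* last fragment of this packet */
    }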
  
  We see a significant increase in throughput, both when transferring data 
between VMs on the same host and between VMs on different hosts. See the tests 
below.

TEST PLAN
  **1. 'iperf3 -c X.X.X.X -t 60 -R' between two Ubuntu 16.04 VMs through a VALE 
switch.**
  
  MTU 1500. VM - virtio-net - VALE - virtio-net - VM:
  
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec  28.6 GBytes  4.09 Gbits/sec    3             sender
  [  5]   0.00-60.04  sec  28.6 GBytes  4.09 Gbits/sec                  receiver
  
  MTU 3000. VM - virtio-net - VALE - virtio-net - VM:
  
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec  51.8 GBytes  7.42 Gbits/sec  651             sender
  [  5]   0.00-60.04  sec  51.8 GBytes  7.42 Gbits/sec                  receiver
  
  MTU 9000. VM - virtio-net - VALE - virtio-net - VM:
  
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec  99.7 GBytes  14.3 Gbits/sec  100             sender
  [  5]   0.00-60.04  sec  99.7 GBytes  14.3 Gbits/sec                  receiver
  
  MTU 16000. VM - virtio-net - VALE - virtio-net - VM:
  
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec   122 GBytes  17.5 Gbits/sec    0             sender
  [  5]   0.00-60.04  sec   122 GBytes  17.5 Gbits/sec                  receiver
  
  MTU 32000. VM - virtio-net - VALE - virtio-net - VM:
  
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec   152 GBytes  21.7 Gbits/sec   64             sender
  [  5]   0.00-60.04  sec   152 GBytes  21.7 Gbits/sec                  receiver
  
  MTU 64000. VM - virtio-net - VALE - virtio-net - VM:
  
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec   220 GBytes  31.4 Gbits/sec   60             sender
  [  5]   0.00-60.04  sec   220 GBytes  31.4 Gbits/sec                  receiver
  
  **2. 'iperf3 -c X.X.X.X -t 60 -R' between two FreeBSD 12-RELEASE VMs through a 
VALE switch.**
  
  MTU 1500. VM - virtio-net - VALE - virtio-net - VM: 1.30 Gbits/sec
  MTU 3000. VM - virtio-net - VALE - virtio-net - VM: 2.14 Gbits/sec
  MTU 9000. VM - virtio-net - VALE - virtio-net - VM: 4.80 Gbits/sec
  MTU 16000. VM - virtio-net - VALE - virtio-net - VM: 7.25 Gbits/sec
  MTU 32000. VM - virtio-net - VALE - virtio-net - VM: 12.8 Gbits/sec
  MTU 64000. VM - virtio-net - VALE - virtio-net - VM: 13.3 Gbits/sec

REVISION DETAIL
  https://reviews.freebsd.org/D20276

AFFECTED FILES
  usr.sbin/bhyve/pci_virtio_net.c

To: aleksandr.fedorov_itglobal.com, #bhyve, jhb, rgrimes, krion, 
v.maffione_gmail.com
Cc: freebsd-virtualization-list, evgueni.gavrilov_itglobal.com, bcran
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -73,6 +73,8 @@
 
 #define VTNET_MAXSEGS	256
 
+#define VTNET_MIN_AVAIL_DESC	64
+
 /*
  * Host capabilities.  Note that we only offer a few of these.
  */
@@ -392,85 +394,107 @@
 }
 
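+/*
+ * Copy a guest iovec ('iovsize' bytes in total) into one or more netmap TX
+ * slots chained together with NS_MOREFRAG, then kick the ring with
+ * NIOCTXSYNC.  Returns the number of bytes written, or 0 if the TX ring
+ * does not currently have enough space for the whole packet.
+ */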
 static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize)
 {
-	int r, i;
-	int len = 0;
+	char *buf;
+	int i;
+	int frag_size;
+	int iov_off;
+	int len;
+	int nm_off;
+	int nm_buf_size;
 
-	for (r = nmd->cur_tx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
+	struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, nmd->cur_tx_ring);
 
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_tx_ring)
-				r = nmd->first_tx_ring;
-			if (r == nmd->cur_tx_ring)
-				break;
-			continue;
+	if ((nm_ring_space(ring) * ring->nr_buf_size) < iovsize) {
+		/*
+		 * No more avail space in TX ring, try to flush it.
+		 */
+		ioctl(nmd->fd, NIOCTXSYNC, NULL);
+		return (0);
+	}
+
+	i = ring->cur;
+	buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+	iov_off = 0;
+	len = iovsize;
+	nm_buf_size = ring->nr_buf_size;
+	nm_off = 0;
+
+	while (iovsize) {
+
+		if (unlikely(iov_off == iov->iov_len)) {
+			iov++;
+			iov_off = 0;
 		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
 
-		for (i = 0; i < iovcnt; i++) {
-			if (len + iov[i].iov_len > 2048)
-				break;
-			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
-			len += iov[i].iov_len;
+		if (unlikely(nm_off == nm_buf_size)) {
+			ring->slot[i].flags = NS_MOREFRAG;
+			i = nm_ring_next(ring, i);
+			buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+			nm_off = 0;
 		}
-		ring->slot[cur].len = len;
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_tx_ring = r;
-		ioctl(nmd->fd, NIOCTXSYNC, NULL);
-		break;
+
+		frag_size = MIN(nm_buf_size - nm_off, iov->iov_len - iov_off);
+		memcpy(buf + nm_off, iov->iov_base + iov_off, frag_size);
+
+		iovsize -= frag_size;
+		iov_off += frag_size;
+		nm_off += frag_size;
+
+		ring->slot[i].len = nm_off;
 	}
 
+	/* The last slot must not have NS_MOREFRAG set. */
+	ring->slot[i].flags &= ~NS_MOREFRAG;
+	ring->head = ring->cur = nm_ring_next(ring, i);
+	ioctl(nmd->fd, NIOCTXSYNC, NULL);
+
 	return (len);
 }
 
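+/*
+ * Copy the next packet ('iovsize' bytes, possibly spread over several
+ * NS_MOREFRAG-chained RX slots) into the guest iovec and advance the RX
+ * ring.  Returns the number of bytes copied.
+ */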
 static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize)
 {
-	int len = 0;
-	int i = 0;
-	int r;
+	char *buf;
+	int i;
+	int iov_off;
+	int frag_size;
+	int len;
+	int nm_off;
 
-	for (r = nmd->cur_rx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
-		size_t left;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
 
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_rx_ring)
-				r = nmd->first_rx_ring;
-			if (r == nmd->cur_rx_ring)
-				break;
-			continue;
+	i = r->cur;
+	buf = NETMAP_BUF(r, r->slot[i].buf_idx);
+	iov_off = 0;
+	nm_off = 0;
+	len = iovsize;
+
+	while (iovsize) {
+
+		if (unlikely(iov_off == iov->iov_len)) {
+			iov++;
+			iov_off = 0;
 		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-		left = ring->slot[cur].len;
 
-		for (i = 0; i < iovcnt && left > 0; i++) {
-			if (iov[i].iov_len > left)
-				iov[i].iov_len = left;
-			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
-			len += iov[i].iov_len;
-			left -= iov[i].iov_len;
+		if (unlikely(nm_off == r->slot[i].len)) {
+			i = nm_ring_next(r, i);
+			buf = NETMAP_BUF(r, r->slot[i].buf_idx);
+			nm_off = 0;
 		}
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_rx_ring = r;
-		ioctl(nmd->fd, NIOCRXSYNC, NULL);
-		break;
+
+		frag_size = MIN(r->slot[i].len - nm_off, iov->iov_len - iov_off);
+		memcpy(iov->iov_base + iov_off, buf + nm_off, frag_size);
+
+		iovsize -= frag_size;
+		iov_off += frag_size;
+		nm_off += frag_size;
 	}
-	for (; i < iovcnt; i++)
-		iov[i].iov_len = 0;
 
+	r->head = r->cur = nm_ring_next(r, i);
+	ioctl(nmd->fd, NIOCRXSYNC, NULL);
+
 	return (len);
 }
 
@@ -481,32 +505,102 @@
 pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 		    int len)
 {
-	static char pad[60]; /* all zero bytes */
-
 	if (sc->vsc_nmd == NULL)
 		return;
 
-	/*
-	 * If the length is < 60, pad out to that and add the
-	 * extra zero'd segment to the iov. It is guaranteed that
-	 * there is always an extra iov available by the caller.
-	 */
-	if (len < 60) {
-		iov[iovcnt].iov_base = pad;
-		iov[iovcnt].iov_len = 60 - len;
-		iovcnt++;
+	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt, len);
+}
+
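+/*
+ * Gather guest avail descriptors (starting 'start' entries past
+ * vq_last_avail) into 'iov' until they can hold 'len' bytes, and pre-fill
+ * the corresponding used ring entries.  The avail/used indices are not
+ * advanced here; the caller commits them with
+ * vq_inc_used_idx_and_last_avail().  Returns the number of descriptors
+ * used, or 0 if fewer than 'minavail' descriptors are available or the
+ * packet does not fit.
+ */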
+static __inline int
+vq_avail_to_iovec(struct vqueue_info *vq, struct iovec *iov, int len, int start,
+		int minavail)
+{
+	int idx;
+	uint16_t mask = vq->vq_qsize - 1;
+	volatile struct virtio_desc *vdir;
+	struct vmctx *ctx = vq->vq_vs->vs_pi->pi_vmctx;
+
+	uint16_t ndesc = (uint16_t)(vq->vq_avail->va_idx - vq->vq_last_avail - start);
+
+	if (ndesc < minavail)
+		return (0);
+
+	int off = 0;
+	int uidx = vq->vq_used->vu_idx + start;
+
+	for (int i = 0; i < ndesc; i++) {
+		idx = vq->vq_avail->va_ring[(vq->vq_last_avail + i +  start) & mask];
+		vdir = &vq->vq_desc[idx];
+
+		iov[i].iov_base = paddr_guest2host(ctx, 
+				vdir->vd_addr, vdir->vd_len);
+		iov[i].iov_len = vdir->vd_len;
+
+		off += vdir->vd_len;
+
+		vq->vq_used->vu_ring[uidx & mask].vu_idx = idx;
+		vq->vq_used->vu_ring[uidx & mask].vu_tlen = 
+			(off >= len) ? vdir->vd_len - (off - len) : vdir->vd_len;
+
+		uidx++;
+
+		if (off >= len) {
+			return (i + 1);
+		}
 	}
-	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
+
+	return (0);
 }
 
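+/*
+ * Commit 'n' descriptors prepared by vq_avail_to_iovec(): advance both the
+ * last seen avail index and the used ring index.
+ */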
+static __inline void
+vq_inc_used_idx_and_last_avail(struct vqueue_info *vq, int n)
+{
+	if (n > 0) {
+		vq->vq_last_avail += n;
+		vq->vq_used->vu_idx += n;
+	}
+}
+
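+/*
+ * Return the total length of the next packet in the netmap RX ring by
+ * walking the NS_MOREFRAG chain, or 0 if the ring is empty.
+ */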
+static __inline int
+netmap_next_pkt_len(struct nm_desc *nmd)
+{
+	int i;
+	int len;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
+
+	len = 0;
+
+	for (i = r->cur; i != r->tail; i = nm_ring_next(r, i)) {
+		len += r->slot[i].len;
+		if (!(r->slot[i].flags & NS_MOREFRAG))
+			break;
+	}
+
+	return (len);
+}
+
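+/*
+ * Drop the next packet in the netmap RX ring by advancing head/cur past
+ * its last NS_MOREFRAG-chained slot.
+ */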
+static __inline void
+netmap_drop_pkt(struct nm_desc *nmd)
+{
+	int i;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
+
+	for (i = r->cur; i != r->tail; i = nm_ring_next(r, i)) {
+		if (!(r->slot[i].flags & NS_MOREFRAG)) {
+			r->head = r->cur = nm_ring_next(r, i);
+			return;
+		}
+	}
+}
+
 static void
 pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
 {
-	struct iovec iov[VTNET_MAXSEGS], *riov;
+	struct iovec iov[VTNET_RINGSZ], *riov;
 	struct vqueue_info *vq;
-	void *vrx;
-	int len, n;
-	uint16_t idx;
+	int len;
+	int n;
+	int used;
 
 	/*
 	 * Should never be called without a valid netmap descriptor
@@ -517,11 +611,11 @@
 	 * But, will be called when the rx ring hasn't yet
 	 * been set up or the guest is resetting the device.
 	 */
-	if (!sc->vsc_rx_ready || sc->resetting) {
+	if (unlikely(!sc->vsc_rx_ready || sc->resetting)) {
 		/*
 		 * Drop the packet and try later.
 		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+		netmap_drop_pkt(sc->vsc_nmd);
 		return;
 	}
 
@@ -529,63 +623,54 @@
 	 * Check for available rx buffers
 	 */
 	vq = &sc->vsc_queues[VTNET_RXQ];
-	if (!vq_has_descs(vq)) {
+	if (unlikely(!vq_has_descs(vq))) {
 		/*
 		 * Drop the packet and try later.  Interrupt on
 		 * empty, if that's negotiated.
 		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+		netmap_drop_pkt(sc->vsc_nmd);
 		vq_endchains(vq, 1);
 		return;
 	}
 
+	used = 0;
+
 	do {
-		/*
-		 * Get descriptor chain.
-		 */
-		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
-		assert(n >= 1 && n <= VTNET_MAXSEGS);
+		len = netmap_next_pkt_len(sc->vsc_nmd);
 
-		/*
-		 * Get a pointer to the rx header, and use the
-		 * data immediately following it for the packet buffer.
-		 */
-		vrx = iov[0].iov_base;
-		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+		if (unlikely(len == 0)) {
+			vq_inc_used_idx_and_last_avail(vq, used);
+			vq_endchains(vq, 0);
+			return;
+		}
 
-		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
+		n = vq_avail_to_iovec(vq, iov, len + sc->rx_vhdrlen, used,
+				VTNET_MIN_AVAIL_DESC);
 
-		if (len == 0) {
-			/*
-			 * No more packets, but still some avail ring
-			 * entries.  Interrupt if needed/appropriate.
-			 */
-			vq_retchain(vq);
+		if (unlikely(n == 0)) {
+			vq_inc_used_idx_and_last_avail(vq, used);
 			vq_endchains(vq, 0);
 			return;
 		}
 
-		/*
-		 * The only valid field in the rx packet header is the
-		 * number of buffers if merged rx bufs were negotiated.
-		 */
-		memset(vrx, 0, sc->rx_vhdrlen);
-
 		if (sc->rx_merge) {
-			struct virtio_net_rxhdr *vrxh;
-
-			vrxh = vrx;
-			vrxh->vrh_bufs = 1;
+			struct virtio_net_rxhdr *vrxh = iov[0].iov_base;
+			memset(vrxh, 0, sc->rx_vhdrlen);
+			vrxh->vrh_bufs = n;
 		}
 
-		/*
-		 * Release this chain and handle more chains.
-		 */
-		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+
+		(void)pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n, len);
+
+		used += n;
+
 	} while (vq_has_descs(vq));
 
-	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+	vq_inc_used_idx_and_last_avail(vq, used);
 	vq_endchains(vq, 1);
+
+	return;
 }
 
 static void
