aleksandr.fedorov_itglobal.com created this revision.
aleksandr.fedorov_itglobal.com added reviewers: bhyve, jhb, rgrimes, krion, v.maffione_gmail.com.
Herald added a subscriber: bcran.
REVISION SUMMARY
  The current implementation of the virtio-net backend cannot transfer
  packets larger than the netmap(4) buffer size (2048 bytes) or, in the
  tap(4) case, larger than the maximum guest descriptor size. The reason
  is that there is no support for mergeable buffers
  (VIRTIO_NET_F_MRG_RXBUF in the virtio specification). See PR 215737.
  This significantly limits TCP throughput.

  This patch adds support for mergeable buffers using netmap's ability
  to chain its own buffers (see NS_MOREFRAG in netmap(4)). The same
  approach is used by QEMU (virtio-net with the netmap backend).

  We see a significant increase in throughput, both when transferring
  data between VMs on the same host and between VMs on different hosts.
  See the tests below.

TEST PLAN
  **1. 'iperf3 -c X.X.X.X -t 60 -R' between two Ubuntu 16.04 VMs through a VALE switch.**

  MTU 1500, VM - virtio-net - VALE - virtio-net - VM:
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec  28.6 GBytes  4.09 Gbits/sec    3   sender
  [  5]   0.00-60.04  sec  28.6 GBytes  4.09 Gbits/sec        receiver

  MTU 3000, VM - virtio-net - VALE - virtio-net - VM:
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec  51.8 GBytes  7.42 Gbits/sec  651   sender
  [  5]   0.00-60.04  sec  51.8 GBytes  7.42 Gbits/sec        receiver

  MTU 9000, VM - virtio-net - VALE - virtio-net - VM:
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec  99.7 GBytes  14.3 Gbits/sec  100   sender
  [  5]   0.00-60.04  sec  99.7 GBytes  14.3 Gbits/sec        receiver

  MTU 16000, VM - virtio-net - VALE - virtio-net - VM:
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec   122 GBytes  17.5 Gbits/sec    0   sender
  [  5]   0.00-60.04  sec   122 GBytes  17.5 Gbits/sec        receiver

  MTU 32000, VM - virtio-net - VALE - virtio-net - VM:
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec   152 GBytes  21.7 Gbits/sec   64   sender
  [  5]   0.00-60.04  sec   152 GBytes  21.7 Gbits/sec        receiver

  MTU 64000, VM - virtio-net - VALE - virtio-net - VM:
  [ ID] Interval           Transfer     Bandwidth       Retr
  [  5]   0.00-60.04  sec   220 GBytes  31.4 Gbits/sec   60   sender
  [  5]   0.00-60.04  sec   220 GBytes  31.4 Gbits/sec        receiver

  **2. 'iperf3 -c X.X.X.X -t 60 -R' between two FreeBSD 12-RELEASE VMs through a VALE switch.**

  MTU 1500,  VM - virtio-net - VALE - virtio-net - VM:  1.30 Gbits/sec
  MTU 3000,  VM - virtio-net - VALE - virtio-net - VM:  2.14 Gbits/sec
  MTU 9000,  VM - virtio-net - VALE - virtio-net - VM:  4.80 Gbits/sec
  MTU 16000, VM - virtio-net - VALE - virtio-net - VM:  7.25 Gbits/sec
  MTU 32000, VM - virtio-net - VALE - virtio-net - VM:  12.8 Gbits/sec
  MTU 64000, VM - virtio-net - VALE - virtio-net - VM:  13.3 Gbits/sec

REVISION DETAIL
  https://reviews.freebsd.org/D20276

AFFECTED FILES
  usr.sbin/bhyve/pci_virtio_net.c

EMAIL PREFERENCES
  https://reviews.freebsd.org/settings/panel/emailpreferences/

To: aleksandr.fedorov_itglobal.com, #bhyve, jhb, rgrimes, krion, v.maffione_gmail.com
Cc: freebsd-virtualization-list, evgueni.gavrilov_itglobal.com, bcran
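BACKGROUND: NS_MOREFRAG CHAINING
  For reviewers unfamiliar with netmap buffer chaining: a packet larger than
  one netmap buffer occupies several consecutive ring slots, and every slot
  except the last carries the NS_MOREFRAG flag. The sketch below is NOT part
  of the patch; it only illustrates how such a chain is reassembled, assuming
  a single RX ring opened with nm_open() and omitting error handling. The
  function name read_chained_pkt is ours.

    #define NETMAP_WITH_LIBS
    #include <net/netmap_user.h>
    #include <string.h>

    static int
    read_chained_pkt(struct nm_desc *nmd, char *dst, int dstlen)
    {
    	struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
    	int len = 0;
    	unsigned int i;

    	for (i = ring->cur; i != ring->tail; i = nm_ring_next(ring, i)) {
    		struct netmap_slot *slot = &ring->slot[i];
    		char *buf = NETMAP_BUF(ring, slot->buf_idx);

    		if (len + (int)slot->len > dstlen)
    			return (-1);	/* caller's buffer is too small */
    		memcpy(dst + len, buf, slot->len);
    		len += slot->len;

    		/* Only the last fragment has NS_MOREFRAG cleared. */
    		if (!(slot->flags & NS_MOREFRAG)) {
    			ring->head = ring->cur = nm_ring_next(ring, i);
    			return (len);
    		}
    	}
    	return (0);	/* chain incomplete; wait for the next RXSYNC */
    }

  The patch below applies the same walk in netmap_next_pkt_len() and
  pci_vtnet_netmap_readv(), but scatters the data into the guest's
  descriptor chains instead of a flat buffer.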
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -73,6 +73,8 @@
 
 #define VTNET_MAXSEGS	256
 
+#define VTNET_MIN_AVAIL_DESC	64
+
 /*
  * Host capabilities.  Note that we only offer a few of these.
  */
@@ -392,85 +394,107 @@
 }
 
 static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize)
 {
-	int r, i;
-	int len = 0;
+	char *buf;
+	int i;
+	int frag_size;
+	int iov_off;
+	int len;
+	int nm_off;
+	int nm_buf_size;
 
-	for (r = nmd->cur_tx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
+	struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, nmd->cur_tx_ring);
 
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_tx_ring)
-				r = nmd->first_tx_ring;
-			if (r == nmd->cur_tx_ring)
-				break;
-			continue;
+	if ((nm_ring_space(ring) * ring->nr_buf_size) < iovsize) {
+		/*
+		 * No more avail space in TX ring, try to flush it.
+		 */
+		ioctl(nmd->fd, NIOCTXSYNC, NULL);
+		return (0);
+	}
+
+	i = ring->cur;
+	buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+	iov_off = 0;
+	len = iovsize;
+	nm_buf_size = ring->nr_buf_size;
+	nm_off = 0;
+
+	while (iovsize) {
+
+		if (unlikely(iov_off == iov->iov_len)) {
+			iov++;
+			iov_off = 0;
 		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-		for (i = 0; i < iovcnt; i++) {
-			if (len + iov[i].iov_len > 2048)
-				break;
-			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
-			len += iov[i].iov_len;
+		if (unlikely(nm_off == nm_buf_size)) {
+			ring->slot[i].flags = NS_MOREFRAG;
+			i = nm_ring_next(ring, i);
+			buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+			nm_off = 0;
 		}
-		ring->slot[cur].len = len;
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_tx_ring = r;
-		ioctl(nmd->fd, NIOCTXSYNC, NULL);
-		break;
+
+		frag_size = MIN(nm_buf_size - nm_off, iov->iov_len - iov_off);
+		memcpy(buf + nm_off, iov->iov_base + iov_off, frag_size);
+
+		iovsize -= frag_size;
+		iov_off += frag_size;
+		nm_off += frag_size;
+
+		ring->slot[i].len = nm_off;
 	}
+
+	/* The last slot must not have NS_MOREFRAG set. */
+	ring->slot[i].flags &= ~NS_MOREFRAG;
+	ring->head = ring->cur = nm_ring_next(ring, i);
+	ioctl(nmd->fd, NIOCTXSYNC, NULL);
+
 	return (len);
 }
 
 static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
+pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt, int iovsize)
 {
-	int len = 0;
-	int i = 0;
-	int r;
+	char *buf;
+	int i;
+	int iov_off;
+	int frag_size;
+	int len;
+	int nm_off;
 
-	for (r = nmd->cur_rx_ring; ; ) {
-		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
-		uint32_t cur, idx;
-		char *buf;
-		size_t left;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
 
-		if (nm_ring_empty(ring)) {
-			r++;
-			if (r > nmd->last_rx_ring)
-				r = nmd->first_rx_ring;
-			if (r == nmd->cur_rx_ring)
-				break;
-			continue;
+	i = r->cur;
+	buf = NETMAP_BUF(r, r->slot[i].buf_idx);
+	iov_off = 0;
+	nm_off = 0;
+	len = iovsize;
+
+	while (iovsize) {
+
+		if (unlikely(iov_off == iov->iov_len)) {
+			iov++;
+			iov_off = 0;
 		}
-		cur = ring->cur;
-		idx = ring->slot[cur].buf_idx;
-		buf = NETMAP_BUF(ring, idx);
-		left = ring->slot[cur].len;
-		for (i = 0; i < iovcnt && left > 0; i++) {
-			if (iov[i].iov_len > left)
-				iov[i].iov_len = left;
-			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
-			len += iov[i].iov_len;
-			left -= iov[i].iov_len;
+		if (unlikely(nm_off == r->slot[i].len)) {
+			i = nm_ring_next(r, i);
+			buf = NETMAP_BUF(r, r->slot[i].buf_idx);
+			nm_off = 0;
 		}
-		ring->head = ring->cur = nm_ring_next(ring, cur);
-		nmd->cur_rx_ring = r;
-		ioctl(nmd->fd, NIOCRXSYNC, NULL);
-		break;
+
+		frag_size = MIN(r->slot[i].len - nm_off, iov->iov_len - iov_off);
+		memcpy(iov->iov_base + iov_off, buf + nm_off, frag_size);
+
+		iovsize -= frag_size;
+		iov_off += frag_size;
+		nm_off += frag_size;
 	}
-	for (; i < iovcnt; i++)
-		iov[i].iov_len = 0;
+
+	r->head = r->cur = nm_ring_next(r, i);
+	ioctl(nmd->fd, NIOCRXSYNC, NULL);
+
 	return (len);
 }
 
@@ -481,32 +505,102 @@
 pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 		    int len)
 {
-	static char pad[60]; /* all zero bytes */
-
 	if (sc->vsc_nmd == NULL)
 		return;
 
-	/*
-	 * If the length is < 60, pad out to that and add the
-	 * extra zero'd segment to the iov. It is guaranteed that
-	 * there is always an extra iov available by the caller.
-	 */
-	if (len < 60) {
-		iov[iovcnt].iov_base = pad;
-		iov[iovcnt].iov_len = 60 - len;
-		iovcnt++;
+	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt, len);
+}
+
+static __inline int
+vq_avail_to_iovec(struct vqueue_info *vq, struct iovec *iov, int len, int start,
+    int minavail)
+{
+	int idx;
+	uint16_t mask = vq->vq_qsize - 1;
+	volatile struct virtio_desc *vdir;
+	struct vmctx *ctx = vq->vq_vs->vs_pi->pi_vmctx;
+
+	uint16_t ndesc = (uint16_t)(vq->vq_avail->va_idx - vq->vq_last_avail - start);
+
+	if (ndesc < minavail)
+		return (0);
+
+	int off = 0;
+	int uidx = vq->vq_used->vu_idx + start;
+
+	for (int i = 0; i < ndesc; i++) {
+		idx = vq->vq_avail->va_ring[(vq->vq_last_avail + i + start) & mask];
+		vdir = &vq->vq_desc[idx];
+
+		iov[i].iov_base = paddr_guest2host(ctx,
+		    vdir->vd_addr, vdir->vd_len);
+		iov[i].iov_len = vdir->vd_len;
+
+		off += vdir->vd_len;
+
+		vq->vq_used->vu_ring[uidx & mask].vu_idx = idx;
+		vq->vq_used->vu_ring[uidx & mask].vu_tlen =
+		    (off >= len) ? vdir->vd_len - (off - len) : vdir->vd_len;
+
+		uidx++;
+
+		if (off >= len) {
+			return (i + 1);
+		}
 	}
-	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
+
+	return (0);
 }
 
+static __inline void
+vq_inc_used_idx_and_last_avail(struct vqueue_info *vq, int n)
+{
+	if (n > 0) {
+		vq->vq_last_avail += n;
+		vq->vq_used->vu_idx += n;
+	}
+}
+
+static __inline int
+netmap_next_pkt_len(struct nm_desc *nmd)
+{
+	int i;
+	int len;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
+
+	len = 0;
+
+	for (i = r->cur; i != r->tail; i = nm_ring_next(r, i)) {
+		len += r->slot[i].len;
+		if (!(r->slot[i].flags & NS_MOREFRAG))
+			break;
+	}
+
+	return (len);
+}
+
+static __inline void
+netmap_drop_pkt(struct nm_desc *nmd)
+{
+	int i;
+	struct netmap_ring *r = NETMAP_RXRING(nmd->nifp, nmd->cur_rx_ring);
+
+	for (i = r->cur; i != r->tail; i = nm_ring_next(r, i)) {
+		if (!(r->slot[i].flags & NS_MOREFRAG)) {
+			r->head = r->cur = nm_ring_next(r, i);
+			return;
+		}
+	}
+}
+
 static void
 pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
 {
-	struct iovec iov[VTNET_MAXSEGS], *riov;
+	struct iovec iov[VTNET_RINGSZ], *riov;
 	struct vqueue_info *vq;
-	void *vrx;
-	int len, n;
-	uint16_t idx;
+	int len;
+	int n;
+	int used;
 
 	/*
 	 * Should never be called without a valid netmap descriptor
@@ -517,11 +611,11 @@
 	 * But, will be called when the rx ring hasn't yet
 	 * been set up or the guest is resetting the device.
 	 */
-	if (!sc->vsc_rx_ready || sc->resetting) {
+	if (unlikely(!sc->vsc_rx_ready || sc->resetting)) {
 		/*
 		 * Drop the packet and try later.
 		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+		netmap_drop_pkt(sc->vsc_nmd);
 		return;
 	}
 
@@ -529,63 +623,54 @@
 	 * Check for available rx buffers
 	 */
 	vq = &sc->vsc_queues[VTNET_RXQ];
-	if (!vq_has_descs(vq)) {
+	if (unlikely(!vq_has_descs(vq))) {
 		/*
 		 * Drop the packet and try later.  Interrupt on
 		 * empty, if that's negotiated.
 		 */
-		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
+		netmap_drop_pkt(sc->vsc_nmd);
 		vq_endchains(vq, 1);
 		return;
 	}
 
+	used = 0;
+
 	do {
-		/*
-		 * Get descriptor chain.
-		 */
-		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
-		assert(n >= 1 && n <= VTNET_MAXSEGS);
+		len = netmap_next_pkt_len(sc->vsc_nmd);
 
-		/*
-		 * Get a pointer to the rx header, and use the
-		 * data immediately following it for the packet buffer.
-		 */
-		vrx = iov[0].iov_base;
-		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+		if (unlikely(len == 0)) {
+			vq_inc_used_idx_and_last_avail(vq, used);
+			vq_endchains(vq, 0);
+			return;
+		}
 
-		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
+		n = vq_avail_to_iovec(vq, iov, len + sc->rx_vhdrlen, used,
+		    VTNET_MIN_AVAIL_DESC);
 
-		if (len == 0) {
-			/*
-			 * No more packets, but still some avail ring
-			 * entries.  Interrupt if needed/appropriate.
-			 */
-			vq_retchain(vq);
+		if (unlikely(n == 0)) {
+			vq_inc_used_idx_and_last_avail(vq, used);
 			vq_endchains(vq, 0);
 			return;
 		}
 
-		/*
-		 * The only valid field in the rx packet header is the
-		 * number of buffers if merged rx bufs were negotiated.
-		 */
-		memset(vrx, 0, sc->rx_vhdrlen);
-
 		if (sc->rx_merge) {
-			struct virtio_net_rxhdr *vrxh;
-
-			vrxh = vrx;
-			vrxh->vrh_bufs = 1;
+			struct virtio_net_rxhdr *vrxh = iov[0].iov_base;
+			memset(vrxh, 0, sc->rx_vhdrlen);
+			vrxh->vrh_bufs = n;
 		}
 
-		/*
-		 * Release this chain and handle more chains.
-		 */
-		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+
+		(void) pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n, len);
+
+		used += n;
 	} while (vq_has_descs(vq));
 
-	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+	vq_inc_used_idx_and_last_avail(vq, used);
 	vq_endchains(vq, 1);
+
+	return;
 }
 
 static void
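NOTE: GUEST-VISIBLE HEADER WITH VIRTIO_NET_F_MRG_RXBUF
  For context (not part of the patch): when VIRTIO_NET_F_MRG_RXBUF is
  negotiated, the virtio specification extends the per-packet header with a
  num_buffers field that tells the guest how many descriptor chains one
  received packet spans. This is the value the patch stores via
  "vrxh->vrh_bufs = n". The layout below follows the virtio specification;
  bhyve's struct virtio_net_rxhdr names the last field vrh_bufs.

    #include <stdint.h>

    struct virtio_net_hdr_mrg_rxbuf {
    	uint8_t  flags;		/* e.g. VIRTIO_NET_HDR_F_NEEDS_CSUM */
    	uint8_t  gso_type;
    	uint16_t hdr_len;
    	uint16_t gso_size;
    	uint16_t csum_start;
    	uint16_t csum_offset;
    	uint16_t num_buffers;	/* only valid with VIRTIO_NET_F_MRG_RXBUF */
    };

  Without this field the device must fit each packet into a single
  descriptor chain, which is what previously capped the transfer size at
  one netmap buffer.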