This patch optimizes Vhost performance for large packets when the Mergeable Rx buffer feature is enabled. It introduces a dedicated memcpy function for vhost enqueue/dequeue to replace rte_memcpy.
The reason is that rte_memcpy is for general cases: it handles unaligned copies and makes stores aligned, and it even makes loads aligned for micro architectures like Ivy Bridge. However, alignment handling comes at a price: It introduces extra load/store instructions.

Vhost memcpy is rather special: The copy is aligned, and remote, and there is a header write along with it which is also remote. In this case the memcpy instruction stream should be simplified to reduce extra load/store, thereby reducing the probability of pipeline stalls caused by a full load/store buffer, so that the actual memcpy instructions are issued and the H/W prefetcher goes to work as early as possible.

Performance gain is visible when the packet size is:

 1. Larger than 512 bytes on AVX/SSE platforms like Ivy Bridge

 2. Larger than 256 bytes on AVX2 platforms like Haswell

 3. Larger than 512 bytes on AVX512 platforms like Skylake

Up to 20% gain can be achieved by this patch for PVP traffic. The test can also be conducted without a NIC, by using loopback traffic between Vhost and Virtio. For example, increase TXONLY_DEF_PACKET_LEN to the requested packet size in testpmd.h, rebuild and start testpmd in both host and guest, then "start" on one side and "start tx_first 32" on the other.

Signed-off-by: Zhihong Wang <zhihong.w...@intel.com>
---
 lib/librte_vhost/virtio_net.c | 72 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 3 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 595f67c..cd6f21a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -50,6 +50,72 @@
 #define MAX_PKT_BURST 32
 #define VHOST_LOG_PAGE 4096
+/**
+ * This function is used for vhost memcpy, to replace rte_memcpy.
+ * The reason is that rte_memcpy is for general cases, whereas vhost
+ * memcpy is a rather special case: The copy is aligned, and remote,
+ * and there is a header write along with it which is also remote.
In this case
+ * the memcpy instruction stream should be simplified to reduce extra
+ * load/store, thereby reducing the probability of pipeline stalls
+ * caused by a full load/store buffer, so that the actual memcpy
+ * instructions are issued and the H/W prefetcher goes to work as
+ * early as possible.
+ */
+static inline void __attribute__((always_inline))
+vhost_memcpy(void *dst, const void *src, size_t n)
+{
+	/* Copy size < 16 bytes: one store per set bit of n (1/2/4/8) */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dst = *(const uint8_t *)src;
+			src = (const uint8_t *)src + 1;
+			dst = (uint8_t *)dst + 1;
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			src = (const uint16_t *)src + 1;
+			dst = (uint16_t *)dst + 1;
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			src = (const uint32_t *)src + 1;
+			dst = (uint32_t *)dst + 1;
+		}
+		if (n & 0x08)
+			*(uint64_t *)dst = *(const uint64_t *)src;
+
+		return;
+	}
+
+	/* Copy 16 <= size <= 32 bytes: head move plus a possibly
+	 * overlapping move ending exactly at dst + n
+	 */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+
+		return;
+	}
+
+	/* Copy 32 < size <= 64 bytes: same head/tail overlap trick */
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+			(const uint8_t *)src - 32 + n);
+
+		return;
+	}
+
+	/* Copy 64-byte blocks */
+	for (; n >= 64; n -= 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;
+	}
+
+	/* Copy whatever is left with one final, possibly overlapping,
+	 * 64-byte move ending exactly at dst + n
+	 */
+	rte_mov64((uint8_t *)dst - 64 + n,
+		(const uint8_t *)src - 64 + n);
+}
+
 static inline void __attribute__((always_inline))
 vhost_log_page(uint8_t *log_base, uint64_t page)
 {
@@ -246,7 +312,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 	}
 
 	cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-	rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+	vhost_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
 		rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
cpy_len); vhost_log_write(dev, desc->addr + desc_offset, cpy_len); @@ -522,7 +588,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, } cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), + vhost_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), cpy_len); vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, @@ -856,7 +922,7 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, */ mbuf_avail = cpy_len; } else { - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, + vhost_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset), (void *)((uintptr_t)(desc_addr + desc_offset)), cpy_len); -- 2.7.4