Using rte_memset instead of copy_virtio_net_hdr can bring a 3%~4% performance improvement on the IA platform, as measured with virtio/vhost non-mergeable loopback testing.
Two key points have been considered:

1. One variable initialization is saved, which removes a memory store.
2. copy_virtio_net_hdr involves both a load (from the stack, the
   virtio_hdr variable) and a store (to virtio driver memory), while
   rte_memset involves only a store.

Signed-off-by: Zhiyong Yang <zhiyong.y...@intel.com>
---
 doc/guides/rel_notes/release_17_02.rst | 11 +++++++++++
 lib/librte_vhost/virtio_net.c          | 18 +++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_17_02.rst b/doc/guides/rel_notes/release_17_02.rst
index 3b65038..eecf857 100644
--- a/doc/guides/rel_notes/release_17_02.rst
+++ b/doc/guides/rel_notes/release_17_02.rst
@@ -38,6 +38,17 @@ New Features
      Also, make sure to start the actual text at the margin.
      =========================================================
 
+* **Introduced rte_memset and related tests on the IA platform.**
+
+  A performance drop was observed in some cases on Ivy Bridge when DPDK code
+  called the glibc function memset, so a more efficient replacement was needed.
+  The function rte_memset supports three instruction-set variants: SSE & AVX
+  (128 bits), AVX2 (256 bits), and AVX512 (512 bits).
+
+  * Added rte_memset support on the IA platform.
+  * Added functional autotest support for rte_memset.
+  * Added performance autotest support for rte_memset.
+  * Improved performance by using rte_memset instead of copy_virtio_net_hdr in lib/librte_vhost.
 
 Resolved Issues
 ---------------
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 595f67c..392b31b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -37,6 +37,7 @@
 
 #include <rte_mbuf.h>
 #include <rte_memcpy.h>
+#include <rte_memset.h>
 #include <rte_ether.h>
 #include <rte_ip.h>
 #include <rte_virtio_net.h>
@@ -194,7 +195,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 	uint32_t cpy_len;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+	struct virtio_net_hdr *virtio_hdr;
 
 	desc = &descs[desc_idx];
 	desc_addr = gpa_to_vva(dev, desc->addr);
@@ -208,8 +209,9 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
 
 	rte_prefetch0((void *)(uintptr_t)desc_addr);
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+	virtio_hdr = (struct virtio_net_hdr *)(uintptr_t)desc_addr;
+	rte_memset(virtio_hdr, 0, sizeof(*virtio_hdr));
+	virtio_enqueue_offload(m, virtio_hdr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
 
@@ -459,7 +461,6 @@ static inline int __attribute__((always_inline))
 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 			    struct buf_vector *buf_vec, uint16_t num_buffers)
 {
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	uint32_t vec_idx = 0;
 	uint64_t desc_addr;
 	uint32_t mbuf_offset, mbuf_avail;
@@ -480,7 +481,6 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
 	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
-	virtio_hdr.num_buffers = num_buffers;
 	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, num_buffers);
 
@@ -512,8 +512,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
 	}
 
 	if (hdr_addr) {
-		virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
-		copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
+		struct virtio_net_hdr_mrg_rxbuf *hdr =
+			(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
+
+		rte_memset(&(hdr->hdr), 0, sizeof(hdr->hdr));
+		hdr->num_buffers = num_buffers;
+		virtio_enqueue_offload(hdr_mbuf, &(hdr->hdr));
 		vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
 		PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 			     dev->vhost_hlen, 0);
-- 
2.7.4
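
For reference, here is a minimal standalone sketch of the two access patterns contrasted in the commit message above. It is not the vhost code itself: the struct and helper names (virtio_hdr_sketch, fill_hdr_by_copy, fill_hdr_in_place) are made up for illustration, and libc memset/memcpy stand in for rte_memset and copy_virtio_net_hdr.

/*
 * Standalone sketch of the before/after header-fill patterns.
 * Simplified stand-ins: libc memset/memcpy replace rte_memset and
 * copy_virtio_net_hdr, and the header struct is reduced to a few
 * fields. Illustration only, not the vhost code.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct virtio_hdr_sketch {
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t num_buffers;
};

/* Before: zero-initialize a stack copy, fill it, then copy it out.
 * The copy is a load from the stack plus a store to the target. */
static void
fill_hdr_by_copy(void *dst, uint16_t num_buffers)
{
	struct virtio_hdr_sketch hdr = {0, 0, 0};	/* extra store */

	hdr.num_buffers = num_buffers;
	memcpy(dst, &hdr, sizeof(hdr));			/* load + store */
}

/* After: treat the target buffer itself as the header, zero it in
 * place, then fill the fields directly. Stores only, no stack copy. */
static void
fill_hdr_in_place(void *dst, uint16_t num_buffers)
{
	struct virtio_hdr_sketch *hdr = dst;

	memset(hdr, 0, sizeof(*hdr));			/* store only */
	hdr->num_buffers = num_buffers;
}

int
main(void)
{
	struct virtio_hdr_sketch buf;

	fill_hdr_by_copy(&buf, 2);
	fill_hdr_in_place(&buf, 2);
	printf("num_buffers = %u\n", buf.num_buffers);
	return 0;
}

The in-place variant touches the destination header only with stores, which is exactly the saving described by the second key point.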