First of all, rte_memcpy() is mostly useful for copying big packets by leveraging advanced hardware instructions like AVX. But for the virtio net hdr, which is 12 bytes at most, invoking rte_memcpy() will not introduce any performance boost.
And, to my surprise, rte_memcpy() is VERY huge. Since rte_memcpy() is inlined, it increases the binary code size linearly every time we call it at a different place. Replacing the two rte_memcpy() calls with direct copies saves nearly 12K bytes of code size! Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com> --- lib/librte_vhost/vhost_rxtx.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index 3909584..97690c3 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -92,6 +92,17 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) return; } +static inline void +copy_virtio_net_hdr(struct vhost_virtqueue *vq, uint64_t desc_addr, + struct virtio_net_hdr_mrg_rxbuf hdr) +{ + if (vq->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) { + *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; + } else { + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; + } +} + static inline int __attribute__((always_inline)) copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *m, uint16_t desc_idx, uint32_t *copied) @@ -108,8 +119,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, rte_prefetch0((void *)(uintptr_t)desc_addr); virtio_enqueue_offload(m, &virtio_hdr.hdr); - rte_memcpy((void *)(uintptr_t)desc_addr, - (const void *)&virtio_hdr, vq->vhost_hlen); + copy_virtio_net_hdr(vq, desc_addr, virtio_hdr); PRINT_PACKET(dev, (uintptr_t)desc_addr, vq->vhost_hlen, 0); desc_offset = vq->vhost_hlen; @@ -404,8 +414,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, dev->device_fh, virtio_hdr.num_buffers); virtio_enqueue_offload(m, &virtio_hdr.hdr); - rte_memcpy((void *)(uintptr_t)desc_addr, - (const void *)&virtio_hdr, vq->vhost_hlen); + copy_virtio_net_hdr(vq, desc_addr, virtio_hdr); PRINT_PACKET(dev, (uintptr_t)desc_addr, 
vq->vhost_hlen, 0); desc_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; -- 1.9.0