In builds where use_cc_memcpy is set to true, the vhost user PMD suffers a large performance drop on Intel P-cores for small packets, at least when built by GCC and (to a much lesser extent) clang.
This patch addresses that issue by using a custom virtio memcpy()-based packet copying routine. Performance results from a Raptor Lake @ 3.2 GHz: GCC 12.3.0 64 bytes packets Core Mode Mpps E RTE memcpy 9.5 E cc memcpy 9.7 E cc memcpy+pktcpy 9.0 P RTE memcpy 16.4 P cc memcpy 13.5 P cc memcpy+pktcpy 16.2 GCC 12.3.0 1500 bytes packets Core Mode Mpps P RTE memcpy 5.8 P cc memcpy 5.9 P cc memcpy+pktcpy 5.9 clang 15.0.7 64 bytes packets Core Mode Mpps P RTE memcpy 13.3 P cc memcpy 12.9 P cc memcpy+pktcpy 13.9 "RTE memcpy" is use_cc_memcpy=false, "cc memcpy" is use_cc_memcpy=true and "pktcpy" is when this patch is applied. Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com> --- lib/vhost/virtio_net.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index 370402d849..63571587a8 100644 --- a/lib/vhost/virtio_net.c +++ b/lib/vhost/virtio_net.c @@ -231,6 +231,39 @@ vhost_async_dma_check_completed(struct virtio_net *dev, int16_t dma_id, uint16_t return nr_copies; } +/* The code generated by GCC (and to a lesser extent, clang) with just + * a straight memcpy() to copy packets is less than optimal on Intel + * P-cores, for small packets. Thus the need for this specialized + * memcpy() in builds where use_cc_memcpy is set to true. 
+ */ +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64) +static __rte_always_inline void +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len) +{ + void *dst = __builtin_assume_aligned(in_dst, 16); + const void *src = __builtin_assume_aligned(in_src, 16); + + if (len <= 256) { + size_t left; + + for (left = len; left >= 32; left -= 32) { + memcpy(dst, src, 32); + dst = RTE_PTR_ADD(dst, 32); + src = RTE_PTR_ADD(src, 32); + } + + memcpy(dst, src, left); + } else + memcpy(dst, src, len); +} +#else +static __rte_always_inline void +pktcpy(void *dst, const void *src, size_t len) +{ + rte_memcpy(dst, src, len); +} +#endif + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) __rte_shared_locks_required(&vq->iotlb_lock) @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) int i; for (i = 0; i < count; i++) { - rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + pktcpy(elem[i].dst, elem[i].src, elem[i].len); vhost_log_cache_write_iova(dev, vq, elem[i].log_addr, elem[i].len); PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) int i; for (i = 0; i < count; i++) - rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); + pktcpy(elem[i].dst, elem[i].src, elem[i].len); vq->batch_copy_nb_elems = 0; } -- 2.34.1