> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{

A comment describing why batch_copy_elem.dst and src point to 16-byte aligned data would be nice.

> +	void *dst = __builtin_assume_aligned(in_dst, 16);
> +	const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +	if (len <= 256) {
> +		size_t left;
> +
> +		for (left = len; left >= 32; left -= 32) {
> +			memcpy(dst, src, 32);
> +			dst = RTE_PTR_ADD(dst, 32);
> +			src = RTE_PTR_ADD(src, 32);
> +		}
> +
> +		memcpy(dst, src, left);
> +	} else
> +		memcpy(dst, src, len);
> +}
> +#else
> +static __rte_always_inline void
> +pktcpy(void *dst, const void *src, size_t len)
> +{
> +	rte_memcpy(dst, src, len);
> +}
> +#endif
> +
>  static inline void
>  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  	__rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++) {
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>  		vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>  					   elem[i].len);
>  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>  	int i;
> 
>  	for (i = 0; i < count; i++)
> -		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +		pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> 
>  	vq->batch_copy_nb_elems = 0;
>  }
> --
> 2.43.0

Anyway,

Acked-by: Morten Brørup <m...@smartsharesystems.com>
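
For readers unfamiliar with the trick, below is a minimal, self-contained sketch of the pattern pktcpy relies on: __builtin_assume_aligned() is a GCC/Clang builtin that returns its first argument and lets the compiler assume the stated alignment, so the constant-size memcpy() calls in the loop can be inlined as aligned vector loads/stores instead of library calls. The chunked_copy and main names, the buffer sizes and the _Alignas() test harness are illustrative choices for this sketch only, not part of the patch.

/*
 * Sketch of the pktcpy copy pattern (illustrative only).
 * Assumes 16-byte-aligned buffers and a GCC/Clang compiler that
 * provides __builtin_assume_aligned().
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static inline void
chunked_copy(void *restrict in_dst, const void *restrict in_src, size_t len)
{
	/* Tell the compiler both pointers are 16-byte aligned, so the
	 * fixed-size memcpy() calls below can become aligned vector ops. */
	unsigned char *dst = __builtin_assume_aligned(in_dst, 16);
	const unsigned char *src = __builtin_assume_aligned(in_src, 16);

	if (len <= 256) {
		size_t left;

		/* Copy 32 bytes per iteration; a constant-size memcpy()
		 * is typically expanded inline by the compiler. */
		for (left = len; left >= 32; left -= 32) {
			memcpy(dst, src, 32);
			dst += 32;
			src += 32;
		}

		/* Tail of 0..31 bytes. */
		memcpy(dst, src, left);
	} else {
		/* Large copies: fall back to the library memcpy(). */
		memcpy(in_dst, in_src, len);
	}
}

int
main(void)
{
	/* _Alignas(16) guarantees the alignment the copy routine assumes. */
	_Alignas(16) unsigned char src[200];
	_Alignas(16) unsigned char dst[200];
	size_t i;

	for (i = 0; i < sizeof(src); i++)
		src[i] = (unsigned char)i;

	chunked_copy(dst, src, sizeof(src));

	printf("equal: %d\n", memcmp(dst, src, sizeof(dst)) == 0);
	return 0;
}

Building this at -O3 should show the 32-byte memcpy() calls expanded inline rather than emitted as calls to libc memcpy, which is the point of the small-copy path in the patch.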