> +#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
> +static __rte_always_inline void
> +pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
> +{

A comment describing why batch_copy_elem.dst and batch_copy_elem.src are
guaranteed to point to 16-byte aligned data would be nice.
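
To spell out what is being promised to the compiler here, a small standalone
sketch (the function name is mine, not from the patch): __builtin_assume_aligned()
returns its argument and lets the compiler assume the stated alignment, so the
fixed-size memcpy() calls can be expanded into aligned 16-byte loads/stores. If
the pointers ever turn out not to be 16-byte aligned, that assumption is broken,
which is why the guarantee deserves a comment.

#include <string.h>

/*
 * Illustration only: promise the compiler 16-byte alignment, then let it
 * expand the fixed-size memcpy() into aligned vector moves.
 */
static inline void
copy32_assume_aligned(void *restrict in_dst, const void *restrict in_src)
{
	void *dst = __builtin_assume_aligned(in_dst, 16);
	const void *src = __builtin_assume_aligned(in_src, 16);

	memcpy(dst, src, 32);
}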

> +     void *dst = __builtin_assume_aligned(in_dst, 16);
> +     const void *src = __builtin_assume_aligned(in_src, 16);
> +
> +     if (len <= 256) {
> +             size_t left;
> +
> +             for (left = len; left >= 32; left -= 32) {
> +                     memcpy(dst, src, 32);
> +                     dst = RTE_PTR_ADD(dst, 32);
> +                     src = RTE_PTR_ADD(src, 32);
> +             }
> +
> +             memcpy(dst, src, left);
> +     } else
> +             memcpy(dst, src, len);
> +}
> +#else
> +static __rte_always_inline void
> +pktcpy(void *dst, const void *src, size_t len)
> +{
> +     rte_memcpy(dst, src, len);
> +}
> +#endif
> +
>  static inline void
>  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>       __rte_shared_locks_required(&vq->iotlb_lock)
> @@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>       int i;
> 
>       for (i = 0; i < count; i++) {
> -             rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +             pktcpy(elem[i].dst, elem[i].src, elem[i].len);
>               vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
>                                          elem[i].len);
>               PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
> @@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
>       int i;
> 
>       for (i = 0; i < count; i++)
> -             rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> +             pktcpy(elem[i].dst, elem[i].src, elem[i].len);
> 
>       vq->batch_copy_nb_elems = 0;
>  }
> --
> 2.43.0

Anyway,
Acked-by: Morten Brørup <m...@smartsharesystems.com>
