On 2024-10-09 23:25, Morten Brørup wrote:
+#if defined(RTE_USE_CC_MEMCPY) && defined(RTE_ARCH_X86_64)
+static __rte_always_inline void
+pktcpy(void *restrict in_dst, const void *restrict in_src, size_t len)
+{

A comment describing why batch_copy_elem.dst and src point to 16 byte aligned 
data would be nice.


Good point. As I think I mentioned at some point, I'm not sure they are.

From what I recall, having (or pretending) the data is 16-byte aligned does give a noticeable performance increase on x86_64.

Is this something I should look into for 24.11, or is this patch set not going to make it anyway?

+       void *dst = __builtin_assume_aligned(in_dst, 16);
+       const void *src = __builtin_assume_aligned(in_src, 16);
+
+       if (len <= 256) {
+               size_t left;
+
+               for (left = len; left >= 32; left -= 32) {
+                       memcpy(dst, src, 32);
+                       dst = RTE_PTR_ADD(dst, 32);
+                       src = RTE_PTR_ADD(src, 32);
+               }
+
+               memcpy(dst, src, left);
+       } else
+               memcpy(dst, src, len);
+}
+#else
+static __rte_always_inline void
+pktcpy(void *dst, const void *src, size_t len)
+{
+       rte_memcpy(dst, src, len);
+}
+#endif
+
  static inline void
  do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
        __rte_shared_locks_required(&vq->iotlb_lock)
@@ -240,7 +273,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
        int i;

        for (i = 0; i < count; i++) {
-               rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+               pktcpy(elem[i].dst, elem[i].src, elem[i].len);
                vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
                                           elem[i].len);
                PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
@@ -257,7 +290,7 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq)
        int i;

        for (i = 0; i < count; i++)
-               rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
+               pktcpy(elem[i].dst, elem[i].src, elem[i].len);

        vq->batch_copy_nb_elems = 0;
  }
--
2.43.0

Anyway,
Acked-by: Morten Brørup <m...@smartsharesystems.com>


Reply via email to