Instead of writing back descriptor chains in order, let's
write the flags of the first chain last in order to improve batching.

With the kernel's pktgen benchmark, a ~3% performance gain is measured.

Signed-off-by: Maxime Coquelin <maxime.coque...@redhat.com>
---
 lib/librte_vhost/virtio_net.c | 37 ++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5e1a1a727..f54642c2d 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -135,19 +135,10 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
                        struct vhost_virtqueue *vq)
 {
        int i;
-       uint16_t used_idx = vq->last_used_idx;
+       uint16_t head_flags, head_idx = vq->last_used_idx;
 
-       /* Split loop in two to save memory barriers */
-       for (i = 0; i < vq->shadow_used_idx; i++) {
-               vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
-               vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
-
-               used_idx += vq->shadow_used_packed[i].count;
-               if (used_idx >= vq->size)
-                       used_idx -= vq->size;
-       }
-
-       rte_smp_wmb();
+       if (unlikely(vq->shadow_used_idx == 0))
+               return;
 
        for (i = 0; i < vq->shadow_used_idx; i++) {
                uint16_t flags;
@@ -165,12 +156,22 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
                        flags &= ~VRING_DESC_F_AVAIL;
                }
 
-               vq->desc_packed[vq->last_used_idx].flags = flags;
+               vq->desc_packed[vq->last_used_idx].id =
+                       vq->shadow_used_packed[i].id;
+               vq->desc_packed[vq->last_used_idx].len =
+                       vq->shadow_used_packed[i].len;
+
+               if (i > 0) {
+                       vq->desc_packed[vq->last_used_idx].flags = flags;
 
-               vhost_log_cache_used_vring(dev, vq,
+                       vhost_log_cache_used_vring(dev, vq,
                                        vq->last_used_idx *
                                        sizeof(struct vring_packed_desc),
                                        sizeof(struct vring_packed_desc));
+               } else {
+                       head_idx = vq->last_used_idx;
+                       head_flags = flags;
+               }
 
                vq->last_used_idx += vq->shadow_used_packed[i].count;
                if (vq->last_used_idx >= vq->size) {
@@ -180,7 +181,15 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
        }
 
        rte_smp_wmb();
+
+       vq->desc_packed[head_idx].flags = head_flags;
        vq->shadow_used_idx = 0;
+
+       vhost_log_cache_used_vring(dev, vq,
+                               head_idx *
+                               sizeof(struct vring_packed_desc),
+                               sizeof(struct vring_packed_desc));
+
        vhost_log_cache_sync(dev, vq);
 }
 
-- 
2.17.2

Reply via email to