Instead of writing back descriptor chains in order, let's write the first chain's flags last in order to improve batching. The driver processes used descriptors in order starting from the first chain's flags, so deferring that single write exposes the whole batch to the guest at once.

With Kernel's pktgen benchmark, ~3% performance gain is measured.

Signed-off-by: Maxime Coquelin <maxime.coque...@redhat.com>
---
 lib/librte_vhost/virtio_net.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8c657a101..66ccd3c35 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 {
 	int i;
 	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
 
 	/* Split loop in two to save memory barriers */
 	for (i = 0; i < vq->shadow_used_idx; i++) {
@@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 			flags &= ~VRING_DESC_F_AVAIL;
 		}
 
-		vq->desc_packed[vq->last_used_idx].flags = flags;
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
 
-		vhost_log_cache_used_vring(dev, vq,
+			vhost_log_cache_used_vring(dev, vq,
 					vq->last_used_idx *
 					sizeof(struct vring_packed_desc),
 					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
 
 		vq->last_used_idx += vq->shadow_used_packed[i].count;
 		if (vq->last_used_idx >= vq->size) {
@@ -140,7 +147,15 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
+	vq->desc_packed[head_idx].flags = head_flags;
+
 	rte_smp_wmb();
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
 	vq->shadow_used_idx = 0;
 	vhost_log_cache_sync(dev, vq);
 }
-- 
2.17.2
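
For illustration only, not part of the patch: below is a minimal, self-contained sketch of the deferred head-flags write-back idea. The types, the flush_used() helper and the wmb() macro are simplified stand-ins invented for this example (they are not the DPDK definitions, and the barrier placement is the textbook ordering rather than the patch's exact code): every descriptor in the batch is written first, and only then is the first chain's flags word flipped.

/* Sketch of "write the head descriptor's flags last" batching. */
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE    8
#define DESC_F_USED  (1u << 15)
#define DESC_F_AVAIL (1u << 7)

struct desc {
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

/* Stand-in for a store barrier; a compiler barrier is enough for this demo. */
#define wmb() __asm__ __volatile__("" ::: "memory")

/* Write back a batch of used descriptors, exposing the head entry last. */
static void
flush_used(struct desc *ring, uint16_t start, const uint16_t *ids,
	   const uint32_t *lens, int n)
{
	uint16_t head_idx = start;
	uint16_t head_flags = 0;
	uint16_t idx = start;
	int i;

	for (i = 0; i < n; i++) {
		uint16_t flags = DESC_F_USED | DESC_F_AVAIL;

		ring[idx].id = ids[i];
		ring[idx].len = lens[i];

		if (i == 0) {
			/*
			 * Defer the first entry's flags: in a real packed
			 * ring the driver polls this field to detect that
			 * descriptors have been used.
			 */
			head_idx = idx;
			head_flags = flags;
		} else {
			ring[idx].flags = flags;
		}
		idx = (idx + 1) % RING_SIZE;
	}

	/* Make all other writes visible before flipping the head flags. */
	wmb();
	ring[head_idx].flags = head_flags;
}

int main(void)
{
	struct desc ring[RING_SIZE] = {{0}};
	uint16_t ids[3] = {4, 5, 6};
	uint32_t lens[3] = {64, 128, 256};

	flush_used(ring, 0, ids, lens, 3);

	for (int i = 0; i < 3; i++)
		printf("desc %d: id=%u len=%u flags=0x%x\n", i,
		       (unsigned)ring[i].id, (unsigned)ring[i].len,
		       (unsigned)ring[i].flags);
	return 0;
}

Because the consumer only starts processing at the first entry, flipping that one flags word last means it can never observe a partially written batch, and the producer's stores to the other descriptors can be freely coalesced.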