On Thu, Dec 20, 2018 at 11:00:22AM +0100, Maxime Coquelin wrote:
> Instead of writing back descriptor chains in order, let's
> write the first chain's flags last in order to improve batching.
> 
> With the kernel's pktgen benchmark, a ~3% performance gain is measured.
> 
> Signed-off-by: Maxime Coquelin <maxime.coque...@redhat.com>
> ---
>  lib/librte_vhost/virtio_net.c | 19 +++++++++++++++++--
>  1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8c657a101..66ccd3c35 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  {
>       int i;
>       uint16_t used_idx = vq->last_used_idx;
> +     uint16_t head_idx = vq->last_used_idx;
> +     uint16_t head_flags = 0;
>  
>       /* Split loop in two to save memory barriers */
>       for (i = 0; i < vq->shadow_used_idx; i++) {
> @@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>                       flags &= ~VRING_DESC_F_AVAIL;
>               }
>  
> -             vq->desc_packed[vq->last_used_idx].flags = flags;
> +             if (i > 0) {
> +                     vq->desc_packed[vq->last_used_idx].flags = flags;
>  
> -             vhost_log_cache_used_vring(dev, vq,
> +                     vhost_log_cache_used_vring(dev, vq,
>                                       vq->last_used_idx *
>                                       sizeof(struct vring_packed_desc),
>                                       sizeof(struct vring_packed_desc));
> +             } else {
> +                     head_idx = vq->last_used_idx;
> +                     head_flags = flags;
> +             }
>  
>               vq->last_used_idx += vq->shadow_used_packed[i].count;
>               if (vq->last_used_idx >= vq->size) {
> @@ -140,7 +147,15 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>               }
>       }
>  
> +     vq->desc_packed[head_idx].flags = head_flags;
> +
>       rte_smp_wmb();
> +
> +     vhost_log_cache_used_vring(dev, vq,
> +                             head_idx *
> +                             sizeof(struct vring_packed_desc),
> +                             sizeof(struct vring_packed_desc));
> +
>       vq->shadow_used_idx = 0;
>       vhost_log_cache_sync(dev, vq);

How about moving rte_smp_wmb() into the logging functions?
That way it's free when logging is disabled, even on Arm...

>  }
> -- 
> 2.17.2

Reply via email to