On 12.12.2018 11:24, Maxime Coquelin wrote:
> Instead of writing back descriptors chains in order, let's
> write the first chain flags last in order to improve batching.
> 
> With Kernel's pktgen benchmark, ~3% performance gain is measured.
> 
> Signed-off-by: Maxime Coquelin <maxime.coque...@redhat.com>
> ---
>  lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
>  1 file changed, 24 insertions(+), 15 deletions(-)
> 

Hi.
I did some rough testing on my ARMv8 system with this patch and with v1 of it.
Here is the performance difference with current master:
    v1: +1.1 %
    v2: -3.6 %

So, write barriers are quite heavy in practice.

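To illustrate my understanding of the ordering v2 goes for, here is a rough
standalone sketch (my own simplified types and a plain compiler-barrier
stand-in for rte_smp_wmb(), not the actual librte_vhost code): id/len of each
chain are written first, flags of the non-head chains are exposed as we go,
and the head chain's flags are flipped only at the very end, so the driver
sees the whole batch at once.

#include <stdint.h>

struct desc {              /* stand-in for struct vring_packed_desc */
	uint64_t addr;
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

struct shadow_entry {      /* stand-in for a shadow used ring entry */
	uint16_t id;
	uint32_t len;
	uint16_t flags;    /* flags already computed; wrap handling omitted */
};

/* illustrative stand-in for rte_smp_wmb(); compiler barrier only */
#define wmb() __asm__ __volatile__("" ::: "memory")

static void
flush_shadow_sketch(struct desc *ring, const struct shadow_entry *shadow,
		    int n, uint16_t used_idx)
{
	uint16_t head_idx = used_idx;
	uint16_t head_flags = 0;
	int i;

	for (i = 0; i < n; i++) {
		ring[used_idx].id  = shadow[i].id;
		ring[used_idx].len = shadow[i].len;

		wmb();  /* id/len must be visible before the flags */

		if (i > 0) {
			/* non-head chains can be exposed immediately */
			ring[used_idx].flags = shadow[i].flags;
		} else {
			/* remember the head chain, expose it last */
			head_idx = used_idx;
			head_flags = shadow[i].flags;
		}
		used_idx++;  /* ring size wrap-around omitted for brevity */
	}

	/* only now does the driver see the whole batch as used */
	ring[head_idx].flags = head_flags;
}
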
My test case is three instances of testpmd on the same host (with v11 from
Jens):

    txonly (virtio_user0) --> fwd mode io (vhost0, vhost1) --> rxonly (virtio_user1)

Best regards, Ilya Maximets.

> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5e1a1a727..c0b3d1137 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -135,19 +135,10 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>                       struct vhost_virtqueue *vq)
>  {
>       int i;
> -     uint16_t used_idx = vq->last_used_idx;
> +     uint16_t head_flags, head_idx = vq->last_used_idx;
>  
> -     /* Split loop in two to save memory barriers */
> -     for (i = 0; i < vq->shadow_used_idx; i++) {
> -             vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
> -             vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
> -
> -             used_idx += vq->shadow_used_packed[i].count;
> -             if (used_idx >= vq->size)
> -                     used_idx -= vq->size;
> -     }
> -
> -     rte_smp_wmb();
> +     if (unlikely(vq->shadow_used_idx == 0))
> +             return;
>  
>       for (i = 0; i < vq->shadow_used_idx; i++) {
>               uint16_t flags;
> @@ -165,12 +156,24 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>                       flags &= ~VRING_DESC_F_AVAIL;
>               }
>  
> -             vq->desc_packed[vq->last_used_idx].flags = flags;
> +             vq->desc_packed[vq->last_used_idx].id =
> +                     vq->shadow_used_packed[i].id;
> +             vq->desc_packed[vq->last_used_idx].len =
> +                     vq->shadow_used_packed[i].len;
> +
> +             rte_smp_wmb();
>  
> -             vhost_log_cache_used_vring(dev, vq,
> +             if (i > 0) {
> +                     vq->desc_packed[vq->last_used_idx].flags = flags;
> +
> +                     vhost_log_cache_used_vring(dev, vq,
>                                       vq->last_used_idx *
>                                       sizeof(struct vring_packed_desc),
>                                       sizeof(struct vring_packed_desc));
> +             } else {
> +                     head_idx = vq->last_used_idx;
> +                     head_flags = flags;
> +             }
>  
>               vq->last_used_idx += vq->shadow_used_packed[i].count;
>               if (vq->last_used_idx >= vq->size) {
> @@ -179,8 +182,14 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>               }
>       }
>  
> -     rte_smp_wmb();
> +     vq->desc_packed[head_idx].flags = head_flags;
>       vq->shadow_used_idx = 0;
> +
> +     vhost_log_cache_used_vring(dev, vq,
> +                             head_idx *
> +                             sizeof(struct vring_packed_desc),
> +                             sizeof(struct vring_packed_desc));
> +
>       vhost_log_cache_sync(dev, vq);
>  }
>  
> 
