On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
> This patch caches all dirty pages logging until the used ring index
> is updated. These dirty pages won't be accessed by the guest as
> long as the host doesn't give them back to it by updating the
> index.
> 
> The goal of this optimization is to fix a performance regression
> introduced when the vhost library started to use atomic operations
> to set bits in the shared dirty log map. While the fix was valid
> as previous implementation wasn't safe against concurent accesses,

Typo: concurent

> contention was induced.
> 
> With this patch, during migration, we have:
> 1. Less atomic operations as only a single atomic OR operation
> per 32 or 64 (depending on CPU) pages.
> 2. Less atomic operations as during a burst, the same page will
> be marked dirty only once.
> 3. Less write memory barriers.
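
To spell out the arithmetic behind point 1 (assuming the usual
4 KiB VHOST_LOG_PAGE and a 64-bit unsigned long):

        pages covered per cache entry = 8 * sizeof(unsigned long) = 64
        guest memory covered          = 64 * 4096 B = 256 KiB

so a burst of writes within one such region costs a single atomic
OR at sync time instead of up to 64 individual ones.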
> 
> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
> 
> Cc: sta...@dpdk.org
> 
> Cc: Tiwei Bie <tiwei....@intel.com>
> Suggested-by: Michael S. Tsirkin <m...@redhat.com>
> Signed-off-by: Maxime Coquelin <maxime.coque...@redhat.com>

This is a nice approach! Thanks for the work!

> ---
>  lib/librte_vhost/vhost.h      | 113 ++++++++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/virtio_net.c |  29 +++++++----
>  2 files changed, 132 insertions(+), 10 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 891978131..8f6a41d7e 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -36,6 +36,8 @@
>  
>  #define BUF_VECTOR_MAX 256
>  
> +#define VHOST_LOG_CACHE_NR 32
> +
>  /**
>   * Structure contains buffer address, length and descriptor index
>   * from vring to do scatter RX.
> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>       uint64_t log_addr;
>  };
>  
> +/*
> + * Structure that contains the info for batched dirty logging.
> + */
> +struct log_cache_entry {
> +     uint32_t offset;
> +     unsigned long val;
> +};
> +
>  /**
>   * Structure contains variables relevant to RX/TX virtqueues.
>   */
> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>       struct batch_copy_elem  *batch_copy_elems;
>       uint16_t                batch_copy_nb_elems;
>  
> +     struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
> +     uint16_t log_cache_nb_elem;
> +
>       rte_rwlock_t    iotlb_lock;
>       rte_rwlock_t    iotlb_pending_lock;
>       struct rte_mempool *iotlb_pool;
> @@ -309,7 +322,15 @@ struct virtio_net {
>  static __rte_always_inline void
>  vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>  {
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)

Just curious, is there any reference explaining why this particular
version was chosen? Thanks!
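
For anyone following along: GCC_VERSION here is derived from the
compiler's own version macros, so 70100 corresponds to GCC 7.1.0.
A minimal sketch of the encoding (my reading of rte_common.h, not
a verbatim copy):

        /*
         * GCC encodes its version as major * 10000 + minor * 100 +
         * patchlevel, so GCC 7.1.0 yields 70100.
         */
        #define GCC_VERSION (__GNUC__ * 10000 + \
                        __GNUC_MINOR__ * 100 + \
                        __GNUC_PATCHLEVEL__)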

> +             /*
> +              * __sync_ built-ins are deprecated, but __atomic_ ones
> +              * are sub-optimized in older GCC versions.
> +              */

The indentation isn't right (only one tab is needed here).

>       __sync_fetch_and_or_8(addr, (1U << nr));
> +#else
> +     __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
> +#endif
>  }
>  
>  static __rte_always_inline void
> @@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>       }
>  }
>  
> +static __rte_always_inline void
> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
> +     unsigned long *log_base;
> +     int i;
> +
> +     if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> +                !dev->log_base))
> +             return;
> +
> +     log_base = (unsigned long *)(uintptr_t)dev->log_base;
> +
> +     /* To make sure guest memory updates are committed before logging */
> +     rte_smp_wmb();
> +
> +     for (i = 0; i < vq->log_cache_nb_elem; i++) {
> +             struct log_cache_entry *elem = vq->log_cache + i;
> +
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> +             /*
> +              * '__sync' builtins are deprecated, but '__atomic' ones
> +              * are sub-optimized in older GCC versions.
> +              */
> +             __sync_fetch_and_or(log_base + elem->offset, elem->val);
> +#else
> +             __atomic_fetch_or(log_base + elem->offset, elem->val,
> +                             __ATOMIC_RELAXED);
> +#endif
> +     }
> +
> +     vq->log_cache_nb_elem = 0;
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +                                      uint64_t page)
> +{
> +     uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
> +     uint32_t offset = page / (sizeof(unsigned long) << 3);
> +     int i;
> +
> +     for (i = 0; i < vq->log_cache_nb_elem; i++) {
> +             struct log_cache_entry *elem = vq->log_cache + i;
> +
> +             if (elem->offset == offset) {
> +                     elem->val |= (1U << bit_nr);

Since val is unsigned long now, we need to use 1UL.
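
A tiny self-contained illustration of the difference (hypothetical
values; assumes an LP64 target where unsigned long is 64 bits but
1U is a 32-bit int):

        #include <stdint.h>

        static void set_bit_demo(void)
        {
                unsigned long val = 0;
                uint32_t bit_nr = 40;  /* can reach 63 with 64-bit longs */

                /* val |= (1U << bit_nr);  undefined: shift >= 32 bits */
                val |= (1UL << bit_nr);  /* OK: shift done in 64 bits */
                (void)val;
        }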

> +                     return;
> +             }
> +     }
> +
> +     if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
> +             /*
> +              * No more room for a new log cache entry,
> +              * so write the dirty log map directly.
> +              */
> +             rte_smp_wmb();
> +             vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
> +
> +             return;
> +     }
> +
> +     vq->log_cache[i].offset = offset;
> +     vq->log_cache[i].val = (1U << bit_nr);

Ditto.

> +     vq->log_cache_nb_elem++;
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +                                       uint64_t addr, uint64_t len)

Tabs with an 8-space width are more widely used in DPDK. Per the
coding style document below,

https://github.com/DPDK/dpdk/blob/master/doc/guides/contributing/coding_style.rst

the indent width in most examples is 8 spaces, so it may be better
to keep using 8-space-wide tabs.

> +{
> +     uint64_t page;
> +
> +     if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> +                !dev->log_base || !len))
> +             return;
> +
> +     if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
> +             return;
> +
> +     page = addr / VHOST_LOG_PAGE;
> +     while (page * VHOST_LOG_PAGE < addr + len) {
> +             vhost_log_cache_page(dev, vq, page);
> +             page += 1;
> +     }
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +                  uint64_t offset, uint64_t len)
> +{
> +     vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
> +}
> +
>  static __rte_always_inline void
>  vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                    uint64_t offset, uint64_t len)
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index eed6b0227..76ec5f089 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>       rte_memcpy(&vq->used->ring[to],
>                       &vq->shadow_used_ring[from],
>                       size * sizeof(struct vring_used_elem));
> -     vhost_log_used_vring(dev, vq,
> +     vhost_log_cache_used_vring(dev, vq,
>                       offsetof(struct vring_used, ring[to]),
>                       size * sizeof(struct vring_used_elem));
>  }
> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  
>       rte_smp_wmb();
>  
> +     vhost_log_cache_sync(dev, vq);

Each time we call vhost_log_cache_sync(), there is already an
rte_smp_wmb() in the caller protecting the used->idx update, so
maybe there is no need for another rte_smp_wmb() inside
vhost_log_cache_sync().
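
For reference, a rough sketch of the resulting sequence in
flush_shadow_used_ring() as I read it (simplified, not the
verbatim code):

        do_flush_shadow_used_ring(dev, vq, ...); /* copies used ring
                                                  * entries, caching
                                                  * their dirty bits */
        rte_smp_wmb();                  /* orders ring writes before
                                         * the used->idx update */
        vhost_log_cache_sync(dev, vq);  /* flushes cached bits; currently
                                         * issues its own rte_smp_wmb() */
        *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;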

Best regards,
Tiwei Bie
