Re: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path registration API

Maxime Coquelin Thu, 25 Jun 2020 06:43:25 -0700

On 6/18/20 7:50 AM, Liu, Yong wrote:
> Thanks, Patrick. Some comments are inline.
> 
>> -----Original Message-----
>> From: dev <dev-boun...@dpdk.org> On Behalf Of patrick...@intel.com
>> Sent: Thursday, June 11, 2020 6:02 PM
>> To: dev@dpdk.org; maxime.coque...@redhat.com; Xia, Chenbo
>> <chenbo....@intel.com>; Wang, Zhihong <zhihong.w...@intel.com>; Ye,
>> Xiaolong <xiaolong...@intel.com>
>> Cc: Fu, Patrick <patrick...@intel.com>; Jiang, Cheng1
>> <cheng1.ji...@intel.com>; Liang, Cunming <cunming.li...@intel.com>
>> Subject: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
>> registration API
>>
>> From: Patrick <patrick...@intel.com>
>>
>> This patch introduces registration/un-registration APIs
>> for async data path together with all required data
>> structures and DMA callback function proto-types.
>>
>> Signed-off-by: Patrick <patrick...@intel.com>
>> ---
>>  lib/librte_vhost/Makefile          |   3 +-
>>  lib/librte_vhost/rte_vhost.h       |   1 +
>>  lib/librte_vhost/rte_vhost_async.h | 134
>> +++++++++++++++++++++++++++++++++++++
>>  lib/librte_vhost/socket.c          |  20 ++++++
>>  lib/librte_vhost/vhost.c           |  74 +++++++++++++++++++-
>>  lib/librte_vhost/vhost.h           |  30 ++++++++-
>>  lib/librte_vhost/vhost_user.c      |  28 ++++++--
>>  7 files changed, 283 insertions(+), 7 deletions(-)
>>  create mode 100644 lib/librte_vhost/rte_vhost_async.h
>>
>> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
>> index e592795..3aed094 100644
>> --- a/lib/librte_vhost/Makefile
>> +++ b/lib/librte_vhost/Makefile
>> @@ -41,7 +41,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c
>> iotlb.c socket.c vhost.c \
>>                                      vhost_user.c virtio_net.c vdpa.c
>>
>>  # install includes
>> -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
>> +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
>> \
>> +                                            rte_vhost_async.h
>>
> Hi Patrick,
> Please also update meson build for newly added file.
> 
> Thanks,
> Marvin
> 
>>  # only compile vhost crypto when cryptodev is enabled
>>  ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
>> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
>> index d43669f..cec4d07 100644
>> --- a/lib/librte_vhost/rte_vhost.h
>> +++ b/lib/librte_vhost/rte_vhost.h
>> @@ -35,6 +35,7 @@
>>  #define RTE_VHOST_USER_EXTBUF_SUPPORT       (1ULL << 5)
>>  /* support only linear buffers (no chained mbufs) */
>>  #define RTE_VHOST_USER_LINEARBUF_SUPPORT    (1ULL << 6)
>> +#define RTE_VHOST_USER_ASYNC_COPY   (1ULL << 7)
>>
>>  /** Protocol features. */
>>  #ifndef VHOST_USER_PROTOCOL_F_MQ
>> diff --git a/lib/librte_vhost/rte_vhost_async.h
>> b/lib/librte_vhost/rte_vhost_async.h
>> new file mode 100644
>> index 0000000..82f2ebe
>> --- /dev/null
>> +++ b/lib/librte_vhost/rte_vhost_async.h
>> @@ -0,0 +1,134 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2018 Intel Corporation
>> + */
> 
> s/2018/2020/ 
> 
>> +
>> +#ifndef _RTE_VHOST_ASYNC_H_
>> +#define _RTE_VHOST_ASYNC_H_
>> +
>> +#include "rte_vhost.h"
>> +
>> +/**
>> + * iovec iterator
>> + */
>> +struct iov_it {
>> +    /** offset to the first byte of interesting data */
>> +    size_t offset;
>> +    /** total bytes of data in this iterator */
>> +    size_t count;
>> +    /** pointer to the iovec array */
>> +    struct iovec *iov;
>> +    /** number of iovec in this iterator */
>> +    unsigned long nr_segs;
>> +};
> 
> Patrick,
> I think a structure named "it" is too generic to be understood; please use a
> more meaningful name like "iov_iter".

I would also add that, being part of the Vhost API, it needs to be
prefixed with rte_vhost_.

This comment applies to all structures in this header.

>> +
>> +/**
>> + * dma transfer descriptor pair
>> + */
>> +struct dma_trans_desc {
>> +    /** source memory iov_it */
>> +    struct iov_it *src;
>> +    /** destination memory iov_it */
>> +    struct iov_it *dst;
>> +};
>> +
> 
> This patch series is named async copy, and DMA is just one async copy method,
> supplied by the underlying hardware.
> IMHO, the structure would be better named "async_copy_desc", which matches the
> overall concept.
> 
>> +/**
>> + * dma transfer status
>> + */
>> +struct dma_trans_status {
>> +    /** An array of application specific data for source memory */
>> +    uintptr_t *src_opaque_data;
>> +    /** An array of application specific data for destination memory */
>> +    uintptr_t *dst_opaque_data;
>> +};
>> +
> Same as previous comment.
> 
>> +/**
>> + * dma operation callbacks to be implemented by applications
>> + */
>> +struct rte_vhost_async_channel_ops {
>> +    /**
>> +     * instruct a DMA channel to perform copies for a batch of packets
>> +     *
>> +     * @param vid
>> +     *  id of vhost device to perform data copies
>> +     * @param queue_id
>> +     *  queue id to perform data copies
>> +     * @param descs
>> +     *  an array of DMA transfer memory descriptors
>> +     * @param opaque_data
>> +     *  opaque data pair sending to DMA engine
>> +     * @param count
>> +     *  number of elements in the "descs" array
>> +     * @return
>> +     *  -1 on failure, number of descs processed on success
>> +     */
>> +    int (*transfer_data)(int vid, uint16_t queue_id,
>> +            struct dma_trans_desc *descs,
>> +            struct dma_trans_status *opaque_data,
>> +            uint16_t count);
>> +    /**
>> +     * check copy-completed packets from a DMA channel
>> +     * @param vid
>> +     *  id of vhost device to check copy completion
>> +     * @param queue_id
>> +     *  queue id to check copyp completion
>> +     * @param opaque_data
>> +     *  buffer to receive the opaque data pair from DMA engine
>> +     * @param max_packets
>> +     *  max number of packets could be completed
>> +     * @return
>> +     *  -1 on failure, number of iov segments completed on success
>> +     */
>> +    int (*check_completed_copies)(int vid, uint16_t queue_id,
>> +            struct dma_trans_status *opaque_data,
>> +            uint16_t max_packets);
>> +};
>> +
>> +/**
>> + *  dma channel feature bit definition
>> + */
>> +struct dma_channel_features {
>> +    union {
>> +            uint32_t intval;
>> +            struct {
>> +                    uint32_t inorder:1;
>> +                    uint32_t resvd0115:15;
>> +                    uint32_t threshold:12;
>> +                    uint32_t resvd2831:4;
>> +            };
>> +    };
>> +};
>> +
> 
> Naming the feature bits "intval" may cause confusion; why not just use its
> meaning, like "engine_features"?
> I'm not sure whether the format "resvd0115" matches DPDK coding style. In my mind,
> DPDK would use resvd_0 and resvd_1 for two reserved elements.
> 
>> +/**
>> + * register a dma channel for vhost
>> + *
>> + * @param vid
>> + *  vhost device id DMA channel to be attached to
>> + * @param queue_id
>> + *  vhost queue id DMA channel to be attached to
>> + * @param features
>> + *  DMA channel feature bit
>> + *    b0       : DMA supports inorder data transfer
>> + *    b1  - b15: reserved
>> + *    b16 - b27: Packet length threshold for DMA transfer
>> + *    b28 - b31: reserved
>> + * @param ops
>> + *  DMA operation callbacks
>> + * @return
>> + *  0 on success, -1 on failures
>> + */
>> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>> +    uint32_t features, struct rte_vhost_async_channel_ops *ops);
>> +
>> +/**
>> + * unregister a dma channel for vhost
>> + *
>> + * @param vid
>> + *  vhost device id DMA channel to be detached
>> + * @param queue_id
>> + *  vhost queue id DMA channel to be detached
>> + * @return
>> + *  0 on success, -1 on failures
>> + */
>> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
>> +
>> +#endif /* _RTE_VDPA_H_ */
>> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
>> index 0a66ef9..f817783 100644
>> --- a/lib/librte_vhost/socket.c
>> +++ b/lib/librte_vhost/socket.c
>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>      bool use_builtin_virtio_net;
>>      bool extbuf;
>>      bool linearbuf;
>> +    bool async_copy;
>>
>>      /*
>>       * The "supported_features" indicates the feature bits the
>> @@ -210,6 +211,7 @@ struct vhost_user {
>>      size_t size;
>>      struct vhost_user_connection *conn;
>>      int ret;
>> +    struct virtio_net *dev;
>>
>>      if (vsocket == NULL)
>>              return;
>> @@ -241,6 +243,13 @@ struct vhost_user {
>>      if (vsocket->linearbuf)
>>              vhost_enable_linearbuf(vid);
>>
>> +    if (vsocket->async_copy) {
>> +            dev = get_device(vid);
>> +
>> +            if (dev)
>> +                    dev->async_copy = 1;
>> +    }
>> +
> 
> IMHO, the user can choose which queues utilize async copy, as backend hardware
> resources are limited.
> So should the async_copy enable flag be saved in the virtqueue structure?
> 
>>      VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>
>>      if (vsocket->notify_ops->new_connection) {
>> @@ -891,6 +900,17 @@ struct vhost_user_reconnect_list {
>>              goto out_mutex;
>>      }
>>
>> +    vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>> +
>> +    if (vsocket->async_copy &&
>> +            (flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
>> +            RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
>> +            VHOST_LOG_CONFIG(ERR, "error: enabling async copy and
>> IOMMU "
>> +                    "or post-copy feature simultaneously is not "
>> +                    "supported\n");
>> +            goto out_mutex;
>> +    }
>> +
>>      /*
>>       * Set the supported features correctly for the builtin vhost-user
>>       * net driver.
>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
>> index 0266318..e6b688a 100644
>> --- a/lib/librte_vhost/vhost.c
>> +++ b/lib/librte_vhost/vhost.c
>> @@ -332,8 +332,13 @@
>>  {
>>      if (vq_is_packed(dev))
>>              rte_free(vq->shadow_used_packed);
>> -    else
>> +    else {
>>              rte_free(vq->shadow_used_split);
>> +            if (vq->async_pkts_pending)
>> +                    rte_free(vq->async_pkts_pending);
>> +            if (vq->async_pending_info)
>> +                    rte_free(vq->async_pending_info);
>> +    }
>>      rte_free(vq->batch_copy_elems);
>>      rte_mempool_free(vq->iotlb_pool);
>>      rte_free(vq);
>> @@ -1527,3 +1532,70 @@ int rte_vhost_extern_callback_register(int vid,
>>      if (vhost_data_log_level >= 0)
>>              rte_log_set_level(vhost_data_log_level,
>> RTE_LOG_WARNING);
>>  }
>> +
>> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>> +                                    uint32_t features,
>> +                                    struct rte_vhost_async_channel_ops
>> *ops)
>> +{
>> +    struct vhost_virtqueue *vq;
>> +    struct virtio_net *dev = get_device(vid);
>> +    struct dma_channel_features f;
>> +
>> +    if (dev == NULL || ops == NULL)
>> +            return -1;
>> +
>> +    f.intval = features;
>> +
>> +    vq = dev->virtqueue[queue_id];
>> +
>> +    if (vq == NULL)
>> +            return -1;
>> +
>> +    /** packed queue is not supported */
>> +    if (vq_is_packed(dev) || !f.inorder)
>> +            return -1;
>> +
> Virtio already has an in_order concept; these two names are so alike that they can
> easily be mixed up. Please consider how to distinguish them.
> 
>> +    if (ops->check_completed_copies == NULL ||
>> +            ops->transfer_data == NULL)
>> +            return -1;
>> +
> 
> The previous error conditions are unlikely to be true; the unlikely() macro may be
> helpful for understanding.
> 
>> +    rte_spinlock_lock(&vq->access_lock);
>> +
>> +    vq->async_ops.check_completed_copies = ops-
>>> check_completed_copies;
>> +    vq->async_ops.transfer_data = ops->transfer_data;
>> +
>> +    vq->async_inorder = f.inorder;
>> +    vq->async_threshold = f.threshold;
>> +
>> +    vq->async_registered = true;
>> +
>> +    rte_spinlock_unlock(&vq->access_lock);
>> +
>> +    return 0;
>> +}
>> +
>> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>> +{
>> +    struct vhost_virtqueue *vq;
>> +    struct virtio_net *dev = get_device(vid);
>> +
>> +    if (dev == NULL)
>> +            return -1;
>> +
>> +    vq = dev->virtqueue[queue_id];
>> +
>> +    if (vq == NULL)
>> +            return -1;
>> +
>> +    rte_spinlock_lock(&vq->access_lock);
>> +
>> +    vq->async_ops.transfer_data = NULL;
>> +    vq->async_ops.check_completed_copies = NULL;
>> +
>> +    vq->async_registered = false;
>> +
>> +    rte_spinlock_unlock(&vq->access_lock);
>> +
>> +    return 0;
>> +}
>> +
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index df98d15..a7fbe23 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -23,6 +23,8 @@
>>  #include "rte_vhost.h"
>>  #include "rte_vdpa.h"
>>
>> +#include "rte_vhost_async.h"
>> +
>>  /* Used to indicate that the device is running on a data core */
>>  #define VIRTIO_DEV_RUNNING 1
>>  /* Used to indicate that the device is ready to operate */
>> @@ -39,6 +41,11 @@
>>
>>  #define VHOST_LOG_CACHE_NR 32
>>
>> +#define MAX_PKT_BURST 32
>> +
>> +#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
>> +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
>> +
>>  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)    \
>>      ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED |
>> VRING_DESC_F_WRITE) : \
>>              VRING_DESC_F_WRITE)
>> @@ -200,6 +207,25 @@ struct vhost_virtqueue {
>>      TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>>      int                             iotlb_cache_nr;
>>      TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
>> +
>> +    /* operation callbacks for async dma */
>> +    struct rte_vhost_async_channel_ops      async_ops;
>> +
>> +    struct iov_it it_pool[VHOST_MAX_ASYNC_IT];
>> +    struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
>> +
>> +    /* async data transfer status */
>> +    uintptr_t       **async_pkts_pending;
>> +    #define         ASYNC_PENDING_INFO_N_MSK 0xFFFF
>> +    #define         ASYNC_PENDING_INFO_N_SFT 16
>> +    uint64_t        *async_pending_info;
>> +    uint16_t        async_pkts_idx;
>> +    uint16_t        async_pkts_inflight_n;
>> +
>> +    /* vq async features */
>> +    bool            async_inorder;
>> +    bool            async_registered;
>> +    uint16_t        async_threshold;
>>  } __rte_cache_aligned;
>>
>>  /* Old kernels have no such macros defined */
>> @@ -353,6 +379,7 @@ struct virtio_net {
>>      int16_t                 broadcast_rarp;
>>      uint32_t                nr_vring;
>>      int                     dequeue_zero_copy;
>> +    int                     async_copy;
>>      int                     extbuf;
>>      int                     linearbuf;
>>      struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>> @@ -702,7 +729,8 @@ uint64_t translate_log_addr(struct virtio_net *dev,
>> struct vhost_virtqueue *vq,
>>      /* Don't kick guest if we don't reach index specified by guest. */
>>      if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
>>              uint16_t old = vq->signalled_used;
>> -            uint16_t new = vq->last_used_idx;
>> +            uint16_t new = vq->async_pkts_inflight_n ?
>> +                                    vq->used->idx:vq->last_used_idx;
>>              bool signalled_used_valid = vq->signalled_used_valid;
>>
>>              vq->signalled_used = new;
>> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
>> index 84bebad..d7600bf 100644
>> --- a/lib/librte_vhost/vhost_user.c
>> +++ b/lib/librte_vhost/vhost_user.c
>> @@ -464,12 +464,25 @@
>>      } else {
>>              if (vq->shadow_used_split)
>>                      rte_free(vq->shadow_used_split);
>> +            if (vq->async_pkts_pending)
>> +                    rte_free(vq->async_pkts_pending);
>> +            if (vq->async_pending_info)
>> +                    rte_free(vq->async_pending_info);
>> +
>>              vq->shadow_used_split = rte_malloc(NULL,
>>                              vq->size * sizeof(struct vring_used_elem),
>>                              RTE_CACHE_LINE_SIZE);
>> -            if (!vq->shadow_used_split) {
>> +            vq->async_pkts_pending = rte_malloc(NULL,
>> +                            vq->size * sizeof(uintptr_t),
>> +                            RTE_CACHE_LINE_SIZE);
>> +            vq->async_pending_info = rte_malloc(NULL,
>> +                            vq->size * sizeof(uint64_t),
>> +                            RTE_CACHE_LINE_SIZE);
>> +            if (!vq->shadow_used_split ||
>> +                    !vq->async_pkts_pending ||
>> +                    !vq->async_pending_info) {
>>                      VHOST_LOG_CONFIG(ERR,
>> -                                    "failed to allocate memory for
>> shadow used ring.\n");
>> +                                    "failed to allocate memory for vq
>> internal data.\n");
> 
> If async copy is not enabled, there is no need to allocate the related
> structures.
> 
>>                      return RTE_VHOST_MSG_RESULT_ERR;
>>              }
>>      }
>> @@ -1147,7 +1160,8 @@
>>                      goto err_mmap;
>>              }
>>
>> -            populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
>> +            populate = (dev->dequeue_zero_copy || dev->async_copy) ?
>> +                    MAP_POPULATE : 0;
>>              mmap_addr = mmap(NULL, mmap_size, PROT_READ |
>> PROT_WRITE,
>>                               MAP_SHARED | populate, fd, 0);
>>
>> @@ -1162,7 +1176,7 @@
>>              reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>>                                    mmap_offset;
>>
>> -            if (dev->dequeue_zero_copy)
>> +            if (dev->dequeue_zero_copy || dev->async_copy)
>>                      if (add_guest_pages(dev, reg, alignment) < 0) {
>>                              VHOST_LOG_CONFIG(ERR,
>>                                      "adding guest pages to region %u
>> failed.\n",
>> @@ -1945,6 +1959,12 @@ static int vhost_user_set_vring_err(struct
>> virtio_net **pdev __rte_unused,
>>      } else {
>>              rte_free(vq->shadow_used_split);
>>              vq->shadow_used_split = NULL;
>> +            if (vq->async_pkts_pending)
>> +                    rte_free(vq->async_pkts_pending);
>> +            if (vq->async_pending_info)
>> +                    rte_free(vq->async_pending_info);
>> +            vq->async_pkts_pending = NULL;
>> +            vq->async_pending_info = NULL;
>>      }
>>
>>      rte_free(vq->batch_copy_elems);
>> --
>> 1.8.3.1
>
Re: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path registration API

Reply via email to