> -----Original Message-----
> From: Fu, Patrick <patrick...@intel.com>
> Sent: Thursday, June 18, 2020 5:09 PM
> To: Liu, Yong <yong....@intel.com>
> Cc: Jiang, Cheng1 <cheng1.ji...@intel.com>; Liang, Cunming
> <cunming.li...@intel.com>; dev@dpdk.org; maxime.coque...@redhat.com;
> Xia, Chenbo <chenbo....@intel.com>; Wang, Zhihong
> <zhihong.w...@intel.com>; Ye, Xiaolong <xiaolong...@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
> registration API
>
>
>
> > -----Original Message-----
> > From: Liu, Yong <yong....@intel.com>
> > Sent: Thursday, June 18, 2020 1:51 PM
> > To: Fu, Patrick <patrick...@intel.com>
> > Cc: Fu, Patrick <patrick...@intel.com>; Jiang, Cheng1
> > <cheng1.ji...@intel.com>; Liang, Cunming <cunming.li...@intel.com>;
> > dev@dpdk.org; maxime.coque...@redhat.com; Xia, Chenbo
> > <chenbo....@intel.com>; Wang, Zhihong <zhihong.w...@intel.com>; Ye,
> > Xiaolong <xiaolong...@intel.com>
> > Subject: RE: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
> > registration API
> >
> > Thanks, Patrick. Some comments are inline.
> >
> > > -----Original Message-----
> > > From: dev <dev-boun...@dpdk.org> On Behalf Of patrick...@intel.com
> > > Sent: Thursday, June 11, 2020 6:02 PM
> > > To: dev@dpdk.org; maxime.coque...@redhat.com; Xia, Chenbo
> > > <chenbo....@intel.com>; Wang, Zhihong <zhihong.w...@intel.com>;
> Ye,
> > > Xiaolong <xiaolong...@intel.com>
> > > Cc: Fu, Patrick <patrick...@intel.com>; Jiang, Cheng1
> > > <cheng1.ji...@intel.com>; Liang, Cunming <cunming.li...@intel.com>
> > > Subject: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
> > > registration API
> > >
> > > From: Patrick <patrick...@intel.com>
> > >
> > > This patch introduces registration/un-registration APIs for async data
> > > path together with all required data structures and DMA callback
> > > function prototypes.
> > >
> > > Signed-off-by: Patrick <patrick...@intel.com>
> > > ---
> > > lib/librte_vhost/Makefile          |   3 +-
> > > lib/librte_vhost/rte_vhost.h       |   1 +
> > > lib/librte_vhost/rte_vhost_async.h | 134 +++++++++++++++++++++++++++++++++++++
> > > lib/librte_vhost/socket.c          |  20 ++++++
> > > lib/librte_vhost/vhost.c           |  74 +++++++++++++++++++-
> > > lib/librte_vhost/vhost.h           |  30 ++++++++-
> > > lib/librte_vhost/vhost_user.c      |  28 ++++++--
> > > 7 files changed, 283 insertions(+), 7 deletions(-)
> > > create mode 100644 lib/librte_vhost/rte_vhost_async.h
> > >
> > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> > > index e592795..3aed094 100644
> > > --- a/lib/librte_vhost/Makefile
> > > +++ b/lib/librte_vhost/Makefile
> > > @@ -41,7 +41,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
> > > 					vhost_user.c virtio_net.c vdpa.c
> > >
> > > # install includes
> > > -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
> > > +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
> > > +	rte_vhost_async.h
> > >
> > Hi Patrick,
> > Please also update the meson build for the newly added file.
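> > E.g. a sketch, assuming the current headers list in
> > lib/librte_vhost/meson.build:
> >
> > headers = files('rte_vhost.h', 'rte_vdpa.h',
> > 	'rte_vhost_async.h')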
> >
> > Thanks,
> > Marvin
> >
> > > # only compile vhost crypto when cryptodev is enabled
> > > ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
> > > diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> > > index d43669f..cec4d07 100644
> > > --- a/lib/librte_vhost/rte_vhost.h
> > > +++ b/lib/librte_vhost/rte_vhost.h
> > > @@ -35,6 +35,7 @@
> > > #define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
> > > /* support only linear buffers (no chained mbufs) */
> > > #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
> > > +#define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
> > >
> > > /** Protocol features. */
> > > #ifndef VHOST_USER_PROTOCOL_F_MQ
> > > diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> > > new file mode 100644
> > > index 0000000..82f2ebe
> > > --- /dev/null
> > > +++ b/lib/librte_vhost/rte_vhost_async.h
> > > @@ -0,0 +1,134 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright(c) 2018 Intel Corporation
> > > + */
> >
> > s/2018/2020/
> >
> > > +
> > > +#ifndef _RTE_VHOST_ASYNC_H_
> > > +#define _RTE_VHOST_ASYNC_H_
> > > +
> > > +#include "rte_vhost.h"
> > > +
> > > +/**
> > > + * iovec iterator
> > > + */
> > > +struct iov_it {
> > > +	/** offset to the first byte of interesting data */
> > > +	size_t offset;
> > > +	/** total bytes of data in this iterator */
> > > +	size_t count;
> > > +	/** pointer to the iovec array */
> > > +	struct iovec *iov;
> > > +	/** number of iovec in this iterator */
> > > +	unsigned long nr_segs;
> > > +};
> >
> > Patrick,
> > I think a structure named "it" is too generic to be understood easily;
> > please use a more meaningful name like "iov_iter".
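> > E.g. a naming sketch with the same members:
> >
> > struct iov_iter {
> > 	size_t offset;
> > 	size_t count;
> > 	struct iovec *iov;
> > 	unsigned long nr_segs;
> > };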
> >
> > > +
> > > +/**
> > > + * dma transfer descriptor pair
> > > + */
> > > +struct dma_trans_desc {
> > > +	/** source memory iov_it */
> > > +	struct iov_it *src;
> > > +	/** destination memory iov_it */
> > > +	struct iov_it *dst;
> > > +};
> > > +
> >
> > This patch series is named async copy, and DMA is just one async copy
> > method supplied by the underlying hardware.
> > IMHO, the structure is better named "async_copy_desc", which matches the
> > overall concept.
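> > E.g. a naming sketch, keeping the members from the patch (and assuming
> > the iov_iter rename above):
> >
> > struct async_copy_desc {
> > 	struct iov_iter *src;
> > 	struct iov_iter *dst;
> > };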
> >
> > > +/**
> > > + * dma transfer status
> > > + */
> > > +struct dma_trans_status {
> > > +	/** An array of application specific data for source memory */
> > > +	uintptr_t *src_opaque_data;
> > > +	/** An array of application specific data for destination memory */
> > > +	uintptr_t *dst_opaque_data;
> > > +};
> > > +
> > Same as previous comment.
> >
> > > +/**
> > > + * dma operation callbacks to be implemented by applications
> > > + */
> > > +struct rte_vhost_async_channel_ops {
> > > +	/**
> > > +	 * instruct a DMA channel to perform copies for a batch of packets
> > > +	 *
> > > +	 * @param vid
> > > +	 *  id of vhost device to perform data copies
> > > +	 * @param queue_id
> > > +	 *  queue id to perform data copies
> > > +	 * @param descs
> > > +	 *  an array of DMA transfer memory descriptors
> > > +	 * @param opaque_data
> > > +	 *  opaque data pair sending to DMA engine
> > > +	 * @param count
> > > +	 *  number of elements in the "descs" array
> > > +	 * @return
> > > +	 *  -1 on failure, number of descs processed on success
> > > +	 */
> > > +	int (*transfer_data)(int vid, uint16_t queue_id,
> > > +			struct dma_trans_desc *descs,
> > > +			struct dma_trans_status *opaque_data,
> > > +			uint16_t count);
> > > +	/**
> > > +	 * check copy-completed packets from a DMA channel
> > > +	 *
> > > +	 * @param vid
> > > +	 *  id of vhost device to check copy completion
> > > +	 * @param queue_id
> > > +	 *  queue id to check copy completion
> > > +	 * @param opaque_data
> > > +	 *  buffer to receive the opaque data pair from DMA engine
> > > +	 * @param max_packets
> > > +	 *  max number of packets that could be completed
> > > +	 * @return
> > > +	 *  -1 on failure, number of iov segments completed on success
> > > +	 */
> > > +	int (*check_completed_copies)(int vid, uint16_t queue_id,
> > > +			struct dma_trans_status *opaque_data,
> > > +			uint16_t max_packets);
> > > +};
> > > +
> > > +/**
> > > + * dma channel feature bit definition
> > > + */
> > > +struct dma_channel_features {
> > > +	union {
> > > +		uint32_t intval;
> > > +		struct {
> > > +			uint32_t inorder:1;
> > > +			uint32_t resvd0115:15;
> > > +			uint32_t threshold:12;
> > > +			uint32_t resvd2831:4;
> > > +		};
> > > +	};
> > > +};
> > > +
> >
> > Naming the feature bits "intval" may cause confusion; why not just use a
> > name that carries its meaning, like "engine_features"?
> > I'm not sure the format "resvd0115" matches DPDK coding style. In my
> > mind, DPDK would use resvd_0 and resvd_1 for the two reserved elements.
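> > I.e. just a naming sketch, keeping the same bit layout:
> >
> > struct dma_channel_features {
> > 	union {
> > 		uint32_t engine_features;
> > 		struct {
> > 			uint32_t inorder:1;
> > 			uint32_t resvd_0:15;
> > 			uint32_t threshold:12;
> > 			uint32_t resvd_1:4;
> > 		};
> > 	};
> > };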
> >
> For the comments above, I will make the changes in the v2 patch.
>
> > > +	if (dev)
> > > +		dev->async_copy = 1;
> > > +}
> > > +
> >
> > IMHO, the user may want to choose which queues utilize async copy, as the
> > backend hardware resources are limited.
> > So should the async_copy enable flag be saved in the virtqueue structure?
> >
> We have a per-queue flag to identify the enabling status of a specific queue.
> The "async_copy" flag is a device-level flag which identifies the async
> capability of a vhost device.
> This is necessary because we rely on this flag to do initialization work if
> the vhost backend needs to support async mode on any of its queues.
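> Roughly, the intended split looks like this (as in this patch):
>
> /* dev->async_copy      - device-level: the socket was created with
>  *                        RTE_VHOST_USER_ASYNC_COPY, so per-queue async
>  *                        data gets allocated on vring setup
>  * vq->async_registered - queue-level: the application has registered an
>  *                        async channel on this particular queue
>  */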
>
I got you. How about renaming this variable to "async_copy_enabled", which is
more aligned to its real implication?
Thanks,
Marvin
> > > 	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> > >
> > > 	if (vsocket->notify_ops->new_connection) {
> > > @@ -891,6 +900,17 @@ struct vhost_user_reconnect_list {
> > > 		goto out_mutex;
> > > 	}
> > >
> > > +	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> > > +
> > > +	if (vsocket->async_copy &&
> > > +		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
> > > +		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
> > > +		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
> > > +			"or post-copy feature simultaneously is not "
> > > +			"supported\n");
> > > +		goto out_mutex;
> > > +	}
> > > +
> > > 	/*
> > > 	 * Set the supported features correctly for the builtin vhost-user
> > > 	 * net driver.
> > > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > > index 0266318..e6b688a 100644
> > > --- a/lib/librte_vhost/vhost.c
> > > +++ b/lib/librte_vhost/vhost.c
> > > @@ -332,8 +332,13 @@
> > > {
> > > 	if (vq_is_packed(dev))
> > > 		rte_free(vq->shadow_used_packed);
> > > -	else
> > > +	else {
> > > 		rte_free(vq->shadow_used_split);
> > > +		if (vq->async_pkts_pending)
> > > +			rte_free(vq->async_pkts_pending);
> > > +		if (vq->async_pending_info)
> > > +			rte_free(vq->async_pending_info);
> > > +	}
> > > 	rte_free(vq->batch_copy_elems);
> > > 	rte_mempool_free(vq->iotlb_pool);
> > > 	rte_free(vq);
> > > @@ -1527,3 +1532,70 @@ int rte_vhost_extern_callback_register(int vid,
> > > 	if (vhost_data_log_level >= 0)
> > > 		rte_log_set_level(vhost_data_log_level, RTE_LOG_WARNING);
> > > }
> > > +
> > > +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> > > +		uint32_t features,
> > > +		struct rte_vhost_async_channel_ops *ops)
> > > +{
> > > +	struct vhost_virtqueue *vq;
> > > +	struct virtio_net *dev = get_device(vid);
> > > +	struct dma_channel_features f;
> > > +
> > > +	if (dev == NULL || ops == NULL)
> > > +		return -1;
> > > +
> > > +	f.intval = features;
> > > +
> > > +	vq = dev->virtqueue[queue_id];
> > > +
> > > +	if (vq == NULL)
> > > +		return -1;
> > > +
> > > +	/** packed queue is not supported */
> > > +	if (vq_is_packed(dev) || !f.inorder)
> > > +		return -1;
> > > +
> > > +
> > Virtio already has an in_order concept, and these two names are so alike
> > that they can easily be mixed up. Please consider how to distinguish them.
> >
> What about "async_inorder"?
Looks good to me.
>
> > > +	if (ops->check_completed_copies == NULL ||
> > > +			ops->transfer_data == NULL)
> > > +		return -1;
> > > +
> > > +
> >
> > The previous error is unlikely to be true; the unlikely() macro may be
> > helpful for understanding.
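> > I.e. something like:
> >
> > 	if (unlikely(ops->check_completed_copies == NULL ||
> > 			ops->transfer_data == NULL))
> > 		return -1;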
> >
> Will update in v2 patch
>
> > > +	rte_spinlock_lock(&vq->access_lock);
> > > +
> > > +	vq->async_ops.check_completed_copies = ops->check_completed_copies;
> > > +	vq->async_ops.transfer_data = ops->transfer_data;
> > > +
> > > +	vq->async_inorder = f.inorder;
> > > +	vq->async_threshold = f.threshold;
> > > +
> > > +	vq->async_registered = true;
> > > +
> > > +	rte_spinlock_unlock(&vq->access_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
> > > +{
> > > +	struct vhost_virtqueue *vq;
> > > +	struct virtio_net *dev = get_device(vid);
> > > +
> > > +	if (dev == NULL)
> > > +		return -1;
> > > +
> > > +	vq = dev->virtqueue[queue_id];
> > > +
> > > +	if (vq == NULL)
> > > +		return -1;
> > > +
> > > +	rte_spinlock_lock(&vq->access_lock);
> > > +
> > > +	vq->async_ops.transfer_data = NULL;
> > > +	vq->async_ops.check_completed_copies = NULL;
> > > +
> > > +	vq->async_registered = false;
> > > +
> > > +	rte_spinlock_unlock(&vq->access_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > > index df98d15..a7fbe23 100644
> > > --- a/lib/librte_vhost/vhost.h
> > > +++ b/lib/librte_vhost/vhost.h
> > > @@ -23,6 +23,8 @@
> > > #include "rte_vhost.h"
> > > #include "rte_vdpa.h"
> > >
> > > +#include "rte_vhost_async.h"
> > > +
> > > /* Used to indicate that the device is running on a data core */
> > > #define VIRTIO_DEV_RUNNING 1
> > > /* Used to indicate that the device is ready to operate */
> > > @@ -39,6 +41,11 @@
> > >
> > > #define VHOST_LOG_CACHE_NR 32
> > >
> > > +#define MAX_PKT_BURST 32
> > > +
> > > +#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
> > > +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
> > > +
> > > #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
> > > 	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
> > > 		VRING_DESC_F_WRITE)
> > > @@ -200,6 +207,25 @@ struct vhost_virtqueue {
> > > 	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
> > > 	int			iotlb_cache_nr;
> > > 	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
> > > +
> > > +	/* operation callbacks for async dma */
> > > +	struct rte_vhost_async_channel_ops	async_ops;
> > > +
> > > +	struct iov_it	it_pool[VHOST_MAX_ASYNC_IT];
> > > +	struct iovec	vec_pool[VHOST_MAX_ASYNC_VEC];
> > > +
> > > +	/* async data transfer status */
> > > +	uintptr_t	**async_pkts_pending;
> > > +#define	ASYNC_PENDING_INFO_N_MSK	0xFFFF
> > > +#define	ASYNC_PENDING_INFO_N_SFT	16
> > > +	uint64_t	*async_pending_info;
> > > +	uint16_t	async_pkts_idx;
> > > +	uint16_t	async_pkts_inflight_n;
> > > +
> > > +	/* vq async features */
> > > +	bool		async_inorder;
> > > +	bool		async_registered;
> > > +	uint16_t	async_threshold;
> > > } __rte_cache_aligned;
> > >
> > > /* Old kernels have no such macros defined */
> > > @@ -353,6 +379,7 @@ struct virtio_net {
> > > 	int16_t			broadcast_rarp;
> > > 	uint32_t		nr_vring;
> > > 	int			dequeue_zero_copy;
> > > +	int			async_copy;
> > > 	int			extbuf;
> > > 	int			linearbuf;
> > > 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> > > @@ -702,7 +729,8 @@ uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > 	/* Don't kick guest if we don't reach index specified by guest. */
> > > 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
> > > 		uint16_t old = vq->signalled_used;
> > > -		uint16_t new = vq->last_used_idx;
> > > +		uint16_t new = vq->async_pkts_inflight_n ?
> > > +				vq->used->idx : vq->last_used_idx;
> > > 		bool signalled_used_valid = vq->signalled_used_valid;
> > >
> > > 		vq->signalled_used = new;
> > > diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> > > index 84bebad..d7600bf 100644
> > > --- a/lib/librte_vhost/vhost_user.c
> > > +++ b/lib/librte_vhost/vhost_user.c
> > > @@ -464,12 +464,25 @@
> > > 	} else {
> > > 		if (vq->shadow_used_split)
> > > 			rte_free(vq->shadow_used_split);
> > > +		if (vq->async_pkts_pending)
> > > +			rte_free(vq->async_pkts_pending);
> > > +		if (vq->async_pending_info)
> > > +			rte_free(vq->async_pending_info);
> > > +
> > > 		vq->shadow_used_split = rte_malloc(NULL,
> > > 			vq->size * sizeof(struct vring_used_elem),
> > > 			RTE_CACHE_LINE_SIZE);
> > > -		if (!vq->shadow_used_split) {
> > > +		vq->async_pkts_pending = rte_malloc(NULL,
> > > +			vq->size * sizeof(uintptr_t),
> > > +			RTE_CACHE_LINE_SIZE);
> > > +		vq->async_pending_info = rte_malloc(NULL,
> > > +			vq->size * sizeof(uint64_t),
> > > +			RTE_CACHE_LINE_SIZE);
> > > +		if (!vq->shadow_used_split ||
> > > +			!vq->async_pkts_pending ||
> > > +			!vq->async_pending_info) {
> > > 			VHOST_LOG_CONFIG(ERR,
> > > -				"failed to allocate memory for shadow used ring.\n");
> > > +				"failed to allocate memory for vq internal data.\n");
> >
> > If async copy is not enabled, there will be no need to allocate the
> > related structures.
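> > E.g. a sketch, assuming the device-level flag from this patch is visible
> > here:
> >
> > 		if (dev->async_copy) {
> > 			vq->async_pkts_pending = rte_malloc(NULL,
> > 				vq->size * sizeof(uintptr_t),
> > 				RTE_CACHE_LINE_SIZE);
> > 			vq->async_pending_info = rte_malloc(NULL,
> > 				vq->size * sizeof(uint64_t),
> > 				RTE_CACHE_LINE_SIZE);
> > 		}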
> >
> Will update in v2 patch
>
> Thanks,
>
> Patrick