Re: [dpdk-dev] [RFC] example/vhost: add support for vhost async data path

Xia, Chenbo Mon, 22 Jun 2020 18:55:03 -0700

Hi Cheng,

> -----Original Message-----
> From: Jiang, Cheng1 <cheng1.ji...@intel.com>
> Sent: Monday, June 22, 2020 10:59 AM
> To: maxime.coque...@redhat.com; Xia, Chenbo <chenbo....@intel.com>;
> Wang, Zhihong <zhihong.w...@intel.com>
> Cc: dev@dpdk.org; Fu, Patrick <patrick...@intel.com>; Liang, Cunming
> <cunming.li...@intel.com>; Jiang, Cheng1 <cheng1.ji...@intel.com>
> Subject: [RFC] example/vhost: add support for vhost async data path
> 
> This patch enables vhost-vswitch to use the vhost asynchronous API for
> enqueue operations. It demonstrates how an application can leverage an IOAT
> DMA channel with the vhost async API. Since this is an early preview patch,
> the performance has not been fully optimized and it is not recommended to use
> this patch as a benchmarking tool.
> 
> We introduce two parameters to enable DMA acceleration for Tx operations of
> queues:
> --async_vhost_driver When this option is given, the async vhost-user net
> driver, which demonstrates how to use the async vhost APIs, is used. It is
> disabled by default.
> 
> --dmas This parameter specifies the DMA device assigned to a queue.
> 
> This patch depends on following patch set:
> http://patches.dpdk.org/cover/71265/
> 
> Signed-off-by: Cheng Jiang <cheng1.ji...@intel.com>
> ---
>  examples/vhost/main.c | 246
> +++++++++++++++++++++++++++++++++++++++++-
>  examples/vhost/main.h |   1 +
>  2 files changed, 243 insertions(+), 4 deletions(-)
> 
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c index
> ab649bf14..46dd282e0 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -24,11 +24,15 @@
>  #include <rte_ip.h>
>  #include <rte_tcp.h>
>  #include <rte_pause.h>
> +#include <rte_vhost_async.h>
> +#include <rte_rawdev.h>
> +#include <rte_ioat_rawdev.h>
> +#include <rte_pci.h>
> 
>  #include "main.h"
> 
>  #ifndef MAX_QUEUES
> -#define MAX_QUEUES 128
> +#define MAX_QUEUES 512
>  #endif
> 
>  /* the maximum number of external ports supported */ @@ -58,6 +62,10 @@
>  /* Maximum long option length for option parsing. */  #define
> MAX_LONG_OPT_SZ 64
> 
> +#define IOAT_RING_SIZE 4096
> +
> +#define MAX_ENQUEUED_SIZE 2048
> +
>  /* mask of enabled ports */
>  static uint32_t enabled_port_mask = 0;
> 
> @@ -96,6 +104,20 @@ static int dequeue_zero_copy;
> 
>  static int builtin_net_driver;
> 
> +static int async_vhost_driver;
> +
> +struct dma_info {
> +     struct rte_pci_addr addr;
> +     uint16_t dev_id;
> +     bool is_valid;
> +};
> +
> +struct dma_info_input {
> +     struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2];
> +     uint16_t nr;
> +};
> +
> +static struct dma_info_input dma_bind[20];


Should '20' be MAX_VHOST_DEVICE, since this array is indexed by vid?

>  /* Specify timeout (in useconds) between retries on RX. */  static uint32_t
> burst_rx_delay_time = BURST_RX_WAIT_US;
>  /* Specify the number of retries on RX. */ @@ -141,6 +163,61 @@ static struct
> rte_eth_conf vmdq_conf_default = {
>       },
>  };
> 
> +static int
> +ioat_transfer_data_cb(int vid, uint16_t queue_id, struct dma_trans_desc
> *descs,
> +             struct dma_trans_status *opaque_data, uint16_t count) {
> +     int ret;
> +     uint16_t i_desc;
> +
> +     struct iov_it *src = NULL;
> +     struct iov_it *dst = NULL;
> +     unsigned long i_seg;
> +
> +     int dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
> +     if (likely(!opaque_data)) {
> +             for (i_desc = 0; i_desc < count; i_desc++) {
> +                     src = descs[i_desc].src;
> +                     dst = descs[i_desc].dst;
> +                     i_seg = 0;
> +                     while (i_seg < src->nr_segs) {
> +                             ret = rte_ioat_enqueue_copy(dev_id,
> +                                     (uintptr_t)(src->iov[i_seg].iov_base)
> +                                             + src->offset,
> +                                     (uintptr_t)(dst->iov[i_seg].iov_base)
> +                                             + dst->offset,
> +                                     src->iov[i_seg].iov_len,
> +                                     0,
> +                                     0,
> +                                     0);
> +                             if (ret != 1)
> +                                     break;
> +                             i_seg++;
> +                     }
> +             }
> +     } else {
> +             /* Opaque data is not supported */
> +             return -1;
> +     }
> +     /* ring the doolbell */

s/doolbell/doorbell

> +     rte_ioat_do_copies(dev_id);
> +     return i_desc;
> +}
> +
> +static int
> +ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
> +             struct dma_trans_status *opaque_data,
> +             uint16_t max_packets __rte_unused)
> +{
> +     if (!opaque_data) {
> +             uintptr_t dump[255];
> +             return
> rte_ioat_completed_copies(dma_bind[vid].dmas[queue_id * 2
> +                     + VIRTIO_RXQ].dev_id, 255, dump, dump);
> +     } else {
> +             /* Opaque data is not supported */
> +             return -1;
> +     }
> +}
> 
>  static unsigned lcore_ids[RTE_MAX_LCORE];  static uint16_t
> ports[RTE_MAX_ETHPORTS]; @@ -186,6 +263,94 @@ struct mbuf_table
> lcore_tx_queue[RTE_MAX_LCORE];
>   * Builds up the correct configuration for VMDQ VLAN pool map
>   * according to the pool & queue limits.
>   */
> +
> +static inline int
> +open_dma(const char *value, void *dma_bind_info) {
> +     struct dma_info_input *dma_info = dma_bind_info;
> +     char *input = strndup(value, strlen(value) + 1);
> +     char *addrs = input;
> +     char *ptrs[2];
> +     char *start, *end, *substr;
> +     int64_t qid, vring_id;
> +     struct rte_ioat_rawdev_config config;
> +     struct rte_rawdev_info info = { .dev_private = &config };
> +     char name[32];
> +     int dev_id;
> +     int ret = 0;
> +
> +     while (isblank(*addrs))
> +             addrs++;
> +     if (*addrs == '\0') {
> +             ret = -1;
> +             goto out;
> +     }
> +
> +     /* process DMA devices within bracket. */
> +     addrs++;
> +     substr = strtok(addrs, ";]");
> +     if (!substr) {
> +             ret = -1;
> +             goto out;
> +     }
> +
> +     do {
> +             rte_strsplit(substr, strlen(substr), ptrs, 2, '@');
> +
> +             start = strstr(ptrs[0], "txq");
> +             if (start == NULL) {
> +                     ret = -1;
> +                     goto out;
> +             }
> +
> +             start += 3;
> +             qid = strtol(start, &end, 0);
> +             if (end == start) {
> +                     ret = -1;
> +                     goto out;
> +             }
> +
> +             vring_id = qid * 2 + VIRTIO_RXQ;
> +             if (rte_pci_addr_parse(ptrs[1],
> +                                    &dma_info->dmas[vring_id].addr) < 0) {
> +                     ret = -1;
> +                     goto out;
> +             }
> +
> +             rte_pci_device_name(&dma_info->dmas[vring_id].addr,
> +                                 name, sizeof(name));
> +             dev_id = rte_rawdev_get_dev_id(name);
> +             if (dev_id == (uint16_t)(-ENODEV) ||
> +                 dev_id == (uint16_t)(-EINVAL)) {
> +                     ret = -1;
> +                     goto out;
> +             }
> +
> +             if (rte_rawdev_info_get(dev_id, &info) < 0 ||
> +                 strstr(info.driver_name, "ioat") == NULL) {
> +                     ret = -1;
> +                     goto out;
> +             }
> +
> +             dma_info->dmas[vring_id].dev_id = dev_id;
> +             dma_info->dmas[vring_id].is_valid = true;
> +             config.ring_size = IOAT_RING_SIZE;
> +             if (rte_rawdev_configure(dev_id, &info) < 0) {
> +                     ret = -1;
> +                     goto out;
> +             }
> +             rte_rawdev_start(dev_id);
> +
> +             dma_info->nr++;
> +
> +             substr = strtok(NULL, ";]");
> +     } while (substr);
> +
> +out:
> +     free(input);
> +     return ret;
> +}
> +
>  static inline int
>  get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)  { @@ -
> 488,6 +653,8 @@ us_vhost_parse_args(int argc, char **argv)
>               {"client", no_argument, &client_mode, 1},
>               {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
>               {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
> +             {"async_vhost_driver", no_argument, &async_vhost_driver, 1},
> +             {"dmas", required_argument, NULL, 0},
>               {NULL, 0, 0, 0},
>       };
> 
> @@ -623,13 +790,25 @@ us_vhost_parse_args(int argc, char **argv)
>                                               "socket-file",
> MAX_LONG_OPT_SZ)) {
>                               if (us_vhost_parse_socket_path(optarg) == -1) {
>                                       RTE_LOG(INFO, VHOST_CONFIG,
> -                                     "Invalid argument for socket name
> (Max %d characters)\n",
> -                                     PATH_MAX);
> +                                             "Invalid argument for socket
> name (Max %d characters)\n",
> +                                             PATH_MAX);
>                                       us_vhost_usage(prgname);
>                                       return -1;
>                               }
>                       }
> 
> +                     if (!strncmp(long_option[option_index].name,
> +                                             "dmas", MAX_LONG_OPT_SZ))
> {
> +                             if (open_dma(optarg, &(dma_bind[0])) == -1) {
> +                                     if (*optarg == -1) {
> +                                             RTE_LOG(INFO,
> VHOST_CONFIG,
> +                                                     "Wrong DMA args\n");
> +                                             us_vhost_usage(prgname);
> +                                             return -1;
> +                                     }
> +                             }
> +                     }
> +
>                       break;
> 
>                       /* Invalid option - print options. */ @@ -785,9 +964,26
> @@ virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
>           struct rte_mbuf *m)
>  {
>       uint16_t ret;
> +     struct rte_mbuf *m_cpl[1];
> 
>       if (builtin_net_driver) {
>               ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
> +     } else if (async_vhost_driver) {
> +             ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid,
> VIRTIO_RXQ,
> +                                             &m, 1);
> +
> +             if (likely(ret)) {
> +                     dst_vdev->nr_async_pkts++;
> +                     rte_mbuf_refcnt_update(m, 1);
> +             }
> +
> +             while (likely(dst_vdev->nr_async_pkts)) {
> +                     dst_vdev->nr_async_pkts =
> +                             rte_vhost_poll_enqueue_completed(dst_vdev-
> >vid,
> +                                     VIRTIO_RXQ, m_cpl, 1);

I think nr_async_pkts should be changed only when
rte_vhost_poll_enqueue_completed succeeds?

Thanks!
Chenbo

> +                     dst_vdev->nr_async_pkts--;
> +                     rte_pktmbuf_free(*m_cpl);
> +             }
>       } else {
>               ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m,
> 1);
>       }
> @@ -1036,6 +1232,19 @@ drain_mbuf_table(struct mbuf_table *tx_q)
>       }
>  }
> 
> +static __rte_always_inline void
> +complete_async_pkts(struct vhost_dev *vdev, uint16_t qid) {
> +     struct rte_mbuf *p_cpl[MAX_PKT_BURST];
> +     uint16_t complete_count;
> +
> +     complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
> +                                             qid, p_cpl, MAX_PKT_BURST);
> +     vdev->nr_async_pkts -= complete_count;
> +     if (complete_count)
> +             free_pkts(p_cpl, complete_count);
> +}
> +
>  static __rte_always_inline void
>  drain_eth_rx(struct vhost_dev *vdev)
>  {
> @@ -1044,6 +1253,10 @@ drain_eth_rx(struct vhost_dev *vdev)
> 
>       rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
>                                   pkts, MAX_PKT_BURST);
> +
> +     while (likely(vdev->nr_async_pkts))
> +             complete_async_pkts(vdev, VIRTIO_RXQ);
> +
>       if (!rx_count)
>               return;
> 
> @@ -1068,16 +1281,22 @@ drain_eth_rx(struct vhost_dev *vdev)
>       if (builtin_net_driver) {
>               enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
>                                               pkts, rx_count);
> +     } else if (async_vhost_driver) {
> +             enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
> +                                     VIRTIO_RXQ, pkts, rx_count);
> +             vdev->nr_async_pkts += enqueue_count;
>       } else {
>               enqueue_count = rte_vhost_enqueue_burst(vdev->vid,
> VIRTIO_RXQ,
>                                               pkts, rx_count);
>       }
> +
>       if (enable_stats) {
>               rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
>               rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
>       }
> 
> -     free_pkts(pkts, rx_count);
> +     if (!async_vhost_driver)
> +             free_pkts(pkts, rx_count);
>  }
> 
>  static __rte_always_inline void
> @@ -1224,6 +1443,9 @@ destroy_device(int vid)
>               "(%d) device has been removed from data core\n",
>               vdev->vid);
> 
> +     if (async_vhost_driver)
> +             rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
> +
>       rte_free(vdev);
>  }
> 
> @@ -1238,6 +1460,12 @@ new_device(int vid)
>       uint32_t device_num_min = num_devices;
>       struct vhost_dev *vdev;
> 
> +     struct rte_vhost_async_channel_ops channel_ops = {
> +             .transfer_data = ioat_transfer_data_cb,
> +             .check_completed_copies = ioat_check_completed_copies_cb
> +     };
> +     struct dma_channel_features f;
> +
>       vdev = rte_zmalloc("vhost device", sizeof(*vdev),
> RTE_CACHE_LINE_SIZE);
>       if (vdev == NULL) {
>               RTE_LOG(INFO, VHOST_DATA,
> @@ -1278,6 +1506,13 @@ new_device(int vid)
>               "(%d) device has been added to data core %d\n",
>               vid, vdev->coreid);
> 
> +     if (async_vhost_driver) {
> +             f.inorder = 1;
> +             f.threshold = 256;
> +             return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
> +                     f.intval, &channel_ops);
> +     }
> +
>       return 0;
>  }
> 
> @@ -1517,6 +1752,9 @@ main(int argc, char *argv[])
>       /* Register vhost user driver to handle vhost messages. */
>       for (i = 0; i < nb_sockets; i++) {
>               char *file = socket_files + i * PATH_MAX;
> +             if (async_vhost_driver)
> +                     flags = flags | RTE_VHOST_USER_ASYNC_COPY;
> +
>               ret = rte_vhost_driver_register(file, flags);
>               if (ret != 0) {
>                       unregister_drivers(i);
> diff --git a/examples/vhost/main.h b/examples/vhost/main.h index
> 7cba0edbf..4317b6ae8 100644
> --- a/examples/vhost/main.h
> +++ b/examples/vhost/main.h
> @@ -51,6 +51,7 @@ struct vhost_dev {
>       uint64_t features;
>       size_t hdr_len;
>       uint16_t nr_vrings;
> +     uint16_t nr_async_pkts;
>       struct rte_vhost_memory *mem;
>       struct device_statistics stats;
>       TAILQ_ENTRY(vhost_dev) global_vdev_entry;
> --
> 2.26.2

Re: [dpdk-dev] [RFC] example/vhost: add support for vhost async data path

Reply via email to