> -----Original Message----- > From: Hu, Jiayu <jiayu...@intel.com> > Sent: Monday, January 17, 2022 1:40 PM > To: Xia, Chenbo <chenbo....@intel.com>; dev@dpdk.org > Cc: maxime.coque...@redhat.com; i.maxim...@ovn.org; Richardson, Bruce > <bruce.richard...@intel.com>; Van Haaren, Harry <harry.van.haa...@intel.com>; > Pai G, Sunil <sunil.pa...@intel.com>; Mcnamara, John > <john.mcnam...@intel.com>; > Ding, Xuan <xuan.d...@intel.com>; Jiang, Cheng1 <cheng1.ji...@intel.com>; > lian...@liangbit.com > Subject: RE: [PATCH v1 1/1] vhost: integrate dmadev in asynchronous datapath > > Hi Chenbo, > > Please see replies inline. > > Thanks, > Jiayu > > > -----Original Message----- > > From: Xia, Chenbo <chenbo....@intel.com> > > > diff --git a/examples/vhost/main.c b/examples/vhost/main.c > > > index 33d023aa39..44073499bc 100644 > > > --- a/examples/vhost/main.c > > > +++ b/examples/vhost/main.c > > > @@ -24,8 +24,9 @@ > > > #include <rte_ip.h> > > > #include <rte_tcp.h> > > > #include <rte_pause.h> > > > +#include <rte_dmadev.h> > > > +#include <rte_vhost_async.h> > > > > > > -#include "ioat.h" > > > #include "main.h" > > > > > > #ifndef MAX_QUEUES > > > @@ -56,6 +57,14 @@ > > > #define RTE_TEST_TX_DESC_DEFAULT 512 > > > > > > #define INVALID_PORT_ID 0xFF > > > +#define INVALID_DMA_ID -1 > > > + > > > +#define MAX_VHOST_DEVICE 1024 > > > +#define DMA_RING_SIZE 4096 > > > + > > > +struct dma_for_vhost dma_bind[MAX_VHOST_DEVICE]; > > > +struct rte_vhost_async_dma_info > > dma_config[RTE_DMADEV_DEFAULT_MAX]; > > > +static int dma_count; > > > > > > /* mask of enabled ports */ > > > static uint32_t enabled_port_mask = 0; > > > @@ -96,8 +105,6 @@ static int builtin_net_driver; > > > > > > static int async_vhost_driver; > > > > > > -static char *dma_type; > > > - > > > /* Specify timeout (in useconds) between retries on RX. */ > > > static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US; > > > /* Specify the number of retries on RX. */ > > > @@ -196,13 +203,134 @@ struct vhost_bufftable > > *vhost_txbuff[RTE_MAX_LCORE * > > > MAX_VHOST_DEVICE]; > > > #define MBUF_TABLE_DRAIN_TSC((rte_get_tsc_hz() + US_PER_S - 1) \ > > > / US_PER_S * BURST_TX_DRAIN_US) > > > > > > +static inline bool > > > +is_dma_configured(int16_t dev_id) > > > +{ > > > +int i; > > > + > > > +for (i = 0; i < dma_count; i++) { > > > +if (dma_config[i].dev_id == dev_id) { > > > +return true; > > > +} > > > +} > > > +return false; > > > +} > > > + > > > static inline int > > > open_dma(const char *value) > > > { > > > -if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) > > > -return open_ioat(value); > > > +struct dma_for_vhost *dma_info = dma_bind; > > > +char *input = strndup(value, strlen(value) + 1); > > > +char *addrs = input; > > > +char *ptrs[2]; > > > +char *start, *end, *substr; > > > +int64_t vid, vring_id; > > > + > > > +struct rte_dma_info info; > > > +struct rte_dma_conf dev_config = { .nb_vchans = 1 }; > > > +struct rte_dma_vchan_conf qconf = { > > > +.direction = RTE_DMA_DIR_MEM_TO_MEM, > > > +.nb_desc = DMA_RING_SIZE > > > +}; > > > + > > > +int dev_id; > > > +int ret = 0; > > > +uint16_t i = 0; > > > +char *dma_arg[MAX_VHOST_DEVICE]; > > > +int args_nr; > > > + > > > +while (isblank(*addrs)) > > > +addrs++; > > > +if (*addrs == '\0') { > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +/* process DMA devices within bracket. 
*/ > > > +addrs++; > > > +substr = strtok(addrs, ";]"); > > > +if (!substr) { > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +args_nr = rte_strsplit(substr, strlen(substr), > > > +dma_arg, MAX_VHOST_DEVICE, ','); > > > +if (args_nr <= 0) { > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +while (i < args_nr) { > > > +char *arg_temp = dma_arg[i]; > > > +uint8_t sub_nr; > > > + > > > +sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@'); > > > +if (sub_nr != 2) { > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +start = strstr(ptrs[0], "txd"); > > > +if (start == NULL) { > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +start += 3; > > > +vid = strtol(start, &end, 0); > > > +if (end == start) { > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +vring_id = 0 + VIRTIO_RXQ; > > > > No need to introduce vring_id, it's always VIRTIO_RXQ > > I will remove it later. > > > > > > + > > > +dev_id = rte_dma_get_dev_id_by_name(ptrs[1]); > > > +if (dev_id < 0) { > > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to find > > DMA %s.\n", > > > ptrs[1]); > > > +ret = -1; > > > +goto out; > > > +} else if (is_dma_configured(dev_id)) { > > > +goto done; > > > +} > > > + > > > > Please call rte_dma_info_get before configure to make sure > > info.max_vchans >=1 > > Do you suggest to use "rte_dma_info_get() and info.max_vchans=0" to indicate > the device is not configured, rather than using is_dma_configure()?
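(For reference, a minimal sketch of the ordering the comment above asks for, i.e. querying the device with rte_dma_info_get() before configuring it, using the same dmadev calls as the patch; the helper name is hypothetical and error logging is omitted:)

#include <rte_dmadev.h>

#define DMA_RING_SIZE 4096	/* as defined in the patch */

/* Hypothetical helper: bring up one dmadev with a single MEM_TO_MEM vchan,
 * checking the reported capabilities before configuring it. */
static int
configure_one_dma(int16_t dev_id)
{
	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE,
	};

	/* query capabilities first: the device must offer at least one vchan */
	if (rte_dma_info_get(dev_id, &info) != 0 || info.max_vchans < 1)
		return -1;

	if (rte_dma_configure(dev_id, &dev_config) != 0)
		return -1;

	if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0)
		return -1;

	return rte_dma_start(dev_id);
}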
No, I mean when you configure the dmadev with one vchan, make sure it does have at least one vchanl, even the 'vchan == 0' case can hardly happen. Just like the function call sequence in test_dmadev_instance, test_dmadev.c. > > > > > > +if (rte_dma_configure(dev_id, &dev_config) != 0) { > > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure > > DMA %d.\n", > > > dev_id); > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) { > > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up > > DMA %d.\n", > > > dev_id); > > > +ret = -1; > > > +goto out; > > > +} > > > > > > -return -1; > > > +rte_dma_info_get(dev_id, &info); > > > +if (info.nb_vchans != 1) { > > > +RTE_LOG(ERR, VHOST_CONFIG, "DMA %d has no > > queues.\n", > > > dev_id); > > > > Then the above means the number of vchan is not configured. > > > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +if (rte_dma_start(dev_id) != 0) { > > > +RTE_LOG(ERR, VHOST_CONFIG, "Fail to start > > DMA %u.\n", > > > dev_id); > > > +ret = -1; > > > +goto out; > > > +} > > > + > > > +dma_config[dma_count].dev_id = dev_id; > > > +dma_config[dma_count].max_vchans = 1; > > > +dma_config[dma_count++].max_desc = DMA_RING_SIZE; > > > + > > > +done: > > > +(dma_info + vid)->dmas[vring_id].dev_id = dev_id; > > > +i++; > > > +} > > > +out: > > > +free(input); > > > +return ret; > > > } > > > > > > /* > > > @@ -500,8 +628,6 @@ enum { > > > OPT_CLIENT_NUM, > > > #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver" > > > OPT_BUILTIN_NET_DRIVER_NUM, > > > -#define OPT_DMA_TYPE "dma-type" > > > -OPT_DMA_TYPE_NUM, > > > #define OPT_DMAS "dmas" > > > OPT_DMAS_NUM, > > > }; > > > @@ -539,8 +665,6 @@ us_vhost_parse_args(int argc, char **argv) > > > NULL, OPT_CLIENT_NUM}, > > > {OPT_BUILTIN_NET_DRIVER, no_argument, > > > NULL, OPT_BUILTIN_NET_DRIVER_NUM}, > > > -{OPT_DMA_TYPE, required_argument, > > > -NULL, OPT_DMA_TYPE_NUM}, > > > {OPT_DMAS, required_argument, > > > NULL, OPT_DMAS_NUM}, > > > {NULL, 0, 0, 0}, > > > @@ -661,10 +785,6 @@ us_vhost_parse_args(int argc, char **argv) > > > } > > > break; > > > > > > -case OPT_DMA_TYPE_NUM: > > > -dma_type = optarg; > > > -break; > > > - > > > case OPT_DMAS_NUM: > > > if (open_dma(optarg) == -1) { > > > RTE_LOG(INFO, VHOST_CONFIG, > > > @@ -841,9 +961,10 @@ complete_async_pkts(struct vhost_dev *vdev) > > > { > > > struct rte_mbuf *p_cpl[MAX_PKT_BURST]; > > > uint16_t complete_count; > > > +int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id; > > > > > > complete_count = rte_vhost_poll_enqueue_completed(vdev->vid, > > > -VIRTIO_RXQ, p_cpl, > > MAX_PKT_BURST); > > > +VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, > > dma_id, 0); > > > if (complete_count) { > > > free_pkts(p_cpl, complete_count); > > > __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, > > > __ATOMIC_SEQ_CST); > > > @@ -883,11 +1004,12 @@ drain_vhost(struct vhost_dev *vdev) > > > > > > if (builtin_net_driver) { > > > ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit); > > > -} else if (async_vhost_driver) { > > > +} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) { > > > uint16_t enqueue_fail = 0; > > > +int16_t dma_id = dma_bind[vdev- > > >vid].dmas[VIRTIO_RXQ].dev_id; > > > > > > complete_async_pkts(vdev); > > > -ret = rte_vhost_submit_enqueue_burst(vdev->vid, > > VIRTIO_RXQ, m, > > > nr_xmit); > > > +ret = rte_vhost_submit_enqueue_burst(vdev->vid, > > VIRTIO_RXQ, m, > > > nr_xmit, dma_id, 0); > > > __atomic_add_fetch(&vdev->pkts_inflight, ret, > > __ATOMIC_SEQ_CST); > > > > > > enqueue_fail = 
nr_xmit - ret; > > > @@ -905,7 +1027,7 @@ drain_vhost(struct vhost_dev *vdev) > > > __ATOMIC_SEQ_CST); > > > } > > > > > > -if (!async_vhost_driver) > > > +if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) > > > free_pkts(m, nr_xmit); > > > } > > > > > > @@ -1211,12 +1333,13 @@ drain_eth_rx(struct vhost_dev *vdev) > > > if (builtin_net_driver) { > > > enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ, > > > pkts, rx_count); > > > -} else if (async_vhost_driver) { > > > +} else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) { > > > uint16_t enqueue_fail = 0; > > > +int16_t dma_id = dma_bind[vdev- > > >vid].dmas[VIRTIO_RXQ].dev_id; > > > > > > complete_async_pkts(vdev); > > > enqueue_count = rte_vhost_submit_enqueue_burst(vdev- > > >vid, > > > -VIRTIO_RXQ, pkts, rx_count); > > > +VIRTIO_RXQ, pkts, rx_count, dma_id, > > 0); > > > __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, > > > __ATOMIC_SEQ_CST); > > > > > > enqueue_fail = rx_count - enqueue_count; > > > @@ -1235,7 +1358,7 @@ drain_eth_rx(struct vhost_dev *vdev) > > > __ATOMIC_SEQ_CST); > > > } > > > > > > -if (!async_vhost_driver) > > > +if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) > > > free_pkts(pkts, rx_count); > > > } > > > > > > @@ -1387,18 +1510,20 @@ destroy_device(int vid) > > > "(%d) device has been removed from data core\n", > > > vdev->vid); > > > > > > -if (async_vhost_driver) { > > > +if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) { > > > uint16_t n_pkt = 0; > > > +int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id; > > > struct rte_mbuf *m_cpl[vdev->pkts_inflight]; > > > > > > while (vdev->pkts_inflight) { > > > n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, > > VIRTIO_RXQ, > > > -m_cpl, vdev->pkts_inflight); > > > +m_cpl, vdev->pkts_inflight, > > dma_id, 0); > > > free_pkts(m_cpl, n_pkt); > > > __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, > > > __ATOMIC_SEQ_CST); > > > } > > > > > > rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ); > > > +dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false; > > > } > > > > > > rte_free(vdev); > > > @@ -1468,20 +1593,14 @@ new_device(int vid) > > > "(%d) device has been added to data core %d\n", > > > vid, vdev->coreid); > > > > > > -if (async_vhost_driver) { > > > -struct rte_vhost_async_config config = {0}; > > > -struct rte_vhost_async_channel_ops channel_ops; > > > - > > > -if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) { > > > -channel_ops.transfer_data = ioat_transfer_data_cb; > > > -channel_ops.check_completed_copies = > > > -ioat_check_completed_copies_cb; > > > - > > > -config.features = RTE_VHOST_ASYNC_INORDER; > > > +if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) { > > > +int ret; > > > > > > -return rte_vhost_async_channel_register(vid, > > VIRTIO_RXQ, > > > -config, &channel_ops); > > > +ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ); > > > +if (ret == 0) { > > > +dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = > > true; > > > } > > > +return ret; > > > } > > > > > > return 0; > > > @@ -1502,14 +1621,15 @@ vring_state_changed(int vid, uint16_t > > queue_id, int > > > enable) > > > if (queue_id != VIRTIO_RXQ) > > > return 0; > > > > > > -if (async_vhost_driver) { > > > +if (dma_bind[vid].dmas[queue_id].async_enabled) { > > > if (!enable) { > > > uint16_t n_pkt = 0; > > > +int16_t dma_id = > > dma_bind[vid].dmas[VIRTIO_RXQ].dev_id; > > > struct rte_mbuf *m_cpl[vdev->pkts_inflight]; > > > > > > while (vdev->pkts_inflight) { > > > n_pkt = > > rte_vhost_clear_queue_thread_unsafe(vid, > > > 
queue_id, > > > -m_cpl, vdev- > > >pkts_inflight); > > > +m_cpl, vdev- > > >pkts_inflight, dma_id, > > > 0); > > > free_pkts(m_cpl, n_pkt); > > > __atomic_sub_fetch(&vdev->pkts_inflight, > > n_pkt, > > > __ATOMIC_SEQ_CST); > > > } > > > @@ -1657,6 +1777,25 @@ create_mbuf_pool(uint16_t nr_port, uint32_t > > > nr_switch_core, uint32_t mbuf_size, > > > rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); > > > } > > > > > > +static void > > > +init_dma(void) > > > +{ > > > +int i; > > > + > > > +for (i = 0; i < MAX_VHOST_DEVICE; i++) { > > > +int j; > > > + > > > +for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) { > > > +dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID; > > > +dma_bind[i].dmas[j].async_enabled = false; > > > +} > > > +} > > > + > > > +for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) { > > > +dma_config[i].dev_id = INVALID_DMA_ID; > > > +} > > > +} > > > + > > > /* > > > * Main function, does initialisation and calls the per-lcore functions. > > > */ > > > @@ -1679,6 +1818,9 @@ main(int argc, char *argv[]) > > > argc -= ret; > > > argv += ret; > > > > > > +/* initialize dma structures */ > > > +init_dma(); > > > + > > > /* parse app arguments */ > > > ret = us_vhost_parse_args(argc, argv); > > > if (ret < 0) > > > @@ -1754,6 +1896,20 @@ main(int argc, char *argv[]) > > > if (client_mode) > > > flags |= RTE_VHOST_USER_CLIENT; > > > > > > +if (async_vhost_driver) { > > > +if (rte_vhost_async_dma_configure(dma_config, dma_count) > > < 0) { > > > +RTE_LOG(ERR, VHOST_PORT, "Failed to configure > > DMA in > > > vhost.\n"); > > > +for (i = 0; i < dma_count; i++) { > > > +if (dma_config[i].dev_id != INVALID_DMA_ID) > > { > > > +rte_dma_stop(dma_config[i].dev_id); > > > +dma_config[i].dev_id = > > INVALID_DMA_ID; > > > +} > > > +} > > > +dma_count = 0; > > > +async_vhost_driver = false; > > > +} > > > +} > > > + > > > /* Register vhost user driver to handle vhost messages. 
*/ > > > for (i = 0; i < nb_sockets; i++) { > > > char *file = socket_files + i * PATH_MAX; > > > diff --git a/examples/vhost/main.h b/examples/vhost/main.h > > > index e7b1ac60a6..b4a453e77e 100644 > > > --- a/examples/vhost/main.h > > > +++ b/examples/vhost/main.h > > > @@ -8,6 +8,7 @@ > > > #include <sys/queue.h> > > > > > > #include <rte_ether.h> > > > +#include <rte_pci.h> > > > > > > /* Macros for printing using RTE_LOG */ > > > #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 > > > @@ -79,6 +80,16 @@ struct lcore_info { > > > struct vhost_dev_tailq_list vdev_list; > > > }; > > > > > > +struct dma_info { > > > +struct rte_pci_addr addr; > > > +int16_t dev_id; > > > +bool async_enabled; > > > +}; > > > + > > > +struct dma_for_vhost { > > > +struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2]; > > > +}; > > > + > > > /* we implement non-extra virtio net features */ > > > #define VIRTIO_NET_FEATURES0 > > > > > > diff --git a/examples/vhost/meson.build b/examples/vhost/meson.build > > > index 3efd5e6540..87a637f83f 100644 > > > --- a/examples/vhost/meson.build > > > +++ b/examples/vhost/meson.build > > > @@ -12,13 +12,9 @@ if not is_linux > > > endif > > > > > > deps += 'vhost' > > > +deps += 'dmadev' > > > allow_experimental_apis = true > > > sources = files( > > > 'main.c', > > > 'virtio_net.c', > > > ) > > > - > > > -if dpdk_conf.has('RTE_RAW_IOAT') > > > - deps += 'raw_ioat' > > > - sources += files('ioat.c') > > > -endif > > > diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build > > > index cdb37a4814..8107329400 100644 > > > --- a/lib/vhost/meson.build > > > +++ b/lib/vhost/meson.build > > > @@ -33,7 +33,8 @@ headers = files( > > > 'rte_vhost_async.h', > > > 'rte_vhost_crypto.h', > > > ) > > > + > > > driver_sdk_headers = files( > > > 'vdpa_driver.h', > > > ) > > > -deps += ['ethdev', 'cryptodev', 'hash', 'pci'] > > > +deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev'] > > > diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h > > > index a87ea6ba37..23a7a2d8b3 100644 > > > --- a/lib/vhost/rte_vhost_async.h > > > +++ b/lib/vhost/rte_vhost_async.h > > > @@ -27,70 +27,12 @@ struct rte_vhost_iov_iter { > > > }; > > > > > > /** > > > - * dma transfer status > > > + * DMA device information > > > */ > > > -struct rte_vhost_async_status { > > > -/** An array of application specific data for source memory */ > > > -uintptr_t *src_opaque_data; > > > -/** An array of application specific data for destination memory */ > > > -uintptr_t *dst_opaque_data; > > > -}; > > > - > > > -/** > > > - * dma operation callbacks to be implemented by applications > > > - */ > > > -struct rte_vhost_async_channel_ops { > > > -/** > > > - * instruct async engines to perform copies for a batch of packets > > > - * > > > - * @param vid > > > - * id of vhost device to perform data copies > > > - * @param queue_id > > > - * queue id to perform data copies > > > - * @param iov_iter > > > - * an array of IOV iterators > > > - * @param opaque_data > > > - * opaque data pair sending to DMA engine > > > - * @param count > > > - * number of elements in the "descs" array > > > - * @return > > > - * number of IOV iterators processed, negative value means error > > > - */ > > > -int32_t (*transfer_data)(int vid, uint16_t queue_id, > > > -struct rte_vhost_iov_iter *iov_iter, > > > -struct rte_vhost_async_status *opaque_data, > > > -uint16_t count); > > > -/** > > > - * check copy-completed packets from the async engine > > > - * @param vid > > > - * id of vhost device to check copy completion 
> > > - * @param queue_id > > > - * queue id to check copy completion > > > - * @param opaque_data > > > - * buffer to receive the opaque data pair from DMA engine > > > - * @param max_packets > > > - * max number of packets could be completed > > > - * @return > > > - * number of async descs completed, negative value means error > > > - */ > > > -int32_t (*check_completed_copies)(int vid, uint16_t queue_id, > > > -struct rte_vhost_async_status *opaque_data, > > > -uint16_t max_packets); > > > -}; > > > - > > > -/** > > > - * async channel features > > > - */ > > > -enum { > > > -RTE_VHOST_ASYNC_INORDER = 1U << 0, > > > -}; > > > - > > > -/** > > > - * async channel configuration > > > - */ > > > -struct rte_vhost_async_config { > > > -uint32_t features; > > > -uint32_t rsvd[2]; > > > +struct rte_vhost_async_dma_info { > > > +int16_t dev_id;/* DMA device ID */ > > > +uint16_t max_vchans;/* max number of vchan */ > > > +uint16_t max_desc;/* max desc number of vchan */ > > > }; > > > > > > /** > > > @@ -100,17 +42,11 @@ struct rte_vhost_async_config { > > > * vhost device id async channel to be attached to > > > * @param queue_id > > > * vhost queue id async channel to be attached to > > > - * @param config > > > - * Async channel configuration structure > > > - * @param ops > > > - * Async channel operation callbacks > > > * @return > > > * 0 on success, -1 on failures > > > */ > > > __rte_experimental > > > -int rte_vhost_async_channel_register(int vid, uint16_t queue_id, > > > -struct rte_vhost_async_config config, > > > -struct rte_vhost_async_channel_ops *ops); > > > +int rte_vhost_async_channel_register(int vid, uint16_t queue_id); > > > > > > /** > > > * Unregister an async channel for a vhost queue > > > @@ -136,17 +72,11 @@ int rte_vhost_async_channel_unregister(int vid, > > uint16_t > > > queue_id); > > > * vhost device id async channel to be attached to > > > * @param queue_id > > > * vhost queue id async channel to be attached to > > > - * @param config > > > - * Async channel configuration > > > - * @param ops > > > - * Async channel operation callbacks > > > * @return > > > * 0 on success, -1 on failures > > > */ > > > __rte_experimental > > > -int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t > > queue_id, > > > -struct rte_vhost_async_config config, > > > -struct rte_vhost_async_channel_ops *ops); > > > +int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t > > > queue_id); > > > > > > /** > > > * Unregister an async channel for a vhost queue without performing any > > > @@ -179,12 +109,17 @@ int > > rte_vhost_async_channel_unregister_thread_unsafe(int > > > vid, > > > * array of packets to be enqueued > > > * @param count > > > * packets num to be enqueued > > > + * @param dma_id > > > + * the identifier of the DMA device > > > + * @param vchan > > > + * the identifier of virtual DMA channel > > > * @return > > > * num of packets enqueued > > > */ > > > __rte_experimental > > > uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id, > > > -struct rte_mbuf **pkts, uint16_t count); > > > +struct rte_mbuf **pkts, uint16_t count, int16_t dma_id, > > > +uint16_t vchan); > > > > All dma_id in the API should be uint16_t. Otherwise you need to check if > valid. > > Yes, you are right. Although dma_id is defined as int16_t and DMA library > checks > if it is valid, vhost doesn't handle DMA failure and we need to make sure > dma_id > is valid before using it. 
And even if vhost handles DMA error, a better place > to check > invalid dma_id is before passing it to DMA library too. I will add the check > later. > > > > > > > > > /** > > > * This function checks async completion status for a specific vhost > > > @@ -199,12 +134,17 @@ uint16_t rte_vhost_submit_enqueue_burst(int > > vid, > > > uint16_t queue_id, > > > * blank array to get return packet pointer > > > * @param count > > > * size of the packet array > > > + * @param dma_id > > > + * the identifier of the DMA device > > > + * @param vchan > > > + * the identifier of virtual DMA channel > > > * @return > > > * num of packets returned > > > */ > > > __rte_experimental > > > uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id, > > > -struct rte_mbuf **pkts, uint16_t count); > > > +struct rte_mbuf **pkts, uint16_t count, int16_t dma_id, > > > +uint16_t vchan); > > > > > > /** > > > * This function returns the amount of in-flight packets for the vhost > > > @@ -235,11 +175,32 @@ int rte_vhost_async_get_inflight(int vid, uint16_t > > > queue_id); > > > * Blank array to get return packet pointer > > > * @param count > > > * Size of the packet array > > > + * @param dma_id > > > + * the identifier of the DMA device > > > + * @param vchan > > > + * the identifier of virtual DMA channel > > > * @return > > > * Number of packets returned > > > */ > > > __rte_experimental > > > uint16_t rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id, > > > -struct rte_mbuf **pkts, uint16_t count); > > > +struct rte_mbuf **pkts, uint16_t count, int16_t dma_id, > > > +uint16_t vchan); > > > +/** > > > + * The DMA vChannels used in asynchronous data path must be > > configured > > > + * first. So this function needs to be called before enabling DMA > > > + * acceleration for vring. If this function fails, asynchronous data path > > > + * cannot be enabled for any vring further. > > > + * > > > + * @param dmas > > > + * DMA information > > > + * @param count > > > + * Element number of 'dmas' > > > + * @return > > > + * 0 on success, and -1 on failure > > > + */ > > > +__rte_experimental > > > +int rte_vhost_async_dma_configure(struct rte_vhost_async_dma_info > > *dmas, > > > +uint16_t count); > > > > I think based on current design, vhost can use every vchan if user app let > it. > > So the max_desc and max_vchans can just be got from dmadev APIs? Then > > there's > > no need to introduce the new ABI struct rte_vhost_async_dma_info. > > Yes, no need to introduce struct rte_vhost_async_dma_info. We can either use > struct rte_dma_info which is suggested by Maxime, or query from dma library > via device id. Since dma device configuration is left to applications, I > prefer to > use rte_dma_info directly. How do you think? If you only use rte_dma_info as input param, you will also need to call dmadev API to get dmadev ID in rte_vhost_async_dma_configure (Or you add both rte_dma_info and dmadev ID). So I suggest to only use dmadev ID as input. > > > > > And about max_desc, I see the dmadev lib, you can get vchan's max_desc > > but you > > may use a nb_desc (<= max_desc) to configure vchanl. And IIUC, vhost wants > > to > > know the nb_desc instead of max_desc? > > True, nb_desc is better than max_desc. But dma library doesn’t provide > function > to query nb_desc for every vchannel. And rte_dma_info cannot be used in > rte_vhost_async_dma_configure(), if vhost uses nb_desc. So the only way is > to require users to provide nb_desc for every vchannel, and it will introduce > a new struct. 
Is it really needed? > Since now dmadev lib does not provide a way to query real nb_desc for a vchanl, so I think we can just use max_desc. But ideally, if dmadev lib provides such a way, the configured nb_desc and nb_vchanl should be used to configure vhost lib. @Bruce, should you add such a way in dmadev lib? As users now do not know the real configured nb_desc of vchanl. > > > > > > > > #endif /* _RTE_VHOST_ASYNC_H_ */ > > > diff --git a/lib/vhost/version.map b/lib/vhost/version.map > > > index a7ef7f1976..1202ba9c1a 100644 > > > --- a/lib/vhost/version.map > > > +++ b/lib/vhost/version.map > > > @@ -84,6 +84,9 @@ EXPERIMENTAL { > > > > > > # added in 21.11 > > > rte_vhost_get_monitor_addr; > > > + > > > +# added in 22.03 > > > +rte_vhost_async_dma_configure; > > > }; > > > > > > INTERNAL { > > > diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c > > > index 13a9bb9dd1..32f37f4851 100644 > > > --- a/lib/vhost/vhost.c > > > +++ b/lib/vhost/vhost.c > > > @@ -344,6 +344,7 @@ vhost_free_async_mem(struct vhost_virtqueue *vq) > > > return; > > > > > > rte_free(vq->async->pkts_info); > > > +rte_free(vq->async->pkts_cmpl_flag); > > > > > > rte_free(vq->async->buffers_packed); > > > vq->async->buffers_packed = NULL; > > > @@ -1626,8 +1627,7 @@ rte_vhost_extern_callback_register(int vid, > > > } > > > > > > static __rte_always_inline int > > > -async_channel_register(int vid, uint16_t queue_id, > > > -struct rte_vhost_async_channel_ops *ops) > > > +async_channel_register(int vid, uint16_t queue_id) > > > { > > > struct virtio_net *dev = get_device(vid); > > > struct vhost_virtqueue *vq = dev->virtqueue[queue_id]; > > > @@ -1656,6 +1656,14 @@ async_channel_register(int vid, uint16_t > > queue_id, > > > goto out_free_async; > > > } > > > > > > +async->pkts_cmpl_flag = rte_zmalloc_socket(NULL, vq->size * > > sizeof(bool), > > > +RTE_CACHE_LINE_SIZE, node); > > > +if (!async->pkts_cmpl_flag) { > > > +VHOST_LOG_CONFIG(ERR, "failed to allocate async > > pkts_cmpl_flag > > > (vid %d, qid: %d)\n", > > > +vid, queue_id); > > > > qid: %u > > > > > +goto out_free_async; > > > +} > > > + > > > if (vq_is_packed(dev)) { > > > async->buffers_packed = rte_malloc_socket(NULL, > > > vq->size * sizeof(struct > > vring_used_elem_packed), > > > @@ -1676,9 +1684,6 @@ async_channel_register(int vid, uint16_t > > queue_id, > > > } > > > } > > > > > > -async->ops.check_completed_copies = ops- > > >check_completed_copies; > > > -async->ops.transfer_data = ops->transfer_data; > > > - > > > vq->async = async; > > > > > > return 0; > > > @@ -1691,15 +1696,13 @@ async_channel_register(int vid, uint16_t > > queue_id, > > > } > > > > > > int > > > -rte_vhost_async_channel_register(int vid, uint16_t queue_id, > > > -struct rte_vhost_async_config config, > > > -struct rte_vhost_async_channel_ops *ops) > > > +rte_vhost_async_channel_register(int vid, uint16_t queue_id) > > > { > > > struct vhost_virtqueue *vq; > > > struct virtio_net *dev = get_device(vid); > > > int ret; > > > > > > -if (dev == NULL || ops == NULL) > > > +if (dev == NULL) > > > return -1; > > > > > > if (queue_id >= VHOST_MAX_VRING) > > > @@ -1710,33 +1713,20 @@ rte_vhost_async_channel_register(int vid, > > uint16_t > > > queue_id, > > > if (unlikely(vq == NULL || !dev->async_copy)) > > > return -1; > > > > > > -if (unlikely(!(config.features & RTE_VHOST_ASYNC_INORDER))) { > > > -VHOST_LOG_CONFIG(ERR, > > > -"async copy is not supported on non-inorder mode " > > > -"(vid %d, qid: %d)\n", vid, queue_id); > > > -return -1; > > > -} > > > - > > > -if 
(unlikely(ops->check_completed_copies == NULL || > > > -ops->transfer_data == NULL)) > > > -return -1; > > > - > > > rte_spinlock_lock(&vq->access_lock); > > > -ret = async_channel_register(vid, queue_id, ops); > > > +ret = async_channel_register(vid, queue_id); > > > rte_spinlock_unlock(&vq->access_lock); > > > > > > return ret; > > > } > > > > > > int > > > -rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t > > queue_id, > > > -struct rte_vhost_async_config config, > > > -struct rte_vhost_async_channel_ops *ops) > > > +rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t > > queue_id) > > > { > > > struct vhost_virtqueue *vq; > > > struct virtio_net *dev = get_device(vid); > > > > > > -if (dev == NULL || ops == NULL) > > > +if (dev == NULL) > > > return -1; > > > > > > if (queue_id >= VHOST_MAX_VRING) > > > @@ -1747,18 +1737,7 @@ > > rte_vhost_async_channel_register_thread_unsafe(int vid, > > > uint16_t queue_id, > > > if (unlikely(vq == NULL || !dev->async_copy)) > > > return -1; > > > > > > -if (unlikely(!(config.features & RTE_VHOST_ASYNC_INORDER))) { > > > -VHOST_LOG_CONFIG(ERR, > > > -"async copy is not supported on non-inorder mode " > > > -"(vid %d, qid: %d)\n", vid, queue_id); > > > -return -1; > > > -} > > > - > > > -if (unlikely(ops->check_completed_copies == NULL || > > > -ops->transfer_data == NULL)) > > > -return -1; > > > - > > > -return async_channel_register(vid, queue_id, ops); > > > +return async_channel_register(vid, queue_id); > > > } > > > > > > int > > > @@ -1835,6 +1814,83 @@ > > rte_vhost_async_channel_unregister_thread_unsafe(int > > > vid, uint16_t queue_id) > > > return 0; > > > } > > > > > > +static __rte_always_inline void > > > +vhost_free_async_dma_mem(void) > > > +{ > > > +uint16_t i; > > > + > > > +for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++) { > > > +struct async_dma_info *dma = &dma_copy_track[i]; > > > +int16_t j; > > > + > > > +if (dma->max_vchans == 0) { > > > +continue; > > > +} > > > + > > > +for (j = 0; j < dma->max_vchans; j++) { > > > +rte_free(dma->vchans[j].metadata); > > > +} > > > +rte_free(dma->vchans); > > > +dma->vchans = NULL; > > > +dma->max_vchans = 0; > > > +} > > > +} > > > + > > > +int > > > +rte_vhost_async_dma_configure(struct rte_vhost_async_dma_info *dmas, > > uint16_t > > > count) > > > +{ > > > +uint16_t i; > > > + > > > +if (!dmas) { > > > +VHOST_LOG_CONFIG(ERR, "Invalid DMA configuration > > parameter.\n"); > > > +return -1; > > > +} > > > + > > > +for (i = 0; i < count; i++) { > > > +struct async_dma_vchan_info *vchans; > > > +int16_t dev_id; > > > +uint16_t max_vchans; > > > +uint16_t max_desc; > > > +uint16_t j; > > > + > > > +dev_id = dmas[i].dev_id; > > > +max_vchans = dmas[i].max_vchans; > > > +max_desc = dmas[i].max_desc; > > > + > > > +if (!rte_is_power_of_2(max_desc)) { > > > +max_desc = rte_align32pow2(max_desc); > > > +} > > > > I think when aligning to power of 2, it should exceed not max_desc? > > Aligned max_desc is used to allocate context tracking array. We only need > to guarantee the size of the array for every vchannel is >= max_desc. So it's > OK to have greater array size than max_desc. > > > And based on above comment, if this max_desc is nb_desc configured for > > vchanl, you should just make sure the nb_desc be power-of-2. > > > > > + > > > +vchans = rte_zmalloc(NULL, sizeof(struct > > async_dma_vchan_info) * > > > max_vchans, > > > +RTE_CACHE_LINE_SIZE); > > > +if (vchans == NULL) { > > > +VHOST_LOG_CONFIG(ERR, "Failed to allocate vchans > > for dma- > > > %d." 
> > > +" Cannot enable async data-path.\n", > > dev_id); > > > +vhost_free_async_dma_mem(); > > > +return -1; > > > +} > > > + > > > +for (j = 0; j < max_vchans; j++) { > > > +vchans[j].metadata = rte_zmalloc(NULL, sizeof(bool *) > > * > > > max_desc, > > > +RTE_CACHE_LINE_SIZE); > > > +if (!vchans[j].metadata) { > > > +VHOST_LOG_CONFIG(ERR, "Failed to allocate > > metadata for > > > " > > > +"dma-%d vchan-%u\n", > > dev_id, j); > > > +vhost_free_async_dma_mem(); > > > +return -1; > > > +} > > > + > > > +vchans[j].ring_size = max_desc; > > > +vchans[j].ring_mask = max_desc - 1; > > > +} > > > + > > > +dma_copy_track[dev_id].vchans = vchans; > > > +dma_copy_track[dev_id].max_vchans = max_vchans; > > > +} > > > + > > > +return 0; > > > +} > > > + > > > int > > > rte_vhost_async_get_inflight(int vid, uint16_t queue_id) > > > { > > > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h > > > index 7085e0885c..d9bda34e11 100644 > > > --- a/lib/vhost/vhost.h > > > +++ b/lib/vhost/vhost.h > > > @@ -19,6 +19,7 @@ > > > #include <rte_ether.h> > > > #include <rte_rwlock.h> > > > #include <rte_malloc.h> > > > +#include <rte_dmadev.h> > > > > > > #include "rte_vhost.h" > > > #include "rte_vdpa.h" > > > @@ -50,6 +51,7 @@ > > > > > > #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST) > > > #define VHOST_MAX_ASYNC_VEC 2048 > > > +#define VHOST_ASYNC_DMA_BATCHING_SIZE 32 > > > > > > #define PACKED_DESC_ENQUEUE_USED_FLAG(w)\ > > > ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | > > VRING_DESC_F_WRITE) : \ > > > @@ -119,6 +121,41 @@ struct vring_used_elem_packed { > > > uint32_t count; > > > }; > > > > > > +struct async_dma_vchan_info { > > > +/* circular array to track copy metadata */ > > > +bool **metadata; > > > > If the metadata will only be flags, maybe just use some > > name called XXX_flag > > Sure, I will rename it. > > > > > > + > > > +/* max elements in 'metadata' */ > > > +uint16_t ring_size; > > > +/* ring index mask for 'metadata' */ > > > +uint16_t ring_mask; > > > + > > > +/* batching copies before a DMA doorbell */ > > > +uint16_t nr_batching; > > > + > > > +/** > > > + * DMA virtual channel lock. Although it is able to bind DMA > > > + * virtual channels to data plane threads, vhost control plane > > > + * thread could call data plane functions too, thus causing > > > + * DMA device contention. > > > + * > > > + * For example, in VM exit case, vhost control plane thread needs > > > + * to clear in-flight packets before disable vring, but there could > > > + * be anotther data plane thread is enqueuing packets to the same > > > + * vring with the same DMA virtual channel. But dmadev PMD > > functions > > > + * are lock-free, so the control plane and data plane threads > > > + * could operate the same DMA virtual channel at the same time. 
> > > + */ > > > +rte_spinlock_t dma_lock; > > > +}; > > > + > > > +struct async_dma_info { > > > +uint16_t max_vchans; > > > +struct async_dma_vchan_info *vchans; > > > +}; > > > + > > > +extern struct async_dma_info > > dma_copy_track[RTE_DMADEV_DEFAULT_MAX]; > > > + > > > /** > > > * inflight async packet information > > > */ > > > @@ -129,9 +166,6 @@ struct async_inflight_info { > > > }; > > > > > > struct vhost_async { > > > -/* operation callbacks for DMA */ > > > -struct rte_vhost_async_channel_ops ops; > > > - > > > struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT]; > > > struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC]; > > > uint16_t iter_idx; > > > @@ -139,6 +173,19 @@ struct vhost_async { > > > > > > /* data transfer status */ > > > struct async_inflight_info *pkts_info; > > > +/** > > > + * packet reorder array. "true" indicates that DMA > > > + * device completes all copies for the packet. > > > + * > > > + * Note that this array could be written by multiple > > > + * threads at the same time. For example, two threads > > > + * enqueue packets to the same virtqueue with their > > > + * own DMA devices. However, since offloading is > > > + * per-packet basis, each packet flag will only be > > > + * written by one thread. And single byte write is > > > + * atomic, so no lock is needed. > > > + */ > > > +bool *pkts_cmpl_flag; > > > uint16_t pkts_idx; > > > uint16_t pkts_inflight_n; > > > union { > > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c > > > index b3d954aab4..9f81fc9733 100644 > > > --- a/lib/vhost/virtio_net.c > > > +++ b/lib/vhost/virtio_net.c > > > @@ -11,6 +11,7 @@ > > > #include <rte_net.h> > > > #include <rte_ether.h> > > > #include <rte_ip.h> > > > +#include <rte_dmadev.h> > > > #include <rte_vhost.h> > > > #include <rte_tcp.h> > > > #include <rte_udp.h> > > > @@ -25,6 +26,9 @@ > > > > > > #define MAX_BATCH_LEN 256 > > > > > > +/* DMA device copy operation tracking array. */ > > > +struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX]; > > > + > > > static __rte_always_inline bool > > > rxvq_is_mergeable(struct virtio_net *dev) > > > { > > > @@ -43,6 +47,108 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, > > uint32_t > > > nr_vring) > > > return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; > > > } > > > > > > +static __rte_always_inline uint16_t > > > +vhost_async_dma_transfer(struct vhost_virtqueue *vq, int16_t dma_id, > > > +uint16_t vchan, uint16_t head_idx, > > > +struct rte_vhost_iov_iter *pkts, uint16_t nr_pkts) > > > +{ > > > +struct async_dma_vchan_info *dma_info = > > > &dma_copy_track[dma_id].vchans[vchan]; > > > +uint16_t ring_mask = dma_info->ring_mask; > > > +uint16_t pkt_idx; > > > + > > > +rte_spinlock_lock(&dma_info->dma_lock); > > > + > > > +for (pkt_idx = 0; pkt_idx < nr_pkts; pkt_idx++) { > > > +struct rte_vhost_iovec *iov = pkts[pkt_idx].iov; > > > +int copy_idx = 0; > > > +uint16_t nr_segs = pkts[pkt_idx].nr_segs; > > > +uint16_t i; > > > + > > > +if (rte_dma_burst_capacity(dma_id, vchan) < nr_segs) { > > > +goto out; > > > +} > > > + > > > +for (i = 0; i < nr_segs; i++) { > > > +/** > > > + * We have checked the available space before > > submit copies > > > to DMA > > > + * vChannel, so we don't handle error here. > > > + */ > > > +copy_idx = rte_dma_copy(dma_id, vchan, > > > (rte_iova_t)iov[i].src_addr, > > > +(rte_iova_t)iov[i].dst_addr, iov[i].len, > > > +RTE_DMA_OP_FLAG_LLC); > > > > This assumes rte_dma_copy will always succeed if there's available space. 
> >
> > But the API doxygen says:
> >
> >  * @return
> >  *   - 0..UINT16_MAX: index of enqueued job.
> >  *   - -ENOSPC: if no space left to enqueue.
> >  *   - other values < 0 on failure.
> >
> > So it should consider other vendor-specific errors.
>
> Error handling is not free here. Specifically, a SW fallback is one way to handle
> failed copy operations, but it requires vhost to track the VA of every source and
> destination buffer for every copy. The DMA library uses IOVA, so vhost only
> prepares IOVAs for the copies of every packet in the async data path. In the
> IOVA-as-PA case, the prepared IOVAs cannot be used for a SW fallback, which means
> vhost would also have to store the VA for every copy of every packet, even if no
> error ever happens or IOVA is VA.
>
> I am thinking that the only usable DMA engines in vhost are CBDMA and DSA, so is
> it worth paying that cost for "future HW"? If other vendors' HW appears in the
> future, is it OK to add the support later? Or is there any way to get the VA from
> an IOVA?

Let's investigate how much of a performance drop the error handling will bring and
see...

Thanks,
Chenbo

>
> Thanks,
> Jiayu
> >
> > Thanks,
> > Chenbo
> >
> >
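(For reference, a minimal sketch of the return-code handling discussed above, i.e. reacting to rte_dma_copy() results instead of assuming success whenever burst capacity is available. The helper name and the bare -1 return are illustrative only, not part of the patch, and a real SW fallback would still need the VA tracking Jiayu describes:)

#include <errno.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

/* Illustrative helper: enqueue the segments of one packet and distinguish
 * a full ring (-ENOSPC) from vendor-specific failures. Returns the number
 * of enqueued segments, or -1 when a SW fallback would be required. */
static int
dma_copy_pkt_segs(int16_t dma_id, uint16_t vchan,
		struct rte_vhost_iovec *iov, uint16_t nr_segs)
{
	uint16_t i;
	int copy_idx;

	for (i = 0; i < nr_segs; i++) {
		copy_idx = rte_dma_copy(dma_id, vchan,
				(rte_iova_t)iov[i].src_addr,
				(rte_iova_t)iov[i].dst_addr,
				iov[i].len, RTE_DMA_OP_FLAG_LLC);
		if (copy_idx < 0) {
			if (copy_idx == -ENOSPC)
				break;	/* ring full: submit what is queued, retry later */
			return -1;	/* other failure: CPU copy fallback needed */
		}
	}

	/* ring the doorbell for whatever was enqueued */
	rte_dma_submit(dma_id, vchan);

	return i;
}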