Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:
> Subject: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
>
> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If the
> data fits into a buffer, then all data is copied and a single linear buffer is
> returned. Otherwise it allocates additional mbufs and chains them together
> to return a multiple segments mbuf.
>
> While that covers most use cases, it forces applications that need to work
> with larger data sizes to support multiple segments mbufs.
> The non-linear characteristic brings complexity and performance implications
> to the application.
>
> To resolve the issue, change the API so that the application can optionally
> provide a second mempool containing larger mbufs. If that is not provided
> (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool is used
> to return linear mbufs.

I understand the motivation. However, providing a static pool with large
buffers is not very efficient in terms of memory footprint: you need to
provision for the worst case (all packets are large), with a maximum size of
64KB. Also, the two mempools are quite restrictive, since the memory fill of
the mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K, mempool2
mbuf.size = 64K, packet size 4KB.

Instead, how about using the mbuf external buffer feature? The flow would be
(a rough sketch follows the quoted patch below):

1. The vhost PMD always receives a single mempool (like today).
2. On dequeue, the PMD looks at the virtio packet size. If it is smaller than
   the mbuf size, it uses the mbuf as is (like today).
3. Otherwise, it allocates a new buffer (inside the PMD) and links it to the
   mbuf as an external buffer (rte_pktmbuf_attach_extbuf).

The pro of this approach is that you have full flexibility in the memory
allocation, and therefore a lower footprint. The con is that OVS will need to
know how to handle mbufs with external buffers (not too complex IMO).

>
> Signed-off-by: Flavio Leitner <f...@sysclose.org>
> ---
>  drivers/net/vhost/rte_eth_vhost.c |  4 +--
>  examples/tep_termination/main.c   |  2 +-
>  examples/vhost/main.c             |  2 +-
>  lib/librte_vhost/rte_vhost.h      |  5 ++-
>  lib/librte_vhost/virtio_net.c     | 57 +++++++++++++++++++++++--------
>  5 files changed, 50 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
> index 46f01a7f4..ce7f68a5b 100644
> --- a/drivers/net/vhost/rte_eth_vhost.c
> +++ b/drivers/net/vhost/rte_eth_vhost.c
> @@ -393,8 +393,8 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
>  						 VHOST_MAX_PKT_BURST);
>
>  		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
> -						  r->mb_pool, &bufs[nb_rx],
> -						  num);
> +						  r->mb_pool, NULL,
> +						  &bufs[nb_rx], num);
>
>  		nb_rx += nb_pkts;
>  		nb_receive -= nb_pkts;
> diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
> index ab956ad7c..3ebf0fa6e 100644
> --- a/examples/tep_termination/main.c
> +++ b/examples/tep_termination/main.c
> @@ -697,7 +697,7 @@ switch_worker(__rte_unused void *arg)
>  		if (likely(!vdev->remove)) {
>  			/* Handle guest TX*/
>  			tx_count = rte_vhost_dequeue_burst(vdev->vid,
> -					VIRTIO_TXQ, mbuf_pool,
> +					VIRTIO_TXQ, mbuf_pool, NULL,
>  					pkts_burst, MAX_PKT_BURST);
>  			/* If this is the first received packet we need to learn the MAC */
>  			if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index ab649bf14..e9b306af3 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -1092,7 +1092,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
>  					pkts, MAX_PKT_BURST);
>  	} else {
>  		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
> -					mbuf_pool, pkts, MAX_PKT_BURST);
> +					mbuf_pool, NULL, pkts, MAX_PKT_BURST);
>  	}
>
>  	/* setup VMDq for the first packet */
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index 19474bca0..b05fd8e2a 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -593,6 +593,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
>   *  virtio queue index in mq case
>   * @param mbuf_pool
>   *  mbuf_pool where host mbuf is allocated.
> + * @param mbuf_pool_large
> + *  mbuf_pool where larger host mbuf is allocated.
>   * @param pkts
>   *  array to contain packets to be dequeued
>   * @param count
> @@ -601,7 +603,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
>   *  num of packets dequeued
>   */
>  uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count);
>
>  /**
>   * Get guest mem table: a list of memory regions.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5b85b832d..da9d77732 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1291,10 +1291,12 @@ get_zmbuf(struct vhost_virtqueue *vq)
>
>  static __rte_noinline uint16_t
>  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
>  	uint16_t i;
>  	uint16_t free_entries;
> +	uint16_t mbuf_avail;
>
>  	if (unlikely(dev->dequeue_zero_copy)) {
>  		struct zcopy_mbuf *zmbuf, *next;
> @@ -1340,32 +1342,42 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
>  			dev->vid, count);
>
> +	/* If the large mpool is provided, find the threshold */
> +	mbuf_avail = 0;
> +	if (mbuf_pool_large)
> +		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) -
> +				RTE_PKTMBUF_HEADROOM;
> +
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t head_idx;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t nr_vec = 0;
> +		struct rte_mempool *mpool;
>  		int err;
>
>  		if (unlikely(fill_vec_buf_split(dev, vq,
>  						vq->last_avail_idx + i,
>  						&nr_vec, buf_vec,
> -						&head_idx, &dummy_len,
> +						&head_idx, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>
>  		if (likely(dev->dequeue_zero_copy == 0))
>  			update_shadow_used_ring_split(vq, head_idx, 0);
>
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		if (mbuf_pool_large && buf_len > mbuf_avail)
> +			mpool = mbuf_pool_large;
> +		else
> +			mpool = mbuf_pool;
> +
> +		pkts[i] = rte_pktmbuf_alloc(mpool);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
>  			break;
>  		}
>
> -		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> -				mbuf_pool);
> +		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
>  		if (unlikely(err)) {
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
> @@ -1411,9 +1423,11 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
>  static __rte_noinline uint16_t
>  virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
>  	uint16_t i;
> +	uint16_t mbuf_avail;
>
>  	if (unlikely(dev->dequeue_zero_copy)) {
>  		struct zcopy_mbuf *zmbuf, *next;
> @@ -1448,17 +1462,23 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
>  			dev->vid, count);
>
> +	/* If the large mpool is provided, find the threshold */
> +	mbuf_avail = 0;
> +	if (mbuf_pool_large)
> +		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) -
> +				RTE_PKTMBUF_HEADROOM;
> +
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t buf_id;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t desc_count, nr_vec = 0;
> +		struct rte_mempool *mpool;
>  		int err;
>
>  		if (unlikely(fill_vec_buf_packed(dev, vq,
>  						vq->last_avail_idx, &desc_count,
>  						buf_vec, &nr_vec,
> -						&buf_id, &dummy_len,
> +						&buf_id, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>
> @@ -1466,15 +1486,19 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			update_shadow_used_ring_packed(vq, buf_id, 0,
>  					desc_count);
>
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		if (mbuf_pool_large && buf_len > mbuf_avail)
> +			mpool = mbuf_pool_large;
> +		else
> +			mpool = mbuf_pool;
> +
> +		pkts[i] = rte_pktmbuf_alloc(mpool);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
>  			break;
>  		}
>
> -		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> -				mbuf_pool);
> +		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
>  		if (unlikely(err)) {
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
> @@ -1526,7 +1550,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
>  uint16_t
>  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
>  	struct virtio_net *dev;
>  	struct rte_mbuf *rarp_mbuf = NULL;
> @@ -1598,9 +1623,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  	}
>
>  	if (vq_is_packed(dev))
> -		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
> +		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
> +				count);
>  	else
> -		count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
> +		count = virtio_dev_tx_split(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
> +				count);
>
>  out:
>  	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> --
> 2.20.1
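To make the external buffer suggestion above more concrete, here is a rough
sketch of steps 2 and 3, not a finished implementation: the names
vhost_alloc_large_mbuf and ext_buf_free_cb are hypothetical helpers (they do
not exist in the vhost code today), and error and IOVA handling are
simplified.

#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_memory.h>

/* Called when the last mbuf referencing the external buffer is freed */
static void
ext_buf_free_cb(void *addr, void *opaque __rte_unused)
{
	rte_free(addr);
}

/*
 * Allocate an mbuf from the single mempool; if the virtio packet does not
 * fit in the mbuf data room, attach an externally allocated linear buffer
 * instead of chaining additional segments.
 */
static struct rte_mbuf *
vhost_alloc_large_mbuf(struct rte_mempool *mp, uint32_t pkt_len)
{
	struct rte_mbuf *m = rte_pktmbuf_alloc(mp);
	uint32_t data_room = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
	struct rte_mbuf_ext_shared_info *shinfo;
	uint32_t total;
	uint16_t buf_len;
	void *buf;

	if (m == NULL)
		return NULL;

	if (pkt_len <= data_room)
		return m; /* small packet: use the mbuf as is (like today) */

	/* shinfo is carved out of the buffer tail by the init helper below */
	total = pkt_len + RTE_PKTMBUF_HEADROOM + sizeof(*shinfo);
	if (total > UINT16_MAX) { /* extbuf length is limited to 64KB */
		rte_pktmbuf_free(m);
		return NULL;
	}
	buf_len = total;

	buf = rte_malloc("vhost-extbuf", buf_len, RTE_CACHE_LINE_SIZE);
	if (buf == NULL) {
		rte_pktmbuf_free(m);
		return NULL;
	}

	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
						    ext_buf_free_cb, NULL);
	if (shinfo == NULL) {
		rte_free(buf);
		rte_pktmbuf_free(m);
		return NULL;
	}

	rte_pktmbuf_attach_extbuf(m, buf, rte_mem_virt2iova(buf), buf_len, shinfo);
	rte_pktmbuf_reset_headroom(m);
	return m;
}

In virtio_dev_tx_split()/virtio_dev_tx_packed() such a helper would replace
the rte_pktmbuf_alloc() call, using the buf_len reported by fill_vec_buf_*()
as the packet size, so the dequeue API keeps a single mempool argument. On
the application side, RTE_MBUF_HAS_EXTBUF(m) is enough to tell these mbufs
apart where it matters.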