Wednesday, October 2, 2019 1:20 AM, Flavio Leitner:
> Subject: [dpdk-dev] [PATCH] vhost: add support to large linear mbufs
>
> The rte_vhost_dequeue_burst supports two ways of dequeuing data. If the
> data fits into a buffer, then all data is copied and a single linear buffer is
> returned. Otherwise it allocates additional mbufs and chains them together
> to return a multiple segments mbuf.
>
> While that covers most use cases, it forces applications that need to work
> with larger data sizes to support multiple segments mbufs.
> The non-linear characteristic brings complexity and performance implications
> to the application.
>
> To resolve the issue, change the API so that the application can optionally
> provide a second mempool containing larger mbufs. If that is not provided
> (NULL), the behavior remains as before the change.
> Otherwise, the data size is checked and the corresponding mempool is used
> to return linear mbufs.

I understand the motivation. However, providing a static pool with large
buffers is not very efficient in terms of memory footprint: you need to
provision for the worst case (all packets are large), with a maximum size of
64KB. Also, the two mempools are quite restrictive, since the memory fill of
the mbufs might be very sparse. E.g. mempool1 mbuf.size = 1.5K, mempool2
mbuf.size = 64K, packet size 4KB.

Instead, how about using the mbuf external buffer feature? The flow would be
(a rough sketch follows the quoted patch below):

1. The vhost PMD always receives a single mempool (like today).
2. On dequeue, the PMD looks at the virtio packet size. If it is smaller than
   the mbuf size, it uses the mbuf as is (like today).
3. Otherwise, it allocates a new buffer (inside the PMD) and links it to the
   mbuf as an external buffer (rte_pktmbuf_attach_extbuf).

The pro of this approach is that you have full flexibility in the memory
allocation, and therefore a lower footprint. The con is that OVS will need to
know how to handle mbufs with external buffers (not too complex IMO).

>
> Signed-off-by: Flavio Leitner <f...@sysclose.org>
> ---
>  drivers/net/vhost/rte_eth_vhost.c |  4 +--
>  examples/tep_termination/main.c   |  2 +-
>  examples/vhost/main.c             |  2 +-
>  lib/librte_vhost/rte_vhost.h      |  5 ++-
>  lib/librte_vhost/virtio_net.c     | 57 +++++++++++++++++++++++--------
>  5 files changed, 50 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
> index 46f01a7f4..ce7f68a5b 100644
> --- a/drivers/net/vhost/rte_eth_vhost.c
> +++ b/drivers/net/vhost/rte_eth_vhost.c
> @@ -393,8 +393,8 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
>  						 VHOST_MAX_PKT_BURST);
>
>  		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
> -						  r->mb_pool, &bufs[nb_rx],
> -						  num);
> +						  r->mb_pool, NULL,
> +						  &bufs[nb_rx], num);
>
>  		nb_rx += nb_pkts;
>  		nb_receive -= nb_pkts;
> diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
> index ab956ad7c..3ebf0fa6e 100644
> --- a/examples/tep_termination/main.c
> +++ b/examples/tep_termination/main.c
> @@ -697,7 +697,7 @@ switch_worker(__rte_unused void *arg)
>  		if (likely(!vdev->remove)) {
>  			/* Handle guest TX*/
>  			tx_count = rte_vhost_dequeue_burst(vdev->vid,
> -					VIRTIO_TXQ, mbuf_pool,
> +					VIRTIO_TXQ, mbuf_pool, NULL,
>  					pkts_burst, MAX_PKT_BURST);
>  			/* If this is the first received packet we need to learn the MAC */
>  			if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index ab649bf14..e9b306af3 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -1092,7 +1092,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
>  					pkts, MAX_PKT_BURST);
>  	} else {
>  		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
> -					mbuf_pool, pkts, MAX_PKT_BURST);
> +					mbuf_pool, NULL, pkts, MAX_PKT_BURST);
>  	}
>
>  	/* setup VMDq for the first packet */
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index 19474bca0..b05fd8e2a 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -593,6 +593,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
>   *  virtio queue index in mq case
>   * @param mbuf_pool
>   *  mbuf_pool where host mbuf is allocated.
> + * @param mbuf_pool_large
> + *  mbuf_pool where larger host mbuf is allocated.
>   * @param pkts
>   *  array to contain packets to be dequeued
>   * @param count
> @@ -601,7 +603,8 @@ uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
>   *  num of packets dequeued
>   */
>  uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count);
>
>  /**
>   * Get guest mem table: a list of memory regions.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5b85b832d..da9d77732 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1291,10 +1291,12 @@ get_zmbuf(struct vhost_virtqueue *vq)
>
>  static __rte_noinline uint16_t
>  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
>  	uint16_t i;
>  	uint16_t free_entries;
> +	uint16_t mbuf_avail;
>
>  	if (unlikely(dev->dequeue_zero_copy)) {
>  		struct zcopy_mbuf *zmbuf, *next;
> @@ -1340,32 +1342,42 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
>  			dev->vid, count);
>
> +	/* If the large mpool is provided, find the threshold */
> +	mbuf_avail = 0;
> +	if (mbuf_pool_large)
> +		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) -
> +				RTE_PKTMBUF_HEADROOM;
> +
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t head_idx;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t nr_vec = 0;
> +		struct rte_mempool *mpool;
>  		int err;
>
>  		if (unlikely(fill_vec_buf_split(dev, vq,
>  						vq->last_avail_idx + i,
>  						&nr_vec, buf_vec,
> -						&head_idx, &dummy_len,
> +						&head_idx, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>
>  		if (likely(dev->dequeue_zero_copy == 0))
>  			update_shadow_used_ring_split(vq, head_idx, 0);
>
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		if (mbuf_pool_large && buf_len > mbuf_avail)
> +			mpool = mbuf_pool_large;
> +		else
> +			mpool = mbuf_pool;
> +
> +		pkts[i] = rte_pktmbuf_alloc(mpool);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
>  			break;
>  		}
>
> -		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> -				mbuf_pool);
> +		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
>  		if (unlikely(err)) {
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
> @@ -1411,9 +1423,11 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
>  static __rte_noinline uint16_t
>  virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
>  	uint16_t i;
> +	uint16_t mbuf_avail;
>
>  	if (unlikely(dev->dequeue_zero_copy)) {
>  		struct zcopy_mbuf *zmbuf, *next;
> @@ -1448,17 +1462,23 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
>  			dev->vid, count);
>
> +	/* If the large mpool is provided, find the threshold */
> +	mbuf_avail = 0;
> +	if (mbuf_pool_large)
> +		mbuf_avail = rte_pktmbuf_data_room_size(mbuf_pool) -
> +				RTE_PKTMBUF_HEADROOM;
> +
>  	for (i = 0; i < count; i++) {
>  		struct buf_vector buf_vec[BUF_VECTOR_MAX];
>  		uint16_t buf_id;
> -		uint32_t dummy_len;
> +		uint32_t buf_len;
>  		uint16_t desc_count, nr_vec = 0;
> +		struct rte_mempool *mpool;
>  		int err;
>
>  		if (unlikely(fill_vec_buf_packed(dev, vq,
>  						vq->last_avail_idx, &desc_count,
>  						buf_vec, &nr_vec,
> -						&buf_id, &dummy_len,
> +						&buf_id, &buf_len,
>  						VHOST_ACCESS_RO) < 0))
>  			break;
>
> @@ -1466,15 +1486,19 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			update_shadow_used_ring_packed(vq, buf_id, 0,
>  					desc_count);
>
> -		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> +		if (mbuf_pool_large && buf_len > mbuf_avail)
> +			mpool = mbuf_pool_large;
> +		else
> +			mpool = mbuf_pool;
> +
> +		pkts[i] = rte_pktmbuf_alloc(mpool);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
>  			break;
>  		}
>
> -		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
> -				mbuf_pool);
> +		err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], mpool);
>  		if (unlikely(err)) {
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
> @@ -1526,7 +1550,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
>
>  uint16_t
>  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> -	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mempool *mbuf_pool, struct rte_mempool *mbuf_pool_large,
> +	struct rte_mbuf **pkts, uint16_t count)
>  {
>  	struct virtio_net *dev;
>  	struct rte_mbuf *rarp_mbuf = NULL;
> @@ -1598,9 +1623,11 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  	}
>
>  	if (vq_is_packed(dev))
> -		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
> +		count = virtio_dev_tx_packed(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
> +				count);
>  	else
> -		count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
> +		count = virtio_dev_tx_split(dev, vq, mbuf_pool, mbuf_pool_large, pkts,
> +				count);
>
>  out:
>  	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> --
> 2.20.1
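To make the external buffer suggestion above more concrete, here is a rough
sketch of steps 2 and 3, not a finished implementation: the names
vhost_alloc_large_mbuf and ext_buf_free_cb are hypothetical helpers (they do
not exist in the vhost code today), and error and IOVA handling are
simplified.

#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_memory.h>

/* Called when the last mbuf referencing the external buffer is freed */
static void
ext_buf_free_cb(void *addr, void *opaque __rte_unused)
{
	rte_free(addr);
}

/*
 * Allocate an mbuf from the single mempool; if the virtio packet does not
 * fit in the mbuf data room, attach an externally allocated linear buffer
 * instead of chaining additional segments.
 */
static struct rte_mbuf *
vhost_alloc_large_mbuf(struct rte_mempool *mp, uint32_t pkt_len)
{
	struct rte_mbuf *m = rte_pktmbuf_alloc(mp);
	uint32_t data_room = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
	struct rte_mbuf_ext_shared_info *shinfo;
	uint32_t total;
	uint16_t buf_len;
	void *buf;

	if (m == NULL)
		return NULL;

	if (pkt_len <= data_room)
		return m; /* small packet: use the mbuf as is (like today) */

	/* shinfo is carved out of the buffer tail by the init helper below */
	total = pkt_len + RTE_PKTMBUF_HEADROOM + sizeof(*shinfo);
	if (total > UINT16_MAX) { /* extbuf length is limited to 64KB */
		rte_pktmbuf_free(m);
		return NULL;
	}
	buf_len = total;

	buf = rte_malloc("vhost-extbuf", buf_len, RTE_CACHE_LINE_SIZE);
	if (buf == NULL) {
		rte_pktmbuf_free(m);
		return NULL;
	}

	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
						    ext_buf_free_cb, NULL);
	if (shinfo == NULL) {
		rte_free(buf);
		rte_pktmbuf_free(m);
		return NULL;
	}

	rte_pktmbuf_attach_extbuf(m, buf, rte_mem_virt2iova(buf), buf_len, shinfo);
	rte_pktmbuf_reset_headroom(m);
	return m;
}

In virtio_dev_tx_split()/virtio_dev_tx_packed() such a helper would replace
the rte_pktmbuf_alloc() call, using the buf_len reported by fill_vec_buf_*()
as the packet size, so the dequeue API keeps a single mempool argument. On
the application side, RTE_MBUF_HAS_EXTBUF(m) is enough to tell these mbufs
apart where it matters.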