On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> Indirect descriptors are usually supported by virtio-net devices,
> allowing a larger number of requests to be dispatched.
> 
> When the virtio device sends a packet using indirect descriptors,
> only one slot is used in the ring, even for large packets.
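
To make the slot saving concrete, here is a rough sketch of the layout
(vring_desc follows the standard vring layout; fill_indirect() is an
illustrative helper, not an existing virtio or DPDK function):

    #include <stdint.h>

    struct vring_desc {
            uint64_t addr;   /* guest-physical buffer address */
            uint32_t len;    /* buffer length in bytes */
            uint16_t flags;  /* VRING_DESC_F_NEXT / _INDIRECT / ... */
            uint16_t next;   /* index of the chained descriptor */
    };

    #define VRING_DESC_F_NEXT     1
    #define VRING_DESC_F_INDIRECT 4

    /*
     * Describe a packet scattered over n buffers behind a single
     * ring slot: the slot points at the table, and the table
     * entries point at the actual buffers.
     */
    static void
    fill_indirect(struct vring_desc *slot, struct vring_desc *table,
                  uint64_t table_gpa, const uint64_t *buf_addr,
                  const uint32_t *buf_len, uint16_t n)
    {
            uint16_t i;

            for (i = 0; i < n; i++) {
                    table[i].addr  = buf_addr[i];
                    table[i].len   = buf_len[i];
                    table[i].flags = (i + 1 < n) ? VRING_DESC_F_NEXT : 0;
                    table[i].next  = i + 1;
            }
            slot->addr  = table_gpa;           /* where the table lives */
            slot->len   = n * sizeof(*table);  /* table size in bytes */
            slot->flags = VRING_DESC_F_INDIRECT;
    }

The device follows slot->addr to the table and walks it like a normal
chain, so the packet costs one ring slot regardless of segment count.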
> 
> The main effect is to improve the 0% packet loss benchmark.
> A PVP benchmark using Moongen (64-byte packets) on the TE, and
> testpmd (fwd io on the host, macswap in the VM) on the DUT, shows
> a +50% gain for zero loss.
> 
> On the downside, a micro-benchmark using testpmd txonly in the VM and
> rxonly on the host shows a loss of between 1 and 4%. But depending on
> the needs, the feature can be disabled at VM boot time by passing the
> indirect_desc=off argument to the vhost-user device in QEMU.
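
For reference, the opt-out mentioned above would look roughly like this
on the QEMU command line (illustrative fragment only; the indirect_desc
property is set on the virtio-net device backed by the vhost-user
netdev, and the exact chardev/netdev wiring depends on the setup):

    -chardev socket,id=chr0,path=/tmp/vhost-user.sock \
    -netdev type=vhost-user,id=net0,chardev=chr0 \
    -device virtio-net-pci,netdev=net0,indirect_desc=off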

Even better, change the guest pmd to use indirect
descriptors only when it makes sense (e.g. for
sufficiently large packets).

I would be very interested to know when it does make
sense.

The feature is there; it's up to the guest whether to
use it.
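
A strawman for such a policy in the guest TX path (sketch only;
use_indirect() is a hypothetical hook, and the thresholds are guesses
to be tuned from benchmarks like the ones quoted above):

    #include <stdint.h>

    /*
     * Hypothetical per-packet policy: an indirect table costs an
     * extra allocation and an extra cache miss on the device side,
     * so it should only win when it saves enough ring slots
     * (multi-segment or large packets).
     */
    static inline int
    use_indirect(uint16_t nb_segs, uint32_t pkt_len)
    {
            return nb_segs > 2 || pkt_len >= 1024;
    }

The guest pmd would then pick the descriptor layout per packet rather
than per device, keeping the small-packet micro-benchmark regression
out of the common case.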


> Signed-off-by: Maxime Coquelin <maxime.coquelin at redhat.com>
> ---
> Changes since v2:
> =================
>  - Revert to not checking the feature flag, to be aligned with the
> kernel implementation
>  - Ensure we don't have nested indirect descriptors
>  - Ensure the indirect desc address is valid, to protect against
> malicious guests
> 
> Changes since RFC:
> =================
>  - Enrich commit message with figures
>  - Rebased on top of dpdk-next-virtio's master
>  - Add feature check to ensure we don't receive an indirect desc
> if not supported by the virtio driver
> 
>  lib/librte_vhost/vhost.c      |  3 ++-
>  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
>  2 files changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 46095c3..30bb0ce 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -65,7 +65,8 @@
>                               (1ULL << VIRTIO_NET_F_CSUM)    | \
>                               (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
>                               (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
> -                             (1ULL << VIRTIO_NET_F_GUEST_TSO6))
> +                             (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
> +                             (1ULL << VIRTIO_RING_F_INDIRECT_DESC))
>  
>  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
>  
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8a151af..2e0a587 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
>  }
>  
>  static inline int __attribute__((always_inline))
> -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -               struct rte_mbuf *m, uint16_t desc_idx,
> +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
> +               uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
>                 struct rte_mempool *mbuf_pool)
>  {
>       struct vring_desc *desc;
> @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>       /* A counter to avoid desc dead loop chain */
>       uint32_t nr_desc = 1;
>  
> -     desc = &vq->desc[desc_idx];
> -     if (unlikely(desc->len < dev->vhost_hlen))
> +     desc = &descs[desc_idx];
> +     if (unlikely((desc->len < dev->vhost_hlen) ||
> +                     (desc->flags & VRING_DESC_F_INDIRECT)))
>               return -1;
>  
>       desc_addr = gpa_to_vva(dev, desc->addr);
> @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>        */
>       if (likely((desc->len == dev->vhost_hlen) &&
>                  (desc->flags & VRING_DESC_F_NEXT) != 0)) {
> -             desc = &vq->desc[desc->next];
> +             desc = &descs[desc->next];
> +             if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> +                     return -1;
>  
>               desc_addr = gpa_to_vva(dev, desc->addr);
>               if (unlikely(!desc_addr))


Just to make sure, does this still allow a chain of
direct descriptors ending with an indirect one?
This is legal as per spec.
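
For concreteness, the layout being asked about (descs[] is the ring;
table_gpa and n are illustrative, the indirect table lives elsewhere
in guest memory):

    descs[0].flags = VRING_DESC_F_NEXT;       /* direct */
    descs[0].next  = 1;
    descs[1].flags = VRING_DESC_F_NEXT;       /* direct */
    descs[1].next  = 2;
    descs[2].flags = VRING_DESC_F_INDIRECT;   /* tail is indirect */
    descs[2].addr  = table_gpa;               /* table of vring_desc */
    descs[2].len   = n * sizeof(struct vring_desc);

As posted, the hunks above return -1 as soon as a chained descriptor
carries VRING_DESC_F_INDIRECT, so this layout would be rejected.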

> @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                       if ((desc->flags & VRING_DESC_F_NEXT) == 0)
>                               break;
>  
> -                     if (unlikely(desc->next >= vq->size ||
> -                                  ++nr_desc > vq->size))
> +                     if (unlikely(desc->next >= max_desc ||
> +                                  ++nr_desc > max_desc))
> +                             return -1;
> +                     desc = &descs[desc->next];
> +                     if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>                               return -1;
> -                     desc = &vq->desc[desc->next];
>  
>                       desc_addr = gpa_to_vva(dev, desc->addr);
>                       if (unlikely(!desc_addr))
> @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>       /* Prefetch descriptor index. */
>       rte_prefetch0(&vq->desc[desc_indexes[0]]);
>       for (i = 0; i < count; i++) {
> +             struct vring_desc *desc;
> +             uint16_t sz, idx;
>               int err;
>  
>               if (likely(i + 1 < count))
>                       rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
>  
> +             if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> +                     desc = (struct vring_desc *)gpa_to_vva(dev,
> +                                     vq->desc[desc_indexes[i]].addr);
> +                     if (unlikely(!desc))
> +                             break;
> +
> +                     rte_prefetch0(desc);
> +                     sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
> +                     idx = 0;
> +             } else {
> +                     desc = vq->desc;
> +                     sz = vq->size;
> +                     idx = desc_indexes[i];
> +             }
> +
>               pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>               if (unlikely(pkts[i] == NULL)) {
>                       RTE_LOG(ERR, VHOST_DATA,
>                               "Failed to allocate memory for mbuf.\n");
>                       break;
>               }
> -             err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
> -                                     mbuf_pool);
> +             err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
>               if (unlikely(err)) {
>                       rte_pktmbuf_free(pkts[i]);
>                       break;
> -- 
> 2.7.4

Something that I'm missing here: it's legal for the guest
to add indirect descriptors for RX.
I don't see the handling of RX here, though.
I think it's required for spec compliance.
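
For comparison, the resolution step on the enqueue (RX) path would
presumably mirror the dequeue hunk above (sketch only, not part of
this patch; vq, dev, gpa_to_vva() and unlikely() are the names used
in the hunks above, desc_idx stands for the slot being processed):

    struct vring_desc *descs = vq->desc;
    uint16_t sz  = vq->size;
    uint16_t idx = desc_idx;

    if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
            descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
                            vq->desc[idx].addr);
            if (unlikely(!descs))
                    return -1;              /* malicious/bogus address */
            sz  = vq->desc[idx].len / sizeof(*descs);
            idx = 0;                        /* chains start at table[0] */
    }
    /* ...then walk descs[idx] exactly as the direct case does. */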

-- 
MST
