On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
> 
> 
> On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> > > Indirect descriptors are usually supported by virtio-net devices,
> > > allowing to dispatch a larger number of requests.
> > > 
> > > When the virtio device sends a packet using indirect descriptors,
> > > only one slot is used in the ring, even for large packets.
> > > 
> > > The main effect is to improve the 0% packet loss benchmark.
> > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > zero loss.
> > > 
> > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > rxonly on host shows a loss between 1 and 4%. But depending on
> > > the needs, feature can be disabled at VM boot time by passing
> > > indirect_desc=off argument to vhost-user device in Qemu.
> > 
> > Even better, change guest pmd to only use indirect
> > descriptors when this makes sense (e.g. sufficiently
> > large packets).
> With the micro-benchmark, the degradation is quite constant whatever
> the packet size.
> 
> For PVP, I could not test with larger packets than 64 bytes, as I don't
> have a 40G interface,

Don't 64 byte packets fit in a single slot anyway?
Why would there be an effect with that?

> and line rate with 10G is reached rapidly.

Right, but focus on packet loss. You can have that at any rate.

> 
> > I would be very interested to know when does it make
> > sense.
> > 
> > The feature is there, it's up to guest whether to
> > use it.
> Do you mean the PMD should detect dynamically whether using indirect,
> or having an option at device init time to enable or not the feature?

guest PMD should not use indirect where it slows things down.

> > 
> > 
> > > Signed-off-by: Maxime Coquelin <maxime.coquelin at redhat.com>
> > > ---
> > > Changes since v2:
> > > =================
> > >  - Revert back to not checking feature flag to be aligned with
> > > kernel implementation
> > >  - Ensure we don't have nested indirect descriptors
> > >  - Ensure the indirect desc address is valid, to protect against
> > > malicious guests
> > > 
> > > Changes since RFC:
> > > =================
> > >  - Enrich commit message with figures
> > >  - Rebased on top of dpdk-next-virtio's master
> > >  - Add feature check to ensure we don't receive an indirect desc
> > > if not supported by the virtio driver
> > > 
> > >  lib/librte_vhost/vhost.c      |  3 ++-
> > >  lib/librte_vhost/virtio_net.c | 41 
> > > +++++++++++++++++++++++++++++++----------
> > >  2 files changed, 33 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > > index 46095c3..30bb0ce 100644
> > > --- a/lib/librte_vhost/vhost.c
> > > +++ b/lib/librte_vhost/vhost.c
> > > @@ -65,7 +65,8 @@
> > >                           (1ULL << VIRTIO_NET_F_CSUM)    | \
> > >                           (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
> > >                           (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
> > > -                         (1ULL << VIRTIO_NET_F_GUEST_TSO6))
> > > +                         (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
> > > +                         (1ULL << VIRTIO_RING_F_INDIRECT_DESC))
> > > 
> > >  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
> > > 
> > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > > index 8a151af..2e0a587 100644
> > > --- a/lib/librte_vhost/virtio_net.c
> > > +++ b/lib/librte_vhost/virtio_net.c
> > > @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const 
> > > struct ether_addr *mac)
> > >  }
> > > 
> > >  static inline int __attribute__((always_inline))
> > > -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > -           struct rte_mbuf *m, uint16_t desc_idx,
> > > +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
> > > +           uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
> > >             struct rte_mempool *mbuf_pool)
> > >  {
> > >   struct vring_desc *desc;
> > > @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct 
> > > vhost_virtqueue *vq,
> > >   /* A counter to avoid desc dead loop chain */
> > >   uint32_t nr_desc = 1;
> > > 
> > > - desc = &vq->desc[desc_idx];
> > > - if (unlikely(desc->len < dev->vhost_hlen))
> > > + desc = &descs[desc_idx];
> > > + if (unlikely((desc->len < dev->vhost_hlen)) ||
> > > +                 (desc->flags & VRING_DESC_F_INDIRECT))
> > >           return -1;
> > > 
> > >   desc_addr = gpa_to_vva(dev, desc->addr);
> > > @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct 
> > > vhost_virtqueue *vq,
> > >    */
> > >   if (likely((desc->len == dev->vhost_hlen) &&
> > >              (desc->flags & VRING_DESC_F_NEXT) != 0)) {
> > > -         desc = &vq->desc[desc->next];
> > > +         desc = &descs[desc->next];
> > > +         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> > > +                 return -1;
> > > 
> > >           desc_addr = gpa_to_vva(dev, desc->addr);
> > >           if (unlikely(!desc_addr))
> > 
> > 
> > Just to make sure, does this still allow a chain of
> > direct descriptors ending with an indirect one?
> > This is legal as per spec.
> > 
> > > @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct 
> > > vhost_virtqueue *vq,
> > >                   if ((desc->flags & VRING_DESC_F_NEXT) == 0)
> > >                           break;
> > > 
> > > -                 if (unlikely(desc->next >= vq->size ||
> > > -                              ++nr_desc > vq->size))
> > > +                 if (unlikely(desc->next >= max_desc ||
> > > +                              ++nr_desc > max_desc))
> > > +                         return -1;
> > > +                 desc = &descs[desc->next];
> > > +                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> > >                           return -1;
> > > -                 desc = &vq->desc[desc->next];
> > > 
> > >                   desc_addr = gpa_to_vva(dev, desc->addr);
> > >                   if (unlikely(!desc_addr))
> > > @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> > >   /* Prefetch descriptor index. */
> > >   rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > >   for (i = 0; i < count; i++) {
> > > +         struct vring_desc *desc;
> > > +         uint16_t sz, idx;
> > >           int err;
> > > 
> > >           if (likely(i + 1 < count))
> > >                   rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
> > > 
> > > +         if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> > > +                 desc = (struct vring_desc *)gpa_to_vva(dev,
> > > +                                 vq->desc[desc_indexes[i]].addr);
> > > +                 if (unlikely(!desc))
> > > +                         break;
> > > +
> > > +                 rte_prefetch0(desc);
> > > +                 sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
> > > +                 idx = 0;
> > > +         } else {
> > > +                 desc = vq->desc;
> > > +                 sz = vq->size;
> > > +                 idx = desc_indexes[i];
> > > +         }
> > > +
> > >           pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> > >           if (unlikely(pkts[i] == NULL)) {
> > >                   RTE_LOG(ERR, VHOST_DATA,
> > >                           "Failed to allocate memory for mbuf.\n");
> > >                   break;
> > >           }
> > > -         err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
> > > -                                 mbuf_pool);
> > > +         err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
> > >           if (unlikely(err)) {
> > >                   rte_pktmbuf_free(pkts[i]);
> > >                   break;
> > > --
> > > 2.7.4
> > 
> > Something that I'm missing here: it's legal for guest
> > to add indirect descriptors for RX.
> > I don't see the handling of RX here though.
> > I think it's required for spec compliance.
> > 

Reply via email to