Hi all,

Any comments on this patch?
And what is the status of merging it into mainline?

Thanks in advance,
Changchun

> -----Original Message-----
> From: Ouyang, Changchun
> Sent: Friday, August 15, 2014 12:58 PM
> To: dev@dpdk.org
> Cc: Cao, Waterman; Ouyang, Changchun
> Subject: [PATCH] examples/vhost: Support jumbo frame in user space vhost
> 
> This patch supports the mergeable RX feature and thus enables jumbo frame RX
> and TX in user space vhost (as the virtio backend).
> 
> On RX, it reserves enough room in the vring to accommodate one complete
> scattered packet received by the PMD from the physical port, and then copies
> the data from the mbuf to the vring buffers, possibly spanning several vring
> entries and descriptors.
> 
> On TX, it takes a jumbo frame, possibly described by several vring descriptors
> chained together with the 'NEXT' flag, copies them into one scattered packet,
> and transmits it to the physical port through the PMD.
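
The TX side does the inverse: gather the chained descriptor buffers into
fixed-size segments, starting a new segment whenever the current one
fills up. SEG_SIZE and gather_copy() below are made up for illustration
(struct vbuf is reused from the sketch above); the real implementation
is virtio_dev_merge_tx() in the patch, which allocates mbufs from a
mempool instead of filling a fixed array.

    #define SEG_SIZE 2048

    /* Returns the number of segments used; each segment's byte count
     * is written into seg_lens[]. */
    static uint32_t
    gather_copy(const struct vbuf *desc, uint32_t desc_cnt,
            uint8_t segs[][SEG_SIZE], uint32_t *seg_lens, uint32_t max_segs)
    {
            uint32_t d = 0, vb_off = 0, seg_off = 0, seg_num = 0;

            while (d < desc_cnt && seg_num < max_segs) {
                    uint32_t vb_avail = desc[d].len - vb_off;
                    uint32_t seg_avail = SEG_SIZE - seg_off;
                    uint32_t cpy_len =
                            vb_avail < seg_avail ? vb_avail : seg_avail;

                    memcpy(&segs[seg_num][seg_off],
                            desc[d].addr + vb_off, cpy_len);
                    vb_off += cpy_len;
                    seg_off += cpy_len;

                    if (vb_off == desc[d].len) {
                            /* Descriptor buffer consumed, take the next. */
                            d++;
                            vb_off = 0;
                    }
                    if (seg_off == SEG_SIZE && d < desc_cnt) {
                            /* Segment full: record it, start a new one. */
                            seg_lens[seg_num++] = seg_off;
                            seg_off = 0;
                    }
            }
            if (seg_num < max_segs)
                    seg_lens[seg_num++] = seg_off;
            return seg_num;
    }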
> 
> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
> Acked-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  examples/vhost/main.c       | 726 ++++++++++++++++++++++++++++++++++++++++----
>  examples/vhost/virtio-net.h |  14 +
>  2 files changed, 687 insertions(+), 53 deletions(-)
> 
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index 193aa25..7d9e6a2 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -106,6 +106,8 @@
>  #define BURST_RX_WAIT_US 15          /* Defines how long we wait between retries on RX */
>  #define BURST_RX_RETRIES 4           /* Number of retries on RX. */
> 
> +#define JUMBO_FRAME_MAX_SIZE    0x2600
> +
>  /* State of virtio device. */
>  #define DEVICE_MAC_LEARNING 0
>  #define DEVICE_RX                    1
> @@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv)
>                                       us_vhost_usage(prgname);
>                                       return -1;
>                               } else {
> -                                     if (ret)
> +                                     if (ret) {
> +                                             vmdq_conf_default.rxmode.jumbo_frame = 1;
> +                                             vmdq_conf_default.rxmode.max_rx_pkt_len
> +                                                     = JUMBO_FRAME_MAX_SIZE;
>                                               VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
> +                                     }
>                               }
>                       }
> 
> @@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv)
>               return -1;
>       }
> 
> +     if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
> +             RTE_LOG(INFO, VHOST_PORT,
> +                     "Vhost zero copy doesn't support jumbo frame, "
> +                     "please specify '--mergeable 0' to disable the "
> +                     "mergeable feature.\n");
> +             return -1;
> +     }
> +
>       return 0;
>  }
> 
> @@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
>   * This function adds buffers to the virtio devices RX virtqueue. Buffers can
>   * be received from the physical port or from another virtio device. A packet
>   * count is returned to indicate the number of packets that were successfully
> - * added to the RX queue.
> + * added to the RX queue. This function works when mergeable is disabled.
>   */
>  static inline uint32_t __attribute__((always_inline))
>  virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
> @@ -930,7 +944,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>       uint64_t buff_hdr_addr = 0;
>       uint32_t head[MAX_PKT_BURST], packet_len = 0;
>       uint32_t head_idx, packet_success = 0;
> -     uint32_t mergeable, mrg_count = 0;
>       uint32_t retry = 0;
>       uint16_t avail_idx, res_cur_idx;
>       uint16_t res_base_idx, res_end_idx;
> @@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
>       vq = dev->virtqueue[VIRTIO_RXQ];
>       count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
> +
>       /* As many data cores may want access to available buffers, they need to be reserved. */
>       do {
>               res_base_idx = vq->last_used_idx_res;
> @@ -976,9 +990,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>       /* Prefetch available ring to retrieve indexes. */
>       rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
> 
> -     /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
> -     mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
> -
>       /* Retrieve all of the head indexes first to avoid caching issues. */
>       for (head_idx = 0; head_idx < count; head_idx++)
>               head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
> @@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>               /* Prefetch buffer address. */
>               rte_prefetch0((void*)(uintptr_t)buff_addr);
> 
> -             if (mergeable && (mrg_count != 0)) {
> -                     desc->len = packet_len = rte_pktmbuf_data_len(buff);
> -             } else {
> -                     /* Copy virtio_hdr to packet and increment buffer address */
> -                     buff_hdr_addr = buff_addr;
> -                     packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
> +             /* Copy virtio_hdr to packet and increment buffer address */
> +             buff_hdr_addr = buff_addr;
> +             packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
> 
> -                     /*
> -                      * If the descriptors are chained the header and data are placed in
> -                      * separate buffers.
> -                      */
> -                     if (desc->flags & VRING_DESC_F_NEXT) {
> -                             desc->len = vq->vhost_hlen;
> -                             desc = &vq->desc[desc->next];
> -                             /* Buffer address translation. */
> -                             buff_addr = gpa_to_vva(dev, desc->addr);
> -                             desc->len = rte_pktmbuf_data_len(buff);
> -                     } else {
> -                             buff_addr += vq->vhost_hlen;
> -                             desc->len = packet_len;
> -                     }
> +             /*
> +              * If the descriptors are chained the header and data are
> +              * placed in separate buffers.
> +              */
> +             if (desc->flags & VRING_DESC_F_NEXT) {
> +                     desc->len = vq->vhost_hlen;
> +                     desc = &vq->desc[desc->next];
> +                     /* Buffer address translation. */
> +                     buff_addr = gpa_to_vva(dev, desc->addr);
> +                     desc->len = rte_pktmbuf_data_len(buff);
> +             } else {
> +                     buff_addr += vq->vhost_hlen;
> +                     desc->len = packet_len;
>               }
> 
> -             PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
> -
>               /* Update used ring with desc information */
>               vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
>               vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
> 
>               /* Copy mbuf data to buffer */
> -             rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
> +             rte_memcpy((void *)(uintptr_t)buff_addr,
> +                     (const void *)buff->pkt.data,
> +                     rte_pktmbuf_data_len(buff));
> +             PRINT_PACKET(dev, (uintptr_t)buff_addr,
> +                     rte_pktmbuf_data_len(buff), 0);
> 
>               res_cur_idx++;
>               packet_success++;
> 
> -             /* If mergeable is disabled then a header is required per buffer. */
> -             if (!mergeable) {
> -                     rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
> -                     PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> -             } else {
> -                     mrg_count++;
> -                     /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
> -                     if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
> -                             virtio_hdr.num_buffers = mrg_count;
> -                             LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
> -                             rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
> -                             PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> -                             mrg_count = 0;
> -                     }
> -             }
> +             rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
> +                     (const void *)&virtio_hdr, vq->vhost_hlen);
> +
> +             PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
> +
>               if (res_cur_idx < res_end_idx) {
>                       /* Prefetch descriptor index. */
>                       rte_prefetch0(&vq->desc[head[packet_success]]);
> @@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
>       return count;
>  }
> 
> +static inline uint32_t __attribute__((always_inline))
> +copy_from_mbuf_to_vring(struct virtio_net *dev,
> +     uint16_t res_base_idx, uint16_t res_end_idx,
> +     struct rte_mbuf *pkt)
> +{
> +     uint32_t vec_idx = 0;
> +     uint32_t entry_success = 0;
> +     struct vhost_virtqueue *vq;
> +     /* The virtio_hdr is initialised to 0. */
> +     struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
> +             {0, 0, 0, 0, 0, 0}, 0};
> +     uint16_t cur_idx = res_base_idx;
> +     uint64_t vb_addr = 0;
> +     uint64_t vb_hdr_addr = 0;
> +     uint32_t seg_offset = 0;
> +     uint32_t vb_offset = 0;
> +     uint32_t seg_avail;
> +     uint32_t vb_avail;
> +     uint32_t cpy_len, entry_len;
> +
> +     if (pkt == NULL)
> +             return 0;
> +
> +     LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
> +             "End Index %d\n",
> +             dev->device_fh, cur_idx, res_end_idx);
> +
> +     /*
> +      * Convert from gpa to vva
> +      * (guest physical addr -> vhost virtual addr)
> +      */
> +     vq = dev->virtqueue[VIRTIO_RXQ];
> +     vb_addr =
> +             gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
> +     vb_hdr_addr = vb_addr;
> +
> +     /* Prefetch buffer address. */
> +     rte_prefetch0((void *)(uintptr_t)vb_addr);
> +
> +     virtio_hdr.num_buffers = res_end_idx - res_base_idx;
> +
> +     LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
> +             dev->device_fh, virtio_hdr.num_buffers);
> +
> +     rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
> +             (const void *)&virtio_hdr, vq->vhost_hlen);
> +
> +     PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
> +
> +     seg_avail = rte_pktmbuf_data_len(pkt);
> +     vb_offset = vq->vhost_hlen;
> +     vb_avail =
> +             vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
> +
> +     entry_len = vq->vhost_hlen;
> +
> +     if (vb_avail == 0) {
> +             uint32_t desc_idx =
> +                     vq->buf_vec[vec_idx].desc_idx;
> +             vq->desc[desc_idx].len = vq->vhost_hlen;
> +
> +             if ((vq->desc[desc_idx].flags
> +                     & VRING_DESC_F_NEXT) == 0) {
> +                     /* Update used ring with desc information */
> +                     vq->used->ring[cur_idx & (vq->size - 1)].id
> +                             = vq->buf_vec[vec_idx].desc_idx;
> +                     vq->used->ring[cur_idx & (vq->size - 1)].len
> +                             = entry_len;
> +
> +                     entry_len = 0;
> +                     cur_idx++;
> +                     entry_success++;
> +             }
> +
> +             vec_idx++;
> +             vb_addr =
> +                     gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
> +
> +             /* Prefetch buffer address. */
> +             rte_prefetch0((void *)(uintptr_t)vb_addr);
> +             vb_offset = 0;
> +             vb_avail = vq->buf_vec[vec_idx].buf_len;
> +     }
> +
> +     cpy_len = RTE_MIN(vb_avail, seg_avail);
> +
> +     while (cpy_len > 0) {
> +             /* Copy mbuf data to vring buffer */
> +             rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
> +                     (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
> +                     cpy_len);
> +
> +             PRINT_PACKET(dev,
> +                     (uintptr_t)(vb_addr + vb_offset),
> +                     cpy_len, 0);
> +
> +             seg_offset += cpy_len;
> +             vb_offset += cpy_len;
> +             seg_avail -= cpy_len;
> +             vb_avail -= cpy_len;
> +             entry_len += cpy_len;
> +
> +             if (seg_avail != 0) {
> +                     /*
> +                      * The virtio buffer in this vring
> +                      * entry reaches its end, but the
> +                      * segment doesn't complete.
> +                      */
> +                     if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
> +                             VRING_DESC_F_NEXT) == 0) {
> +                             /* Update used ring with desc information */
> +                             vq->used->ring[cur_idx & (vq->size - 1)].id
> +                                     = vq->buf_vec[vec_idx].desc_idx;
> +                             vq->used->ring[cur_idx & (vq->size - 1)].len
> +                                     = entry_len;
> +                             entry_len = 0;
> +                             cur_idx++;
> +                             entry_success++;
> +                     }
> +
> +                     vec_idx++;
> +                     vb_addr = gpa_to_vva(dev,
> +                             vq->buf_vec[vec_idx].buf_addr);
> +                     vb_offset = 0;
> +                     vb_avail = vq->buf_vec[vec_idx].buf_len;
> +                     cpy_len = RTE_MIN(vb_avail, seg_avail);
> +             } else {
> +                     /*
> +                      * The current segment is complete; continue to
> +                      * check whether the whole packet is complete.
> +                      */
> +                     pkt = pkt->pkt.next;
> +                     if (pkt != NULL) {
> +                             /*
> +                              * There are more segments.
> +                              */
> +                             if (vb_avail == 0) {
> +                                     /*
> +                                      * The current buffer from the vring is
> +                                      * used up; fetch the next buffer
> +                                      * from buf_vec.
> +                                      */
> +                                     uint32_t desc_idx =
> +                                             vq->buf_vec[vec_idx].desc_idx;
> +                                     vq->desc[desc_idx].len = vb_offset;
> +
> +                                     if ((vq->desc[desc_idx].flags &
> +                                             VRING_DESC_F_NEXT) == 0) {
> +                                             uint16_t wrapped_idx =
> +                                                     cur_idx & (vq->size - 1);
> +                                             /*
> +                                              * Update used ring with the
> +                                              * descriptor information
> +                                              */
> +                                             vq->used->ring[wrapped_idx].id
> +                                                     = desc_idx;
> +                                             vq->used->ring[wrapped_idx].len
> +                                                     = entry_len;
> +                                             entry_success++;
> +                                             entry_len = 0;
> +                                             cur_idx++;
> +                                     }
> +
> +                                     /* Get next buffer from buf_vec. */
> +                                     vec_idx++;
> +                                     vb_addr = gpa_to_vva(dev,
> +                                             vq->buf_vec[vec_idx].buf_addr);
> +                                     vb_avail =
> +                                             vq->buf_vec[vec_idx].buf_len;
> +                                     vb_offset = 0;
> +                             }
> +
> +                             seg_offset = 0;
> +                             seg_avail = rte_pktmbuf_data_len(pkt);
> +                             cpy_len = RTE_MIN(vb_avail, seg_avail);
> +                     } else {
> +                             /*
> +                              * This whole packet completes.
> +                              */
> +                             uint32_t desc_idx =
> +                                     vq->buf_vec[vec_idx].desc_idx;
> +                             vq->desc[desc_idx].len = vb_offset;
> +
> +                             while (vq->desc[desc_idx].flags &
> +                                     VRING_DESC_F_NEXT) {
> +                                     desc_idx = vq->desc[desc_idx].next;
> +                                     vq->desc[desc_idx].len = 0;
> +                             }
> +
> +                             /* Update used ring with desc information */
> +                             vq->used->ring[cur_idx & (vq->size - 1)].id
> +                                     = vq->buf_vec[vec_idx].desc_idx;
> +                             vq->used->ring[cur_idx & (vq->size - 1)].len
> +                                     = entry_len;
> +                             entry_len = 0;
> +                             cur_idx++;
> +                             entry_success++;
> +                             seg_avail = 0;
> +                             cpy_len = RTE_MIN(vb_avail, seg_avail);
> +                     }
> +             }
> +     }
> +
> +     return entry_success;
> +}
> +
> +/*
> + * This function adds buffers to the virtio devices RX virtqueue. Buffers can
> + * be received from the physical port or from another virtio device. A packet
> + * count is returned to indicate the number of packets that were successfully
> + * added to the RX queue. This function works for mergeable RX.
> + */
> +static inline uint32_t __attribute__((always_inline))
> +virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
> +     uint32_t count)
> +{
> +     struct vhost_virtqueue *vq;
> +     uint32_t pkt_idx = 0, entry_success = 0;
> +     uint32_t retry = 0;
> +     uint16_t avail_idx, res_cur_idx;
> +     uint16_t res_base_idx, res_end_idx;
> +     uint8_t success = 0;
> +
> +     LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
> +             dev->device_fh);
> +     vq = dev->virtqueue[VIRTIO_RXQ];
> +     count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
> +
> +     if (count == 0)
> +             return 0;
> +
> +     for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> +             uint32_t secure_len = 0;
> +             uint16_t need_cnt;
> +             uint32_t vec_idx = 0;
> +             uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq->vhost_hlen;
> +             uint16_t i, id;
> +
> +             do {
> +                     /*
> +                      * As many data cores may want access to available
> +                      * buffers, they need to be reserved.
> +                      */
> +                     res_base_idx = vq->last_used_idx_res;
> +                     res_cur_idx = res_base_idx;
> +
> +                     do {
> +                             avail_idx = *((volatile uint16_t *)&vq->avail->idx);
> +                             if (unlikely(res_cur_idx == avail_idx)) {
> +                                     /*
> +                                      * If retry is enabled and the queue is
> +                                      * full then we wait and retry to avoid
> +                                      * packet loss.
> +                                      */
> +                                     if (enable_retry) {
> +                                             uint8_t cont = 0;
> +                                             for (retry = 0; retry < burst_rx_retry_num; retry++) {
> +                                                     rte_delay_us(burst_rx_delay_time);
> +                                                     avail_idx =
> +                                                             *((volatile uint16_t *)&vq->avail->idx);
> +                                                     if (likely(res_cur_idx != avail_idx)) {
> +                                                             cont = 1;
> +                                                             break;
> +                                                     }
> +                                             }
> +                                             if (cont == 1)
> +                                                     continue;
> +                                     }
> +
> +                                     LOG_DEBUG(VHOST_DATA,
> +                                             "(%"PRIu64") Failed "
> +                                             "to get enough desc from "
> +                                             "vring\n",
> +                                             dev->device_fh);
> +                                     return pkt_idx;
> +                             } else {
> +                                     uint16_t wrapped_idx =
> +                                             (res_cur_idx) & (vq->size - 1);
> +                                     uint32_t idx =
> +                                             vq->avail->ring[wrapped_idx];
> +                                     uint8_t next_desc;
> +
> +                                     do {
> +                                             next_desc = 0;
> +                                             secure_len += vq->desc[idx].len;
> +                                             if (vq->desc[idx].flags &
> +                                                     VRING_DESC_F_NEXT) {
> +                                                     idx = vq->desc[idx].next;
> +                                                     next_desc = 1;
> +                                             }
> +                                     } while (next_desc);
> +
> +                                     res_cur_idx++;
> +                             }
> +                     } while (pkt_len > secure_len);
> +
> +                     /* vq->last_used_idx_res is atomically updated. */
> +                     success = rte_atomic16_cmpset(&vq->last_used_idx_res,
> +                                                     res_base_idx,
> +                                                     res_cur_idx);
> +             } while (success == 0);
> +
> +             id = res_base_idx;
> +             need_cnt = res_cur_idx - res_base_idx;
> +
> +             for (i = 0; i < need_cnt; i++, id++) {
> +                     uint16_t wrapped_idx = id & (vq->size - 1);
> +                     uint32_t idx = vq->avail->ring[wrapped_idx];
> +                     uint8_t next_desc;
> +                     do {
> +                             next_desc = 0;
> +                             vq->buf_vec[vec_idx].buf_addr =
> +                                     vq->desc[idx].addr;
> +                             vq->buf_vec[vec_idx].buf_len =
> +                                     vq->desc[idx].len;
> +                             vq->buf_vec[vec_idx].desc_idx = idx;
> +                             vec_idx++;
> +
> +                             if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
> +                                     idx = vq->desc[idx].next;
> +                                     next_desc = 1;
> +                             }
> +                     } while (next_desc);
> +             }
> +
> +             res_end_idx = res_cur_idx;
> +
> +             entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
> +                     res_end_idx, pkts[pkt_idx]);
> +
> +             rte_compiler_barrier();
> +
> +             /*
> +              * Wait until it's our turn to add our buffer
> +              * to the used ring.
> +              */
> +             while (unlikely(vq->last_used_idx != res_base_idx))
> +                     rte_pause();
> +
> +             *(volatile uint16_t *)&vq->used->idx += entry_success;
> +             vq->last_used_idx = res_end_idx;
> +
> +             /* Kick the guest if necessary. */
> +             if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> +                     eventfd_write((int)vq->kickfd, 1);
> +     }
> +
> +     return count;
> +}
> +
>  /*
>   * Compares a packet destination MAC address to a device MAC address.
>   */
> @@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
>                               /*drop the packet if the device is marked for removal*/
>                               LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
>                       } else {
> +                             uint32_t mergeable =
> +                                     dev_ll->dev->features &
> +                                     (1 << VIRTIO_NET_F_MRG_RXBUF);
> +
>                               /*send the packet to the local virtio device*/
> -                             ret = virtio_dev_rx(dev_ll->dev, &m, 1);
> +                             if (likely(mergeable == 0))
> +                                     ret = virtio_dev_rx(dev_ll->dev, &m, 1);
> +                             else
> +                                     ret = virtio_dev_merge_rx(dev_ll->dev,
> +                                             &m, 1);
> +
>                               if (enable_stats) {
>                                       rte_atomic64_add(
>                                       &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
> @@ -1231,7 +1589,7 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
>       struct mbuf_table *tx_q;
>       struct vlan_ethhdr *vlan_hdr;
>       struct rte_mbuf **m_table;
> -     struct rte_mbuf *mbuf;
> +     struct rte_mbuf *mbuf, *prev;
>       unsigned len, ret, offset = 0;
>       const uint16_t lcore_id = rte_lcore_id();
>       struct virtio_net_data_ll *dev_ll = ll_root_used;
> @@ -1284,12 +1642,14 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
>       /* Allocate an mbuf and populate the structure. */
>       mbuf = rte_pktmbuf_alloc(mbuf_pool);
>       if (unlikely(mbuf == NULL)) {
> -             RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
> +             RTE_LOG(ERR, VHOST_DATA,
> +                     "Failed to allocate memory for mbuf.\n");
>               return;
>       }
> 
>       mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
> -     mbuf->pkt.pkt_len = mbuf->pkt.data_len;
> +     mbuf->pkt.pkt_len = m->pkt.pkt_len + VLAN_HLEN + offset;
> +     mbuf->pkt.nb_segs = m->pkt.nb_segs;
> 
>       /* Copy ethernet header to mbuf. */
>       rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
> @@ -1304,6 +1664,29 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
>       /* Copy the remaining packet contents to the mbuf. */
>       rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
>               (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
> +
> +     /* Copy the remaining segments for the whole packet. */
> +     prev = mbuf;
> +     while (m->pkt.next) {
> +             /* Allocate an mbuf and populate the structure. */
> +             struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
> +             if (unlikely(next_mbuf == NULL)) {
> +                     rte_pktmbuf_free(mbuf);
> +                     RTE_LOG(ERR, VHOST_DATA,
> +                             "Failed to allocate memory for mbuf.\n");
> +                     return;
> +             }
> +
> +             m = m->pkt.next;
> +             prev->pkt.next = next_mbuf;
> +             prev = next_mbuf;
> +             next_mbuf->pkt.data_len = m->pkt.data_len;
> +
> +             /* Copy data to next mbuf. */
> +             rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
> +                     rte_pktmbuf_mtod(m, const void *), m->pkt.data_len);
> +     }
> +
>       tx_q->m_table[len] = mbuf;
>       len++;
>       if (enable_stats) {
> @@ -1394,6 +1777,7 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
> 
>               /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
>               m.pkt.data_len = desc->len;
> +             m.pkt.pkt_len = desc->len;
>               m.pkt.data = (void*)(uintptr_t)buff_addr;
> 
>               PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
> @@ -1420,6 +1804,227 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
>               eventfd_write((int)vq->kickfd, 1);
>  }
> 
> +/* This function works for TX packets with mergeable feature enabled. */
> +static inline void __attribute__((always_inline))
> +virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
> +{
> +     struct rte_mbuf *m, *prev;
> +     struct vhost_virtqueue *vq;
> +     struct vring_desc *desc;
> +     uint64_t vb_addr = 0;
> +     uint32_t head[MAX_PKT_BURST];
> +     uint32_t used_idx;
> +     uint32_t i;
> +     uint16_t free_entries, entry_success = 0;
> +     uint16_t avail_idx;
> +     uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
> +                     + RTE_PKTMBUF_HEADROOM);
> +
> +     vq = dev->virtqueue[VIRTIO_TXQ];
> +     avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
> +
> +     /* If there are no available buffers then return. */
> +     if (vq->last_used_idx == avail_idx)
> +             return;
> +
> +     LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
> +             dev->device_fh);
> +
> +     /* Prefetch available ring to retrieve head indexes. */
> +     rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
> +
> +     /*get the number of free entries in the ring*/
> +     free_entries = (avail_idx - vq->last_used_idx);
> +
> +     /* Limit to MAX_PKT_BURST. */
> +     free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
> +
> +     LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
> +             dev->device_fh, free_entries);
> +     /* Retrieve all of the head indexes first to avoid caching issues. */
> +     for (i = 0; i < free_entries; i++)
> +             head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
> +
> +     /* Prefetch descriptor index. */
> +     rte_prefetch0(&vq->desc[head[entry_success]]);
> +     rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
> +
> +     while (entry_success < free_entries) {
> +             uint32_t vb_avail, vb_offset;
> +             uint32_t seg_avail, seg_offset;
> +             uint32_t cpy_len;
> +             uint32_t seg_num = 0;
> +             struct rte_mbuf *cur;
> +             uint8_t alloc_err = 0;
> +
> +             desc = &vq->desc[head[entry_success]];
> +
> +             /* Discard first buffer as it is the virtio header */
> +             desc = &vq->desc[desc->next];
> +
> +             /* Buffer address translation. */
> +             vb_addr = gpa_to_vva(dev, desc->addr);
> +             /* Prefetch buffer address. */
> +             rte_prefetch0((void *)(uintptr_t)vb_addr);
> +
> +             used_idx = vq->last_used_idx & (vq->size - 1);
> +
> +             if (entry_success < (free_entries - 1)) {
> +                     /* Prefetch descriptor index. */
> +                     rte_prefetch0(&vq->desc[head[entry_success+1]]);
> +                     rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
> +             }
> +
> +             /* Update used index buffer information. */
> +             vq->used->ring[used_idx].id = head[entry_success];
> +             vq->used->ring[used_idx].len = 0;
> +
> +             vb_offset = 0;
> +             vb_avail = desc->len;
> +             seg_offset = 0;
> +             seg_avail = buf_size;
> +             cpy_len = RTE_MIN(vb_avail, seg_avail);
> +
> +             PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
> +
> +             /* Allocate an mbuf and populate the structure. */
> +             m = rte_pktmbuf_alloc(mbuf_pool);
> +             if (unlikely(m == NULL)) {
> +                     RTE_LOG(ERR, VHOST_DATA,
> +                             "Failed to allocate memory for mbuf.\n");
> +                     return;
> +             }
> +
> +             seg_num++;
> +             cur = m;
> +             prev = m;
> +             while (cpy_len != 0) {
> +                     rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
> +                             (void *)((uintptr_t)(vb_addr + vb_offset)),
> +                             cpy_len);
> +
> +                     seg_offset += cpy_len;
> +                     vb_offset += cpy_len;
> +                     vb_avail -= cpy_len;
> +                     seg_avail -= cpy_len;
> +
> +                     if (vb_avail != 0) {
> +                             /*
> +                              * The segment reachs to its end,
> +                              * The segment reaches its end, while
> +                              * the virtio buffer in the TX vring has
> +                              */
> +                             cur->pkt.data_len = seg_offset;
> +                             m->pkt.pkt_len += seg_offset;
> +                             /* Allocate mbuf and populate the structure. */
> +                             cur = rte_pktmbuf_alloc(mbuf_pool);
> +                             if (unlikely(cur == NULL)) {
> +                                     RTE_LOG(ERR, VHOST_DATA, "Failed to "
> +                                             "allocate memory for mbuf.\n");
> +                                     rte_pktmbuf_free(m);
> +                                     alloc_err = 1;
> +                                     break;
> +                             }
> +
> +                             seg_num++;
> +                             prev->pkt.next = cur;
> +                             prev = cur;
> +                             seg_offset = 0;
> +                             seg_avail = buf_size;
> +                     } else {
> +                             if (desc->flags & VRING_DESC_F_NEXT) {
> +                                     /*
> +                                      * There are more virtio buffers in
> +                                      * same vring entry need to be copied.
> +                                      */
> +                                     if (seg_avail == 0) {
> +                                             /*
> +                                              * The current segment has no
> +                                              * room to accommodate more
> +                                              * data.
> +                                              */
> +                                             cur->pkt.data_len = seg_offset;
> +                                             m->pkt.pkt_len += seg_offset;
> +                                             /*
> +                                              * Allocate an mbuf and
> +                                              * populate the structure.
> +                                              */
> +                                             cur = rte_pktmbuf_alloc(mbuf_pool);
> +                                             if (unlikely(cur == NULL)) {
> +                                                     RTE_LOG(ERR,
> +                                                             VHOST_DATA,
> +                                                             "Failed to "
> +                                                             "allocate memory "
> +                                                             "for mbuf\n");
> +                                                     rte_pktmbuf_free(m);
> +                                                     alloc_err = 1;
> +                                                     break;
> +                                             }
> +                                             seg_num++;
> +                                             prev->pkt.next = cur;
> +                                             prev = cur;
> +                                             seg_offset = 0;
> +                                             seg_avail = buf_size;
> +                                     }
> +
> +                                     desc = &vq->desc[desc->next];
> +
> +                                     /* Buffer address translation. */
> +                                     vb_addr = gpa_to_vva(dev, desc->addr);
> +                                     /* Prefetch buffer address. */
> +                                     rte_prefetch0((void *)(uintptr_t)vb_addr);
> +                                     vb_offset = 0;
> +                                     vb_avail = desc->len;
> +
> +                                     PRINT_PACKET(dev, (uintptr_t)vb_addr,
> +                                             desc->len, 0);
> +                             } else {
> +                                     /* The whole packet completes. */
> +                                     cur->pkt.data_len = seg_offset;
> +                                     m->pkt.pkt_len += seg_offset;
> +                                     vb_avail = 0;
> +                             }
> +                     }
> +
> +                     cpy_len = RTE_MIN(vb_avail, seg_avail);
> +             }
> +
> +             if (unlikely(alloc_err == 1))
> +                     break;
> +
> +             m->pkt.nb_segs = seg_num;
> +
> +             /*
> +              * If this is the first received packet we need to learn
> +              * the MAC and setup VMDQ
> +              */
> +             if (dev->ready == DEVICE_MAC_LEARNING) {
> +                     if (dev->remove || (link_vmdq(dev, m) == -1)) {
> +                             /*
> +                              * Discard frame if device is scheduled for
> +                              * removal or a duplicate MAC address is found.
> +                              */
> +                             entry_success = free_entries;
> +                             vq->last_used_idx += entry_success;
> +                             rte_pktmbuf_free(m);
> +                             break;
> +                     }
> +             }
> +
> +             virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
> +             vq->last_used_idx++;
> +             entry_success++;
> +             rte_pktmbuf_free(m);
> +     }
> +
> +     rte_compiler_barrier();
> +     vq->used->idx += entry_success;
> +     /* Kick guest if required. */
> +     if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
> +             eventfd_write((int)vq->kickfd, 1);
> +
> +}
> +
>  /*
>   * This function is called by each data core. It handles all RX/TX registered with the
>   * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
> @@ -1440,8 +2045,9 @@ switch_worker(__attribute__((unused)) void *arg)
>       const uint16_t lcore_id = rte_lcore_id();
>       const uint16_t num_cores = (uint16_t)rte_lcore_count();
>       uint16_t rx_count = 0;
> +     uint32_t mergeable = 0;
> 
> -     RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", lcore_id);
> +     RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
>       lcore_ll = lcore_info[lcore_id].lcore_ll;
>       prev_tsc = 0;
> 
> @@ -1497,6 +2103,8 @@ switch_worker(__attribute__((unused)) void *arg)
>               while (dev_ll != NULL) {
>                       /*get virtio device ID*/
>                       dev = dev_ll->dev;
> +                     mergeable =
> +                             dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
> 
>                       if (dev->remove) {
>                               dev_ll = dev_ll->next;
> @@ -1510,7 +2118,15 @@ switch_worker(__attribute__((unused)) void *arg)
>                                       (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
> 
>                               if (rx_count) {
> -                                     ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
> +                                     if (likely(mergeable == 0))
> +                                             ret_count =
> +                                                     virtio_dev_rx(dev,
> +                                                     pkts_burst, rx_count);
> +                                     else
> +                                             ret_count =
> +                                                     virtio_dev_merge_rx(dev,
> +                                                     pkts_burst, rx_count);
> +
>                                       if (enable_stats) {
>                                               rte_atomic64_add(
>                                               &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
> @@ -1520,15 +2136,19 @@ switch_worker(__attribute__((unused)) void *arg)
>                                       }
>                                       while (likely(rx_count)) {
>                                               rx_count--;
> -                                             rte_pktmbuf_free_seg(pkts_burst[rx_count]);
> +                                             rte_pktmbuf_free(pkts_burst[rx_count]);
>                                       }
> 
>                               }
>                       }
> 
> -                     if (!dev->remove)
> +                     if (!dev->remove) {
>                               /*Handle guest TX*/
> -                             virtio_dev_tx(dev, mbuf_pool);
> +                             if (likely(mergeable == 0))
> +                                     virtio_dev_tx(dev, mbuf_pool);
> +                             else
> +                                     virtio_dev_merge_tx(dev, mbuf_pool);
> +                     }
> 
>                       /*move to the next device in the list*/
>                       dev_ll = dev_ll->next;
> diff --git a/examples/vhost/virtio-net.h b/examples/vhost/virtio-net.h
> index 3d1f255..1a2f0dc 100644
> --- a/examples/vhost/virtio-net.h
> +++ b/examples/vhost/virtio-net.h
> @@ -45,6 +45,18 @@
>  /* Enum for virtqueue management. */
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
> 
> +#define BUF_VECTOR_MAX 256
> +
> +/*
> + * Structure contains buffer address, length and descriptor index
> + * from vring to do scatter RX.
> + */
> +struct buf_vector {
> +     uint64_t buf_addr;
> +     uint32_t buf_len;
> +     uint32_t desc_idx;
> +};
> +
>  /*
>   * Structure contains variables relevant to TX/RX virtqueues.
>   */
> @@ -60,6 +72,8 @@ struct vhost_virtqueue
>       volatile uint16_t       last_used_idx_res;      /* Used for multiple devices reserving buffers. */
>       eventfd_t                       callfd;         /* Currently unused as polling mode is enabled. */
>       eventfd_t                       kickfd;         /* Used to notify the guest (trigger interrupt). */
> +     /* Used for scatter RX. */
> +     struct buf_vector       buf_vec[BUF_VECTOR_MAX];
>  } __rte_cache_aligned;
> 
>  /*
> --
> 1.8.4.2
