Hi all,

Any comments on this patch? And what is the status of merging it into mainline?
Thanks in advance Changchun > -----Original Message----- > From: Ouyang, Changchun > Sent: Friday, August 15, 2014 12:58 PM > To: dev at dpdk.org > Cc: Cao, Waterman; Ouyang, Changchun > Subject: [PATCH] examples/vhost: Support jumbo frame in user space vhost > > This patch support mergeable RX feature and thus support jumbo frame RX > and TX in user space vhost(as virtio backend). > > On RX, it secures enough room from vring to accommodate one complete > scattered packet which is received by PMD from physical port, and then copy > data from mbuf to vring buffer, possibly across a few vring entries and > descriptors. > > On TX, it gets a jumbo frame, possibly described by a few vring descriptors > which are chained together with the flags of 'NEXT', and then copy them into > one scattered packet and TX it to physical port through PMD. > > Signed-off-by: Changchun Ouyang <changchun.ouyang at intel.com> > Acked-by: Huawei Xie <huawei.xie at intel.com> > --- > examples/vhost/main.c | 726 > ++++++++++++++++++++++++++++++++++++++++---- > examples/vhost/virtio-net.h | 14 + > 2 files changed, 687 insertions(+), 53 deletions(-) > > diff --git a/examples/vhost/main.c b/examples/vhost/main.c index > 193aa25..7d9e6a2 100644 > --- a/examples/vhost/main.c > +++ b/examples/vhost/main.c > @@ -106,6 +106,8 @@ > #define BURST_RX_WAIT_US 15 /* Defines how long we wait > between retries on RX */ > #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ > > +#define JUMBO_FRAME_MAX_SIZE 0x2600 > + > /* State of virtio device. */ > #define DEVICE_MAC_LEARNING 0 > #define DEVICE_RX 1 > @@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv) > us_vhost_usage(prgname); > return -1; > } else { > - if (ret) > + if (ret) { > + > vmdq_conf_default.rxmode.jumbo_frame = 1; > + > vmdq_conf_default.rxmode.max_rx_pkt_len > + = > JUMBO_FRAME_MAX_SIZE; > VHOST_FEATURES = (1ULL << > VIRTIO_NET_F_MRG_RXBUF); > + } > } > } > > @@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv) > return -1; > } > > + if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame > == 1)) { > + RTE_LOG(INFO, VHOST_PORT, > + "Vhost zero copy doesn't support jumbo frame," > + "please specify '--mergeable 0' to disable the " > + "mergeable feature.\n"); > + return -1; > + } > + > return 0; > } > > @@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa, > * This function adds buffers to the virtio devices RX virtqueue. Buffers can > * be received from the physical port or from another virtio device. A packet > * count is returned to indicate the number of packets that were succesfully > - * added to the RX queue. > + * added to the RX queue. This function works when mergeable is disabled. > */ > static inline uint32_t __attribute__((always_inline)) virtio_dev_rx(struct > virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) @@ -930,7 +944,6 > @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t > count) > uint64_t buff_hdr_addr = 0; > uint32_t head[MAX_PKT_BURST], packet_len = 0; > uint32_t head_idx, packet_success = 0; > - uint32_t mergeable, mrg_count = 0; > uint32_t retry = 0; > uint16_t avail_idx, res_cur_idx; > uint16_t res_base_idx, res_end_idx; > @@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf > **pkts, uint32_t count) > LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev- > >device_fh); > vq = dev->virtqueue[VIRTIO_RXQ]; > count = (count > MAX_PKT_BURST) ? 
MAX_PKT_BURST : count; > + > /* As many data cores may want access to available buffers, they > need to be reserved. */ > do { > res_base_idx = vq->last_used_idx_res; @@ -976,9 +990,6 > @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t > count) > /* Prefetch available ring to retrieve indexes. */ > rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); > > - /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ > - mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); > - > /* Retrieve all of the head indexes first to avoid caching issues. */ > for (head_idx = 0; head_idx < count; head_idx++) > head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & > (vq->size - 1)]; @@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net > *dev, struct rte_mbuf **pkts, uint32_t count) > /* Prefetch buffer address. */ > rte_prefetch0((void*)(uintptr_t)buff_addr); > > - if (mergeable && (mrg_count != 0)) { > - desc->len = packet_len = > rte_pktmbuf_data_len(buff); > - } else { > - /* Copy virtio_hdr to packet and increment buffer > address */ > - buff_hdr_addr = buff_addr; > - packet_len = rte_pktmbuf_data_len(buff) + vq- > >vhost_hlen; > + /* Copy virtio_hdr to packet and increment buffer address */ > + buff_hdr_addr = buff_addr; > + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; > > - /* > - * If the descriptors are chained the header and data > are placed in > - * separate buffers. > - */ > - if (desc->flags & VRING_DESC_F_NEXT) { > - desc->len = vq->vhost_hlen; > - desc = &vq->desc[desc->next]; > - /* Buffer address translation. */ > - buff_addr = gpa_to_vva(dev, desc->addr); > - desc->len = rte_pktmbuf_data_len(buff); > - } else { > - buff_addr += vq->vhost_hlen; > - desc->len = packet_len; > - } > + /* > + * If the descriptors are chained the header and data are > + * placed in separate buffers. > + */ > + if (desc->flags & VRING_DESC_F_NEXT) { > + desc->len = vq->vhost_hlen; > + desc = &vq->desc[desc->next]; > + /* Buffer address translation. */ > + buff_addr = gpa_to_vva(dev, desc->addr); > + desc->len = rte_pktmbuf_data_len(buff); > + } else { > + buff_addr += vq->vhost_hlen; > + desc->len = packet_len; > } > > - PRINT_PACKET(dev, (uintptr_t)buff_addr, > rte_pktmbuf_data_len(buff), 0); > - > /* Update used ring with desc information */ > vq->used->ring[res_cur_idx & (vq->size - 1)].id = > head[packet_success]; > vq->used->ring[res_cur_idx & (vq->size - 1)].len = > packet_len; > > /* Copy mbuf data to buffer */ > - rte_memcpy((void *)(uintptr_t)buff_addr, (const > void*)buff->pkt.data, rte_pktmbuf_data_len(buff)); > + rte_memcpy((void *)(uintptr_t)buff_addr, > + (const void *)buff->pkt.data, > + rte_pktmbuf_data_len(buff)); > + PRINT_PACKET(dev, (uintptr_t)buff_addr, > + rte_pktmbuf_data_len(buff), 0); > > res_cur_idx++; > packet_success++; > > - /* If mergeable is disabled then a header is required per > buffer. */ > - if (!mergeable) { > - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, > (const void*)&virtio_hdr, vq->vhost_hlen); > - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq- > >vhost_hlen, 1); > - } else { > - mrg_count++; > - /* Merge buffer can only handle so many buffers at a > time. Tell the guest if this limit is reached. 
*/ > - if ((mrg_count == MAX_MRG_PKT_BURST) || > (res_cur_idx == res_end_idx)) { > - virtio_hdr.num_buffers = mrg_count; > - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: > Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); > - rte_memcpy((void > *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen); > - PRINT_PACKET(dev, > (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); > - mrg_count = 0; > - } > - } > + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, > + (const void *)&virtio_hdr, vq->vhost_hlen); > + > + PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq- > >vhost_hlen, 1); > + > if (res_cur_idx < res_end_idx) { > /* Prefetch descriptor index. */ > rte_prefetch0(&vq->desc[head[packet_success]]); > @@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct > rte_mbuf **pkts, uint32_t count) > return count; > } > > +static inline uint32_t __attribute__((always_inline)) > +copy_from_mbuf_to_vring(struct virtio_net *dev, > + uint16_t res_base_idx, uint16_t res_end_idx, > + struct rte_mbuf *pkt) > +{ > + uint32_t vec_idx = 0; > + uint32_t entry_success = 0; > + struct vhost_virtqueue *vq; > + /* The virtio_hdr is initialised to 0. */ > + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { > + {0, 0, 0, 0, 0, 0}, 0}; > + uint16_t cur_idx = res_base_idx; > + uint64_t vb_addr = 0; > + uint64_t vb_hdr_addr = 0; > + uint32_t seg_offset = 0; > + uint32_t vb_offset = 0; > + uint32_t seg_avail; > + uint32_t vb_avail; > + uint32_t cpy_len, entry_len; > + > + if (pkt == NULL) > + return 0; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " > + "End Index %d\n", > + dev->device_fh, cur_idx, res_end_idx); > + > + /* > + * Convert from gpa to vva > + * (guest physical addr -> vhost virtual addr) > + */ > + vq = dev->virtqueue[VIRTIO_RXQ]; > + vb_addr = > + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); > + vb_hdr_addr = vb_addr; > + > + /* Prefetch buffer address. */ > + rte_prefetch0((void *)(uintptr_t)vb_addr); > + > + virtio_hdr.num_buffers = res_end_idx - res_base_idx; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge > buffers %d\n", > + dev->device_fh, virtio_hdr.num_buffers); > + > + rte_memcpy((void *)(uintptr_t)vb_hdr_addr, > + (const void *)&virtio_hdr, vq->vhost_hlen); > + > + PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); > + > + seg_avail = rte_pktmbuf_data_len(pkt); > + vb_offset = vq->vhost_hlen; > + vb_avail = > + vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; > + > + entry_len = vq->vhost_hlen; > + > + if (vb_avail == 0) { > + uint32_t desc_idx = > + vq->buf_vec[vec_idx].desc_idx; > + vq->desc[desc_idx].len = vq->vhost_hlen; > + > + if ((vq->desc[desc_idx].flags > + & VRING_DESC_F_NEXT) == 0) { > + /* Update used ring with desc information */ > + vq->used->ring[cur_idx & (vq->size - 1)].id > + = vq->buf_vec[vec_idx].desc_idx; > + vq->used->ring[cur_idx & (vq->size - 1)].len > + = entry_len; > + > + entry_len = 0; > + cur_idx++; > + entry_success++; > + } > + > + vec_idx++; > + vb_addr = > + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); > + > + /* Prefetch buffer address. 
*/ > + rte_prefetch0((void *)(uintptr_t)vb_addr); > + vb_offset = 0; > + vb_avail = vq->buf_vec[vec_idx].buf_len; > + } > + > + cpy_len = RTE_MIN(vb_avail, seg_avail); > + > + while (cpy_len > 0) { > + /* Copy mbuf data to vring buffer */ > + rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), > + (const void *)(rte_pktmbuf_mtod(pkt, char*) + > seg_offset), > + cpy_len); > + > + PRINT_PACKET(dev, > + (uintptr_t)(vb_addr + vb_offset), > + cpy_len, 0); > + > + seg_offset += cpy_len; > + vb_offset += cpy_len; > + seg_avail -= cpy_len; > + vb_avail -= cpy_len; > + entry_len += cpy_len; > + > + if (seg_avail != 0) { > + /* > + * The virtio buffer in this vring > + * entry reach to its end. > + * But the segment doesn't complete. > + */ > + if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & > + VRING_DESC_F_NEXT) == 0) { > + /* Update used ring with desc information */ > + vq->used->ring[cur_idx & (vq->size - 1)].id > + = vq->buf_vec[vec_idx].desc_idx; > + vq->used->ring[cur_idx & (vq->size - 1)].len > + = entry_len; > + entry_len = 0; > + cur_idx++; > + entry_success++; > + } > + > + vec_idx++; > + vb_addr = gpa_to_vva(dev, > + vq->buf_vec[vec_idx].buf_addr); > + vb_offset = 0; > + vb_avail = vq->buf_vec[vec_idx].buf_len; > + cpy_len = RTE_MIN(vb_avail, seg_avail); > + } else { > + /* > + * This current segment complete, need continue to > + * check if the whole packet complete or not. > + */ > + pkt = pkt->pkt.next; > + if (pkt != NULL) { > + /* > + * There are more segments. > + */ > + if (vb_avail == 0) { > + /* > + * This current buffer from vring is > + * used up, need fetch next buffer > + * from buf_vec. > + */ > + uint32_t desc_idx = > + vq- > >buf_vec[vec_idx].desc_idx; > + vq->desc[desc_idx].len = vb_offset; > + > + if ((vq->desc[desc_idx].flags & > + VRING_DESC_F_NEXT) == 0) { > + uint16_t wrapped_idx = > + cur_idx & (vq->size - > 1); > + /* > + * Update used ring with the > + * descriptor information > + */ > + vq->used- > >ring[wrapped_idx].id > + = desc_idx; > + vq->used- > >ring[wrapped_idx].len > + = entry_len; > + entry_success++; > + entry_len = 0; > + cur_idx++; > + } > + > + /* Get next buffer from buf_vec. */ > + vec_idx++; > + vb_addr = gpa_to_vva(dev, > + vq- > >buf_vec[vec_idx].buf_addr); > + vb_avail = > + vq- > >buf_vec[vec_idx].buf_len; > + vb_offset = 0; > + } > + > + seg_offset = 0; > + seg_avail = rte_pktmbuf_data_len(pkt); > + cpy_len = RTE_MIN(vb_avail, seg_avail); > + } else { > + /* > + * This whole packet completes. > + */ > + uint32_t desc_idx = > + vq->buf_vec[vec_idx].desc_idx; > + vq->desc[desc_idx].len = vb_offset; > + > + while (vq->desc[desc_idx].flags & > + VRING_DESC_F_NEXT) { > + desc_idx = vq->desc[desc_idx].next; > + vq->desc[desc_idx].len = 0; > + } > + > + /* Update used ring with desc information */ > + vq->used->ring[cur_idx & (vq->size - 1)].id > + = vq->buf_vec[vec_idx].desc_idx; > + vq->used->ring[cur_idx & (vq->size - 1)].len > + = entry_len; > + entry_len = 0; > + cur_idx++; > + entry_success++; > + seg_avail = 0; > + cpy_len = RTE_MIN(vb_avail, seg_avail); > + } > + } > + } > + > + return entry_success; > +} > + > +/* > + * This function adds buffers to the virtio devices RX virtqueue. > +Buffers can > + * be received from the physical port or from another virtio device. A > +packet > + * count is returned to indicate the number of packets that were > +succesfully > + * added to the RX queue. This function works for mergeable RX. 
> + */ > +static inline uint32_t __attribute__((always_inline)) > +virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts, > + uint32_t count) > +{ > + struct vhost_virtqueue *vq; > + uint32_t pkt_idx = 0, entry_success = 0; > + uint32_t retry = 0; > + uint16_t avail_idx, res_cur_idx; > + uint16_t res_base_idx, res_end_idx; > + uint8_t success = 0; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", > + dev->device_fh); > + vq = dev->virtqueue[VIRTIO_RXQ]; > + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); > + > + if (count == 0) > + return 0; > + > + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { > + uint32_t secure_len = 0; > + uint16_t need_cnt; > + uint32_t vec_idx = 0; > + uint32_t pkt_len = pkts[pkt_idx]->pkt.pkt_len + vq- > >vhost_hlen; > + uint16_t i, id; > + > + do { > + /* > + * As many data cores may want access to available > + * buffers, they need to be reserved. > + */ > + res_base_idx = vq->last_used_idx_res; > + res_cur_idx = res_base_idx; > + > + do { > + avail_idx = *((volatile uint16_t *)&vq->avail- > >idx); > + if (unlikely(res_cur_idx == avail_idx)) { > + /* > + * If retry is enabled and the queue is > + * full then we wait and retry to avoid > + * packet loss. > + */ > + if (enable_retry) { > + uint8_t cont = 0; > + for (retry = 0; retry < > burst_rx_retry_num; retry++) { > + > rte_delay_us(burst_rx_delay_time); > + avail_idx = > + *((volatile > uint16_t *)&vq->avail->idx); > + if > (likely(res_cur_idx != avail_idx)) { > + cont = 1; > + break; > + } > + } > + if (cont == 1) > + continue; > + } > + > + LOG_DEBUG(VHOST_DATA, > + "(%"PRIu64") Failed " > + "to get enough desc from " > + "vring\n", > + dev->device_fh); > + return pkt_idx; > + } else { > + uint16_t wrapped_idx = > + (res_cur_idx) & (vq->size - 1); > + uint32_t idx = > + vq->avail->ring[wrapped_idx]; > + uint8_t next_desc; > + > + do { > + next_desc = 0; > + secure_len += vq- > >desc[idx].len; > + if (vq->desc[idx].flags & > + > VRING_DESC_F_NEXT) { > + idx = vq- > >desc[idx].next; > + next_desc = 1; > + } > + } while (next_desc); > + > + res_cur_idx++; > + } > + } while (pkt_len > secure_len); > + > + /* vq->last_used_idx_res is atomically updated. */ > + success = rte_atomic16_cmpset(&vq- > >last_used_idx_res, > + res_base_idx, > + res_cur_idx); > + } while (success == 0); > + > + id = res_base_idx; > + need_cnt = res_cur_idx - res_base_idx; > + > + for (i = 0; i < need_cnt; i++, id++) { > + uint16_t wrapped_idx = id & (vq->size - 1); > + uint32_t idx = vq->avail->ring[wrapped_idx]; > + uint8_t next_desc; > + do { > + next_desc = 0; > + vq->buf_vec[vec_idx].buf_addr = > + vq->desc[idx].addr; > + vq->buf_vec[vec_idx].buf_len = > + vq->desc[idx].len; > + vq->buf_vec[vec_idx].desc_idx = idx; > + vec_idx++; > + > + if (vq->desc[idx].flags & > VRING_DESC_F_NEXT) { > + idx = vq->desc[idx].next; > + next_desc = 1; > + } > + } while (next_desc); > + } > + > + res_end_idx = res_cur_idx; > + > + entry_success = copy_from_mbuf_to_vring(dev, > res_base_idx, > + res_end_idx, pkts[pkt_idx]); > + > + rte_compiler_barrier(); > + > + /* > + * Wait until it's our turn to add our buffer > + * to the used ring. > + */ > + while (unlikely(vq->last_used_idx != res_base_idx)) > + rte_pause(); > + > + *(volatile uint16_t *)&vq->used->idx += entry_success; > + vq->last_used_idx = res_end_idx; > + > + /* Kick the guest if necessary. 
*/ > + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > + eventfd_write((int)vq->kickfd, 1); > + } > + > + return count; > +} > + > /* > * Compares a packet destination MAC address to a device MAC address. > */ > @@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct > rte_mbuf *m) > /*drop the packet if the device is marked for > removal*/ > LOG_DEBUG(VHOST_DATA, "(%"PRIu64") > Device is marked for removal\n", dev_ll->dev->device_fh); > } else { > + uint32_t mergeable = > + dev_ll->dev->features & > + (1 << VIRTIO_NET_F_MRG_RXBUF); > + > /*send the packet to the local virtio device*/ > - ret = virtio_dev_rx(dev_ll->dev, &m, 1); > + if (likely(mergeable == 0)) > + ret = virtio_dev_rx(dev_ll->dev, &m, > 1); > + else > + ret = virtio_dev_merge_rx(dev_ll- > >dev, > + &m, 1); > + > if (enable_stats) { > rte_atomic64_add( > &dev_statistics[dev_ll->dev- > >device_fh].rx_total_atomic, > @@ -1231,7 +1589,7 @@ virtio_tx_route(struct virtio_net* dev, struct > rte_mbuf *m, struct rte_mempool * > struct mbuf_table *tx_q; > struct vlan_ethhdr *vlan_hdr; > struct rte_mbuf **m_table; > - struct rte_mbuf *mbuf; > + struct rte_mbuf *mbuf, *prev; > unsigned len, ret, offset = 0; > const uint16_t lcore_id = rte_lcore_id(); > struct virtio_net_data_ll *dev_ll = ll_root_used; @@ -1284,12 > +1642,14 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, > struct rte_mempool * > /* Allocate an mbuf and populate the structure. */ > mbuf = rte_pktmbuf_alloc(mbuf_pool); > if (unlikely(mbuf == NULL)) { > - RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for > mbuf.\n"); > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > return; > } > > mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset; > - mbuf->pkt.pkt_len = mbuf->pkt.data_len; > + mbuf->pkt.pkt_len = m->pkt.pkt_len + VLAN_HLEN + offset; > + mbuf->pkt.nb_segs = m->pkt.nb_segs; > > /* Copy ethernet header to mbuf. */ > rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, > ETH_HLEN); @@ -1304,6 +1664,29 @@ virtio_tx_route(struct virtio_net* dev, > struct rte_mbuf *m, struct rte_mempool * > /* Copy the remaining packet contents to the mbuf. */ > rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN), > (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m- > >pkt.data_len - ETH_HLEN)); > + > + /* Copy the remaining segments for the whole packet. */ > + prev = mbuf; > + while (m->pkt.next) { > + /* Allocate an mbuf and populate the structure. */ > + struct rte_mbuf *next_mbuf = > rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(next_mbuf == NULL)) { > + rte_pktmbuf_free(mbuf); > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > + return; > + } > + > + m = m->pkt.next; > + prev->pkt.next = next_mbuf; > + prev = next_mbuf; > + next_mbuf->pkt.data_len = m->pkt.data_len; > + > + /* Copy data to next mbuf. */ > + rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *), > + rte_pktmbuf_mtod(m, const void *), m- > >pkt.data_len); > + } > + > tx_q->m_table[len] = mbuf; > len++; > if (enable_stats) { > @@ -1394,6 +1777,7 @@ virtio_dev_tx(struct virtio_net* dev, struct > rte_mempool *mbuf_pool) > > /* Setup dummy mbuf. This is copied to a real mbuf if > transmitted out the physical port. 
*/ > m.pkt.data_len = desc->len; > + m.pkt.pkt_len = desc->len; > m.pkt.data = (void*)(uintptr_t)buff_addr; > > PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); @@ > -1420,6 +1804,227 @@ virtio_dev_tx(struct virtio_net* dev, struct > rte_mempool *mbuf_pool) > eventfd_write((int)vq->kickfd, 1); > } > > +/* This function works for TX packets with mergeable feature enabled. > +*/ static inline void __attribute__((always_inline)) > +virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool > +*mbuf_pool) { > + struct rte_mbuf *m, *prev; > + struct vhost_virtqueue *vq; > + struct vring_desc *desc; > + uint64_t vb_addr = 0; > + uint32_t head[MAX_PKT_BURST]; > + uint32_t used_idx; > + uint32_t i; > + uint16_t free_entries, entry_success = 0; > + uint16_t avail_idx; > + uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf) > + + RTE_PKTMBUF_HEADROOM); > + > + vq = dev->virtqueue[VIRTIO_TXQ]; > + avail_idx = *((volatile uint16_t *)&vq->avail->idx); > + > + /* If there are no available buffers then return. */ > + if (vq->last_used_idx == avail_idx) > + return; > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n", > + dev->device_fh); > + > + /* Prefetch available ring to retrieve head indexes. */ > + rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); > + > + /*get the number of free entries in the ring*/ > + free_entries = (avail_idx - vq->last_used_idx); > + > + /* Limit to MAX_PKT_BURST. */ > + free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); > + > + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", > + dev->device_fh, free_entries); > + /* Retrieve all of the head indexes first to avoid caching issues. */ > + for (i = 0; i < free_entries; i++) > + head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - > 1)]; > + > + /* Prefetch descriptor index. */ > + rte_prefetch0(&vq->desc[head[entry_success]]); > + rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); > + > + while (entry_success < free_entries) { > + uint32_t vb_avail, vb_offset; > + uint32_t seg_avail, seg_offset; > + uint32_t cpy_len; > + uint32_t seg_num = 0; > + struct rte_mbuf *cur; > + uint8_t alloc_err = 0; > + > + desc = &vq->desc[head[entry_success]]; > + > + /* Discard first buffer as it is the virtio header */ > + desc = &vq->desc[desc->next]; > + > + /* Buffer address translation. */ > + vb_addr = gpa_to_vva(dev, desc->addr); > + /* Prefetch buffer address. */ > + rte_prefetch0((void *)(uintptr_t)vb_addr); > + > + used_idx = vq->last_used_idx & (vq->size - 1); > + > + if (entry_success < (free_entries - 1)) { > + /* Prefetch descriptor index. */ > + rte_prefetch0(&vq->desc[head[entry_success+1]]); > + rte_prefetch0(&vq->used->ring[(used_idx + 1) & > (vq->size - 1)]); > + } > + > + /* Update used index buffer information. */ > + vq->used->ring[used_idx].id = head[entry_success]; > + vq->used->ring[used_idx].len = 0; > + > + vb_offset = 0; > + vb_avail = desc->len; > + seg_offset = 0; > + seg_avail = buf_size; > + cpy_len = RTE_MIN(vb_avail, seg_avail); > + > + PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); > + > + /* Allocate an mbuf and populate the structure. 
*/ > + m = rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(m == NULL)) { > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > + return; > + } > + > + seg_num++; > + cur = m; > + prev = m; > + while (cpy_len != 0) { > + rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) > + seg_offset), > + (void *)((uintptr_t)(vb_addr + vb_offset)), > + cpy_len); > + > + seg_offset += cpy_len; > + vb_offset += cpy_len; > + vb_avail -= cpy_len; > + seg_avail -= cpy_len; > + > + if (vb_avail != 0) { > + /* > + * The segment reachs to its end, > + * while the virtio buffer in TX vring has > + * more data to be copied. > + */ > + cur->pkt.data_len = seg_offset; > + m->pkt.pkt_len += seg_offset; > + /* Allocate mbuf and populate the structure. > */ > + cur = rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(cur == NULL)) { > + RTE_LOG(ERR, VHOST_DATA, "Failed > to " > + "allocate memory for > mbuf.\n"); > + rte_pktmbuf_free(m); > + alloc_err = 1; > + break; > + } > + > + seg_num++; > + prev->pkt.next = cur; > + prev = cur; > + seg_offset = 0; > + seg_avail = buf_size; > + } else { > + if (desc->flags & VRING_DESC_F_NEXT) { > + /* > + * There are more virtio buffers in > + * same vring entry need to be copied. > + */ > + if (seg_avail == 0) { > + /* > + * The current segment hasn't > + * room to accomodate more > + * data. > + */ > + cur->pkt.data_len = > seg_offset; > + m->pkt.pkt_len += > seg_offset; > + /* > + * Allocate an mbuf and > + * populate the structure. > + */ > + cur = > rte_pktmbuf_alloc(mbuf_pool); > + if (unlikely(cur == NULL)) { > + RTE_LOG(ERR, > + VHOST_DATA, > + "Failed to " > + "allocate > memory " > + "for mbuf\n"); > + > rte_pktmbuf_free(m); > + alloc_err = 1; > + break; > + } > + seg_num++; > + prev->pkt.next = cur; > + prev = cur; > + seg_offset = 0; > + seg_avail = buf_size; > + } > + > + desc = &vq->desc[desc->next]; > + > + /* Buffer address translation. */ > + vb_addr = gpa_to_vva(dev, desc- > >addr); > + /* Prefetch buffer address. */ > + rte_prefetch0((void > *)(uintptr_t)vb_addr); > + vb_offset = 0; > + vb_avail = desc->len; > + > + PRINT_PACKET(dev, > (uintptr_t)vb_addr, > + desc->len, 0); > + } else { > + /* The whole packet completes. */ > + cur->pkt.data_len = seg_offset; > + m->pkt.pkt_len += seg_offset; > + vb_avail = 0; > + } > + } > + > + cpy_len = RTE_MIN(vb_avail, seg_avail); > + } > + > + if (unlikely(alloc_err == 1)) > + break; > + > + m->pkt.nb_segs = seg_num; > + > + /* > + * If this is the first received packet we need to learn > + * the MAC and setup VMDQ > + */ > + if (dev->ready == DEVICE_MAC_LEARNING) { > + if (dev->remove || (link_vmdq(dev, m) == -1)) { > + /* > + * Discard frame if device is scheduled for > + * removal or a duplicate MAC address is > found. > + */ > + entry_success = free_entries; > + vq->last_used_idx += entry_success; > + rte_pktmbuf_free(m); > + break; > + } > + } > + > + virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev- > >device_fh); > + vq->last_used_idx++; > + entry_success++; > + rte_pktmbuf_free(m); > + } > + > + rte_compiler_barrier(); > + vq->used->idx += entry_success; > + /* Kick guest if required. */ > + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > + eventfd_write((int)vq->kickfd, 1); > + > +} > + > /* > * This function is called by each data core. It handles all RX/TX registered > with the > * core. For TX the specific lcore linked list is used. 
For RX, MAC > addresses are > compared @@ -1440,8 +2045,9 @@ switch_worker(__attribute__((unused)) > void *arg) > const uint16_t lcore_id = rte_lcore_id(); > const uint16_t num_cores = (uint16_t)rte_lcore_count(); > uint16_t rx_count = 0; > + uint32_t mergeable = 0; > > - RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", > lcore_id); > + RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", > lcore_id); > lcore_ll = lcore_info[lcore_id].lcore_ll; > prev_tsc = 0; > > @@ -1497,6 +2103,8 @@ switch_worker(__attribute__((unused)) void *arg) > while (dev_ll != NULL) { > /*get virtio device ID*/ > dev = dev_ll->dev; > + mergeable = > + dev->features & (1 << > VIRTIO_NET_F_MRG_RXBUF); > > if (dev->remove) { > dev_ll = dev_ll->next; > @@ -1510,7 +2118,15 @@ switch_worker(__attribute__((unused)) void *arg) > (uint16_t)dev->vmdq_rx_q, > pkts_burst, MAX_PKT_BURST); > > if (rx_count) { > - ret_count = virtio_dev_rx(dev, > pkts_burst, rx_count); > + if (likely(mergeable == 0)) > + ret_count = > + virtio_dev_rx(dev, > + pkts_burst, rx_count); > + else > + ret_count = > + > virtio_dev_merge_rx(dev, > + pkts_burst, rx_count); > + > if (enable_stats) { > rte_atomic64_add( > &dev_statistics[dev_ll->dev- > >device_fh].rx_total_atomic, > @@ -1520,15 +2136,19 @@ switch_worker(__attribute__((unused)) void > *arg) > } > while (likely(rx_count)) { > rx_count--; > - > rte_pktmbuf_free_seg(pkts_burst[rx_count]); > + > rte_pktmbuf_free(pkts_burst[rx_count]); > } > > } > } > > - if (!dev->remove) > + if (!dev->remove) { > /*Handle guest TX*/ > - virtio_dev_tx(dev, mbuf_pool); > + if (likely(mergeable == 0)) > + virtio_dev_tx(dev, mbuf_pool); > + else > + virtio_dev_merge_tx(dev, > mbuf_pool); > + } > > /*move to the next device in the list*/ > dev_ll = dev_ll->next; > diff --git a/examples/vhost/virtio-net.h b/examples/vhost/virtio-net.h index > 3d1f255..1a2f0dc 100644 > --- a/examples/vhost/virtio-net.h > +++ b/examples/vhost/virtio-net.h > @@ -45,6 +45,18 @@ > /* Enum for virtqueue management. */ > enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; > > +#define BUF_VECTOR_MAX 256 > + > +/* > + * Structure contains buffer address, length and descriptor index > + * from vring to do scatter RX. > +*/ > +struct buf_vector { > +uint64_t buf_addr; > +uint32_t buf_len; > +uint32_t desc_idx; > +}; > + > /* > * Structure contains variables relevant to TX/RX virtqueues. > */ > @@ -60,6 +72,8 @@ struct vhost_virtqueue > volatile uint16_t last_used_idx_res; /* Used for multiple > devices reserving buffers. */ > eventfd_t callfd; /* > Currently unused as polling mode is enabled. */ > eventfd_t kickfd; /* > Used to notify the guest (trigger interrupt). */ > + /* Used for scatter RX. */ > + struct buf_vector buf_vec[BUF_VECTOR_MAX]; > } __rte_cache_aligned; > > /* > -- > 1.8.4.2
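
For anyone reviewing the RX path: before any copy is done, virtio_dev_merge_rx keeps claiming avail-ring entries until the descriptor chains behind them can hold the whole frame plus the virtio header (the secure_len loop above). Below is a minimal, self-contained sketch of that check, assuming a simplified ring; the struct and function names are illustrative stand-ins, not the code from main.c, and the "+ 12" stands for the mergeable header length the patch accounts for via vq->vhost_hlen.

/*
 * Sketch of the reservation step in the mergeable RX path: keep taking
 * avail-ring entries (each possibly a chain of descriptors) until their
 * combined length can hold the whole packet.  Types/names are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define DESC_F_NEXT 1u

struct desc {                        /* stand-in for struct vring_desc */
	uint32_t len;
	uint16_t next;
	uint16_t flags;
};

/* Count how many avail-ring entries (starting at cur) are needed so that the
 * chained descriptors behind them hold at least pkt_len bytes.
 * Returns the number of entries used, or 0 if avail_idx is reached first. */
static uint16_t
entries_needed(const struct desc *desc_table, const uint16_t *avail_ring,
	       uint16_t ring_size, uint16_t cur, uint16_t avail_idx,
	       uint32_t pkt_len)
{
	uint32_t secured = 0;
	uint16_t used = 0;

	while (secured < pkt_len) {
		if (cur == avail_idx)
			return 0;            /* ring exhausted: caller retries */

		uint16_t idx = avail_ring[cur & (ring_size - 1)];
		for (;;) {                   /* walk one descriptor chain      */
			secured += desc_table[idx].len;
			if (!(desc_table[idx].flags & DESC_F_NEXT))
				break;
			idx = desc_table[idx].next;
		}
		cur++;
		used++;
	}
	return used;
}

int main(void)
{
	/* Three single-descriptor entries of 2048 bytes each, ring size 4. */
	const struct desc table[3] = { {2048, 0, 0}, {2048, 0, 0}, {2048, 0, 0} };
	const uint16_t avail[4] = { 0, 1, 2, 0 };

	printf("entries needed for a 5000-byte frame: %u\n",
	       entries_needed(table, avail, 4, 0, 3, 5000 + 12));
	return 0;
}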
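
Once enough entries are reserved, copy_from_mbuf_to_vring walks the mbuf segments and the gathered guest buffers in parallel, crossing buffer boundaries as needed, and finally records how many guest buffers the packet consumed. A simplified stand-alone sketch of that copy loop, with illustrative types in place of rte_mbuf and buf_vec (not the patch code itself), could look like this:

/*
 * Sketch of the mergeable-RX copy: one scattered packet is copied into as
 * many guest buffers as it needs, and the number of buffers used is written
 * into the header space reserved at the front of the first buffer.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct pkt_seg {                     /* stand-in for one mbuf segment      */
	const uint8_t *data;
	uint32_t len;
	const struct pkt_seg *next;
};

struct guest_buf {                   /* stand-in for one buf_vec entry     */
	uint8_t *addr;               /* guest buffer, already host-mapped  */
	uint32_t len;
};

/* Copy a segment chain into guest buffers, skipping hdr_len bytes of header
 * room in the first one.  Returns the number of buffers used, 0 on overflow. */
static uint16_t
copy_pkt_to_guest(const struct pkt_seg *seg, struct guest_buf *bufs,
		  uint32_t nbufs, uint32_t hdr_len)
{
	uint32_t b = 0, b_off = hdr_len;
	uint16_t used = 1;

	for (; seg != NULL; seg = seg->next) {
		uint32_t s_off = 0;

		while (s_off < seg->len) {
			if (b_off == bufs[b].len) {      /* buffer full      */
				if (++b == nbufs)
					return 0;        /* out of room      */
				b_off = 0;
				used++;
			}
			uint32_t room = bufs[b].len - b_off;
			uint32_t left = seg->len - s_off;
			uint32_t cpy = room < left ? room : left;

			memcpy(bufs[b].addr + b_off, seg->data + s_off, cpy);
			b_off += cpy;
			s_off += cpy;
		}
	}
	/* num_buffers is the trailing 16-bit field of the mergeable header;
	 * the real code fills a struct virtio_net_hdr_mrg_rxbuf instead. */
	memcpy(bufs[0].addr + hdr_len - sizeof(used), &used, sizeof(used));
	return used;
}

int main(void)
{
	uint8_t g0[64], g1[64];
	struct guest_buf bufs[2] = { { g0, sizeof(g0) }, { g1, sizeof(g1) } };
	const uint8_t payload[96] = { 0 };
	const struct pkt_seg seg = { payload, sizeof(payload), NULL };

	printf("guest buffers used: %u\n",
	       copy_pkt_to_guest(&seg, bufs, 2, 12 /* mergeable hdr len */));
	return 0;
}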
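
In the other direction, virtio_dev_merge_tx drains a chain of vring buffers linked with VRING_DESC_F_NEXT into a chain of fixed-size host segments before handing the packet to virtio_tx_route. The sketch below models that gather with plain calloc'd segments; SEG_SIZE, tx_desc and host_seg are hypothetical stand-ins for the mbuf data room and the vhost structures, so this is only an illustration of the technique, not the patch code.

/*
 * Sketch of the TX-side gather: a jumbo frame arrives as several chained
 * vring buffers and is copied into a chain of fixed-size host segments.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DESC_F_NEXT  1u              /* "more buffers follow", as in vring */
#define SEG_SIZE     2048u           /* stand-in for the mbuf data room    */

struct tx_desc {                     /* stand-in for struct vring_desc     */
	const uint8_t *addr;         /* already translated to a host ptr   */
	uint32_t len;
	uint16_t flags;
};

struct host_seg {                    /* stand-in for an mbuf segment       */
	uint8_t data[SEG_SIZE];
	uint32_t len;
	struct host_seg *next;
};

/* Gather one chained descriptor list into host segments; returns the head
 * segment and writes the segment count to *nb_segs (NULL on alloc failure). */
static struct host_seg *
gather_tx_chain(const struct tx_desc *desc, uint32_t *nb_segs)
{
	struct host_seg *head = calloc(1, sizeof(*head));
	struct host_seg *cur = head;
	uint32_t d_off = 0;

	*nb_segs = 1;
	if (head == NULL)
		return NULL;

	for (;;) {
		uint32_t room = SEG_SIZE - cur->len;
		uint32_t left = desc->len - d_off;
		uint32_t cpy = room < left ? room : left;

		memcpy(cur->data + cur->len, desc->addr + d_off, cpy);
		cur->len += cpy;
		d_off += cpy;

		if (d_off == desc->len) {            /* this buffer drained  */
			if (!(desc->flags & DESC_F_NEXT))
				break;               /* whole packet gathered */
			desc++;                      /* next chained buffer  */
			d_off = 0;
		}
		if (cur->len == SEG_SIZE) {          /* host segment is full */
			cur->next = calloc(1, sizeof(*cur));
			if (cur->next == NULL)
				break;               /* sketch: give up      */
			cur = cur->next;
			(*nb_segs)++;
		}
	}
	return head;
}

int main(void)
{
	uint8_t a[3000] = { 0 }, b[1500] = { 0 };
	const struct tx_desc chain[2] = {
		{ a, sizeof(a), DESC_F_NEXT },
		{ b, sizeof(b), 0 },
	};
	uint32_t nb_segs = 0;
	struct host_seg *pkt = gather_tx_chain(chain, &nb_segs);

	printf("gathered %u bytes into %u segments\n",
	       pkt != NULL ? 3000u + 1500u : 0u, nb_segs);
	/* freeing the segment chain is omitted for brevity */
	return 0;
}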