For memif non-zero-copy mode, there is a branch in the Rx memory-copy loop that compares the mbuf and memif buffer sizes for every chunk copied. Add a fast memory copy path that removes this branch, with the mbuf and memif buffer sizes defined at compile time. The removal of the branch leads to a considerable performance uplift.

When the memif buffer size is less than or equal to the mbuf buffer size, Rx chooses the fast memcpy path; otherwise it takes the original path. The check is made once per received burst, before the copy loop, rather than once per copied chunk; the idea is sketched below.
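For illustration only, here is a minimal, self-contained sketch of that shape. The types and helpers (struct desc, struct buf, rx_fast(), rx_segmented(), MBUF_SIZE) are hypothetical stand-ins for the driver's memif descriptor and rte_mbuf machinery shown in the diff below, not the driver code itself:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for a memif descriptor and an mbuf. */
struct desc { const uint8_t *buf; uint16_t len; };
struct buf  { uint8_t data[2048]; uint16_t len; };

#define MBUF_SIZE 2048	/* per-mbuf data room, fixed at compile time */

/* Fast path: every descriptor is known to fit in one buffer, so each
 * descriptor is copied with a single branch-free memcpy(). */
static void rx_fast(const struct desc *d, struct buf *b)
{
	memcpy(b->data, d->buf, d->len);
	b->len = d->len;
}

/* Original path: copy chunk by chunk, re-checking the remaining room
 * in the destination on every iteration (the branch being removed). */
static void rx_segmented(const struct desc *d, struct buf *b)
{
	uint16_t src_off = 0, dst_off = 0, src_len = d->len;

	while (src_len) {
		uint16_t dst_len = MBUF_SIZE - dst_off;
		uint16_t cp_len = src_len < dst_len ? src_len : dst_len;

		memcpy(b->data + dst_off, d->buf + src_off, cp_len);
		src_off += cp_len;
		dst_off += cp_len;
		src_len -= cp_len;
		if (dst_off == MBUF_SIZE)	/* real driver: chain a new mbuf */
			dst_off = 0;
	}
	b->len = d->len;
}

int main(void)
{
	static const uint8_t payload[64] = "memif fast path";
	struct desc d = { payload, sizeof(payload) };
	struct buf b;
	uint16_t pkt_buffer_size = 2048;	/* memif run-time setting */

	/* One comparison per burst, mirroring the patch's
	 * if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) split. */
	if (MBUF_SIZE >= pkt_buffer_size)
		rx_fast(&d, &b);
	else
		rx_segmented(&d, &b);

	printf("copied %u bytes\n", (unsigned)b.len);
	return 0;
}

The uplift comes from rx_fast() issuing one unconditional copy per descriptor, while rx_segmented() re-evaluates the remaining-destination-space branch for every chunk.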
Test with 1p1q (1 port, 1 queue) on an Ampere Altra AArch64 server:
---------------------------------------------
 buf size    | memif <= mbuf | memif > mbuf |
---------------------------------------------
 non-zc gain |     4.30%     |    -0.52%    |
---------------------------------------------
 zc gain     |     2.46%     |     0.70%    |
---------------------------------------------

Test with 1p1q on a Cascade Lake Xeon x86 server:
---------------------------------------------
 buf size    | memif <= mbuf | memif > mbuf |
---------------------------------------------
 non-zc gain |     2.13%     |    -1.40%    |
---------------------------------------------
 zc gain     |     0.18%     |     0.48%    |
---------------------------------------------

Signed-off-by: Joyce Kong <joyce.k...@arm.com>
---
 drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 587ad45576..f55776ca46 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		goto refill;
 	n_slots = last_slot - cur_slot;
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot1:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			cp_len = d0->length;
 
-		src_len = d0->length;
-		dst_off = 0;
-		src_off = 0;
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
+
+			cur_slot++;
+			n_slots--;
 
-				/* store pointer to tail */
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
 				if (unlikely(mbuf == NULL))
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-		rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-			dst_off),
-			(uint8_t *)memif_get_buffer(proc_private, d0) +
-			src_off, cp_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-		src_off += cp_len;
-		dst_off += cp_len;
-		src_len -= cp_len;
-	} while (src_len);
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		cur_slot++;
-		n_slots--;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+					dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +739,6 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
-- 
2.25.1