In the Rx bulk path, packets taken from the HW ring are first copied into the stage data structure and only later copied from the stage into the rx_pkts array. For the packets that the receiving function requests immediately, this two-step copy adds unnecessary overhead.
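For illustration only (not driver code): a minimal sketch of that two-step flow, assuming a cut-down demo_rxq that holds just the staging state. demo_scan_to_stage() and demo_fill_from_stage() are hypothetical stand-ins for the scan and fill steps; only the names rx_stage, rx_nb_avail, rx_next_avail and the rx_pkts/nb_pkts parameters mirror the driver.

#include <stdint.h>

#define DEMO_STAGE_SIZE 64

/* Hypothetical cut-down Rx queue: only the staging state is kept. */
struct demo_rxq {
	void *rx_stage[DEMO_STAGE_SIZE]; /* every scanned mbuf is copied here first */
	uint16_t rx_nb_avail;            /* number of packets currently staged */
	uint16_t rx_next_avail;          /* next staged packet to hand out */
};

/* Copy 1: the scan step places all ready mbufs into the stage
 * (assumes nb_ready <= DEMO_STAGE_SIZE). */
static void
demo_scan_to_stage(struct demo_rxq *rxq, void **ring, uint16_t nb_ready)
{
	uint16_t i;

	for (i = 0; i < nb_ready; i++)
		rxq->rx_stage[i] = ring[i];
	rxq->rx_nb_avail = nb_ready;
	rxq->rx_next_avail = 0;
}

/* Copy 2: the fill step copies the same pointers again, stage -> rx_pkts. */
static uint16_t
demo_fill_from_stage(struct demo_rxq *rxq, void **rx_pkts, uint16_t nb_pkts)
{
	uint16_t i, n;

	n = nb_pkts < rxq->rx_nb_avail ? nb_pkts : rxq->rx_nb_avail;
	for (i = 0; i < n; i++)
		rx_pkts[i] = rxq->rx_stage[rxq->rx_next_avail + i];
	rxq->rx_next_avail += n;
	rxq->rx_nb_avail -= n;
	return n;
}

Every mbuf pointer handed back to the caller is written twice, even though the first nb_pkts of them could have been written straight into rx_pkts during the scan.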
Instead, put the requested number of packets directly into the rx_pkts array and stage only the excess packets.

On N1SDP with 1 core/port, l3fwd saw up to a 4% performance improvement. On x86, no difference in performance was observed.

Signed-off-by: Kathleen Capella <kathleen.cape...@arm.com>
Suggested-by: Dharmik Thakkar <dharmik.thak...@arm.com>
---
 drivers/net/iavf/iavf_rxtx.c | 74 ++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 16e8d021f9..245dd225fd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -1813,7 +1813,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 #define IAVF_LOOK_AHEAD 8
 
 static inline int
-iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
+			      struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	volatile union iavf_rx_flex_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1822,6 +1824,7 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 	uint16_t pkt_len;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1867,8 +1870,6 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -1892,24 +1893,34 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 			pkt_flags = iavf_flex_rxd_error_to_pkt_flags(stat_err0);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else {
+				/* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
 }
 
 static inline int
-iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	volatile union iavf_rx_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1919,6 +1930,7 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 	uint32_t rx_status;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1970,8 +1982,6 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -2000,17 +2010,26 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 				pkt_flags |= iavf_rxd_build_fdir(&rxdp[j], mb);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else { /* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
@@ -2098,23 +2117,31 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
 
 	if (rxq->rxdid >= IAVF_RXDID_FLEX_NIC && rxq->rxdid <= IAVF_RXDID_LAST)
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq, rx_pkts, nb_pkts);
 	else
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq, rx_pkts, nb_pkts);
+
 	rxq->rx_next_avail = 0;
-	rxq->rx_nb_avail = nb_rx;
-	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx + rxq->rx_nb_avail);
 
 	if (rxq->rx_tail > rxq->rx_free_trigger) {
 		if (iavf_rx_alloc_bufs(rxq) != 0) {
-			uint16_t i, j;
+			uint16_t i, j, nb_staged;
 
 			/* TODO: count rx_mbuf_alloc_failed here */
 
+			nb_staged = rxq->rx_nb_avail;
 			rxq->rx_nb_avail = 0;
-			rxq->rx_tail = (uint16_t)(rxq->rx_tail - nb_rx);
-			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++)
+
+			rxq->rx_tail = (uint16_t)(rxq->rx_tail - (nb_rx + nb_staged));
+			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++) {
+				rxq->sw_ring[j] = rx_pkts[i];
+				rx_pkts[i] = NULL;
+			}
+			for (i = 0, j = rxq->rx_tail + nb_rx; i < nb_staged; i++, j++) {
 				rxq->sw_ring[j] = rxq->rx_stage[i];
+				rx_pkts[i] = NULL;
+			}
 
 			return 0;
 		}
@@ -2127,10 +2154,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		   rxq->port_id, rxq->queue_id,
 		   rxq->rx_tail, nb_rx);
 
-	if (rxq->rx_nb_avail)
-		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
-
-	return 0;
+	return nb_rx;
 }
 
 static uint16_t
-- 
2.31.1