In the Rx bulk path, packets taken from the HW ring are first copied to
the stage data structure and only later copied from the stage to the
rx_pkts array. For the number of packets the receive function requests
up front, this two-step process adds unnecessary overhead.

Instead, put the requested number of packets directly into the rx_pkts
array and stage only the excess. On N1SDP with 1 core/port, l3fwd saw up
to a 4% performance improvement. On x86, no performance difference was
observed.

Signed-off-by: Kathleen Capella <kathleen.cape...@arm.com>
Suggested-by: Dharmik Thakkar <dharmik.thak...@arm.com>
---
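Notes: a minimal standalone sketch of the scan behavior described above.
The names below (STAGE_SZ, struct mbuf, struct rxq_sketch, scan_sketch)
are illustrative stand-ins, not the driver's own types: packets fill the
caller's array directly until the request is met, and only the excess is
staged for later bursts.

#include <stdint.h>

#define STAGE_SZ 32                     /* stand-in for IAVF_RX_MAX_BURST */

struct mbuf;                            /* stand-in for struct rte_mbuf */

struct rxq_sketch {
        struct mbuf *rx_stage[STAGE_SZ];        /* excess packets wait here */
        uint16_t rx_nb_avail;                   /* count of staged packets */
};

/* Fill rx_pkts directly; stage only what exceeds the caller's request. */
static uint16_t
scan_sketch(struct rxq_sketch *rxq, struct mbuf **ring, uint16_t nb_ready,
            struct mbuf **rx_pkts, uint16_t nb_pkts)
{
        uint16_t i, nb_rx = 0, nb_staged = 0;

        for (i = 0; i < nb_ready; i++) {
                if (i < nb_pkts)
                        rx_pkts[nb_rx++] = ring[i];     /* no trip through the stage */
                else
                        rxq->rx_stage[nb_staged++] = ring[i];
        }
        rxq->rx_nb_avail = nb_staged;   /* later bursts drain the stage first */
        return nb_rx;
}
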
 drivers/net/iavf/iavf_rxtx.c | 72 ++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 25 deletions(-)
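
When buffer re-allocation fails, every mbuf already handed out must go
back to the software ring, and rx_tail must be rewound past both the
delivered and the staged counts. A sketch of that rollback, reusing the
stand-ins above (rollback_sketch is hypothetical):

static void
rollback_sketch(struct mbuf **sw_ring, uint16_t *rx_tail,
                struct mbuf **rx_pkts, uint16_t nb_rx,
                struct mbuf **rx_stage, uint16_t nb_staged)
{
        uint16_t i, j;

        *rx_tail = (uint16_t)(*rx_tail - (nb_rx + nb_staged));
        for (i = 0, j = *rx_tail; i < nb_rx; i++, j++) {
                sw_ring[j] = rx_pkts[i];
                rx_pkts[i] = NULL;      /* no stale pointers for the caller */
        }
        /* staged mbufs never reached rx_pkts, so there is nothing to clear */
        for (i = 0, j = *rx_tail + nb_rx; i < nb_staged; i++, j++)
                sw_ring[j] = rx_stage[i];
}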

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 16e8d021f9..245dd225fd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -1813,7 +1813,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 #define IAVF_LOOK_AHEAD 8
 static inline int
-iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
+                           struct rte_mbuf **rx_pkts,
+                           uint16_t nb_pkts)
 {
        volatile union iavf_rx_flex_desc *rxdp;
        struct rte_mbuf **rxep;
@@ -1822,6 +1824,7 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
        uint16_t pkt_len;
        int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
        int32_t i, j, nb_rx = 0;
+       int32_t nb_staged = 0;
        uint64_t pkt_flags;
        const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1867,8 +1870,6 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 #endif
                }
 
-               nb_rx += nb_dd;
-
                /* Translate descriptor info to mbuf parameters */
                for (j = 0; j < nb_dd; j++) {
                        IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -1892,24 +1893,34 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
                        pkt_flags = iavf_flex_rxd_error_to_pkt_flags(stat_err0);
 
                        mb->ol_flags |= pkt_flags;
-               }
 
-               for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-                       rxq->rx_stage[i + j] = rxep[j];
+                       /* Put up to nb_pkts directly into buffers */
+                       if ((i + j) < nb_pkts) {
+                               rx_pkts[i + j] = rxep[j];
+                               nb_rx++;
+                       } else {
+                               /* Stage excess pkts received */
+                               rxq->rx_stage[nb_staged] = rxep[j];
+                               nb_staged++;
+                       }
+               }
 
                if (nb_dd != IAVF_LOOK_AHEAD)
                        break;
        }
 
+       /* Update rxq->rx_nb_avail to reflect number of staged pkts */
+       rxq->rx_nb_avail = nb_staged;
+
        /* Clear software ring entries */
-       for (i = 0; i < nb_rx; i++)
+       for (i = 0; i < (nb_rx + nb_staged); i++)
                rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
        return nb_rx;
 }
 
 static inline int
-iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
        volatile union iavf_rx_desc *rxdp;
        struct rte_mbuf **rxep;
@@ -1919,6 +1930,7 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
        uint32_t rx_status;
        int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
        int32_t i, j, nb_rx = 0;
+       int32_t nb_staged = 0;
        uint64_t pkt_flags;
        const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1970,8 +1982,6 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 #endif
                }
 
-               nb_rx += nb_dd;
-
                /* Translate descriptor info to mbuf parameters */
                for (j = 0; j < nb_dd; j++) {
                        IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -2000,17 +2010,26 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
                                pkt_flags |= iavf_rxd_build_fdir(&rxdp[j], mb);
 
                        mb->ol_flags |= pkt_flags;
-               }
 
-               for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-                       rxq->rx_stage[i + j] = rxep[j];
+                       /* Put up to nb_pkts directly into buffers */
+                       if ((i + j) < nb_pkts) {
+                               rx_pkts[i + j] = rxep[j];
+                               nb_rx++;
+                       } else { /* Stage excess pkts received */
+                               rxq->rx_stage[nb_staged] = rxep[j];
+                               nb_staged++;
+                       }
+               }
 
                if (nb_dd != IAVF_LOOK_AHEAD)
                        break;
        }
 
+       /* Update rxq->rx_nb_avail to reflect number of staged pkts */
+       rxq->rx_nb_avail = nb_staged;
+
        /* Clear software ring entries */
-       for (i = 0; i < nb_rx; i++)
+       for (i = 0; i < (nb_rx + nb_staged); i++)
                rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
        return nb_rx;
@@ -2098,23 +2117,29 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
                return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
 
        if (rxq->rxdid >= IAVF_RXDID_FLEX_NIC && rxq->rxdid <= IAVF_RXDID_LAST)
-               nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq);
+               nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq, rx_pkts, nb_pkts);
        else
-               nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq);
+               nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq, rx_pkts, nb_pkts);
+
        rxq->rx_next_avail = 0;
-       rxq->rx_nb_avail = nb_rx;
-       rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx);
+       rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx + rxq->rx_nb_avail);
 
        if (rxq->rx_tail > rxq->rx_free_trigger) {
                if (iavf_rx_alloc_bufs(rxq) != 0) {
-                       uint16_t i, j;
+                       uint16_t i, j, nb_staged;
 
                        /* TODO: count rx_mbuf_alloc_failed here */
 
+                       nb_staged = rxq->rx_nb_avail;
                        rxq->rx_nb_avail = 0;
-                       rxq->rx_tail = (uint16_t)(rxq->rx_tail - nb_rx);
-                       for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++)
+
+                       rxq->rx_tail = (uint16_t)(rxq->rx_tail - (nb_rx + nb_staged));
+                       for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++) {
+                               rxq->sw_ring[j] = rx_pkts[i];
+                               rx_pkts[i] = NULL;
+                       }
+                       for (i = 0, j = rxq->rx_tail + nb_rx; i < nb_staged; i++, j++)
                                rxq->sw_ring[j] = rxq->rx_stage[i];
 
                        return 0;
                }
@@ -2127,10 +2152,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
                   rxq->port_id, rxq->queue_id,
                   rxq->rx_tail, nb_rx);
 
-       if (rxq->rx_nb_avail)
-               return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
-
-       return 0;
+       return nb_rx;
 }
 
 static uint16_t
-- 
2.31.1
