In the Rx bulk path, packets taken from the HW ring are first copied into the stage data structure and only later copied from the stage into the rx_pkts array. For the packets that the receiving function requests immediately, this two-step copy adds unnecessary overhead.
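For illustration only (not driver code): a minimal sketch of that two-step flow, assuming a cut-down demo_rxq that holds just the staging state. demo_scan_to_stage() and demo_fill_from_stage() are hypothetical stand-ins for the scan and fill steps; only the names rx_stage, rx_nb_avail, rx_next_avail and the rx_pkts/nb_pkts parameters mirror the driver.

#include <stdint.h>

#define DEMO_STAGE_SIZE 64

/* Hypothetical cut-down Rx queue: only the staging state is kept. */
struct demo_rxq {
	void *rx_stage[DEMO_STAGE_SIZE]; /* every scanned mbuf is copied here first */
	uint16_t rx_nb_avail;            /* number of packets currently staged */
	uint16_t rx_next_avail;          /* next staged packet to hand out */
};

/* Copy 1: the scan step places all ready mbufs into the stage
 * (assumes nb_ready <= DEMO_STAGE_SIZE). */
static void
demo_scan_to_stage(struct demo_rxq *rxq, void **ring, uint16_t nb_ready)
{
	uint16_t i;

	for (i = 0; i < nb_ready; i++)
		rxq->rx_stage[i] = ring[i];
	rxq->rx_nb_avail = nb_ready;
	rxq->rx_next_avail = 0;
}

/* Copy 2: the fill step copies the same pointers again, stage -> rx_pkts. */
static uint16_t
demo_fill_from_stage(struct demo_rxq *rxq, void **rx_pkts, uint16_t nb_pkts)
{
	uint16_t i, n;

	n = nb_pkts < rxq->rx_nb_avail ? nb_pkts : rxq->rx_nb_avail;
	for (i = 0; i < n; i++)
		rx_pkts[i] = rxq->rx_stage[rxq->rx_next_avail + i];
	rxq->rx_next_avail += n;
	rxq->rx_nb_avail -= n;
	return n;
}

Every mbuf pointer handed back to the caller is written twice, even though the first nb_pkts of them could have been written straight into rx_pkts during the scan.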
Instead, put the requested number of packets directly into the rx_pkts array and stage only the excess packets.

On N1SDP with 1 core/port, l3fwd saw up to a 4% performance improvement. On x86, no difference in performance was observed.

Signed-off-by: Kathleen Capella <kathleen.cape...@arm.com>
Suggested-by: Dharmik Thakkar <dharmik.thak...@arm.com>
---
 drivers/net/iavf/iavf_rxtx.c | 74 ++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 16e8d021f9..245dd225fd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -1813,7 +1813,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 #define IAVF_LOOK_AHEAD 8
 
 static inline int
-iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
+			      struct rte_mbuf **rx_pkts,
+			      uint16_t nb_pkts)
 {
 	volatile union iavf_rx_flex_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1822,6 +1824,7 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 	uint16_t pkt_len;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1867,8 +1870,6 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -1892,24 +1893,34 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 			pkt_flags = iavf_flex_rxd_error_to_pkt_flags(stat_err0);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else {
+				/* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
 }
 
 static inline int
-iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	volatile union iavf_rx_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1919,6 +1930,7 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 	uint32_t rx_status;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1970,8 +1982,6 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -2000,17 +2010,26 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 				pkt_flags |= iavf_rxd_build_fdir(&rxdp[j], mb);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else { /* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
@@ -2098,23 +2117,31 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
 
 	if (rxq->rxdid >= IAVF_RXDID_FLEX_NIC && rxq->rxdid <= IAVF_RXDID_LAST)
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq, rx_pkts, nb_pkts);
 	else
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq, rx_pkts, nb_pkts);
+
 	rxq->rx_next_avail = 0;
-	rxq->rx_nb_avail = nb_rx;
-	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx + rxq->rx_nb_avail);
 
 	if (rxq->rx_tail > rxq->rx_free_trigger) {
 		if (iavf_rx_alloc_bufs(rxq) != 0) {
-			uint16_t i, j;
+			uint16_t i, j, nb_staged;
 
 			/* TODO: count rx_mbuf_alloc_failed here */
 
+			nb_staged = rxq->rx_nb_avail;
 			rxq->rx_nb_avail = 0;
-			rxq->rx_tail = (uint16_t)(rxq->rx_tail - nb_rx);
-			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++)
+
+			rxq->rx_tail = (uint16_t)(rxq->rx_tail - (nb_rx + nb_staged));
+			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++) {
+				rxq->sw_ring[j] = rx_pkts[i];
+				rx_pkts[i] = NULL;
+			}
+			for (i = 0, j = rxq->rx_tail + nb_rx; i < nb_staged; i++, j++) {
 				rxq->sw_ring[j] = rxq->rx_stage[i];
+				rx_pkts[i] = NULL;
+			}
 
 			return 0;
 		}
@@ -2127,10 +2154,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		   rxq->port_id, rxq->queue_id,
 		   rxq->rx_tail, nb_rx);
 
-	if (rxq->rx_nb_avail)
-		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
-
-	return 0;
+	return nb_rx;
 }
 
 static uint16_t
-- 
2.31.1