Added a check that the packet allocation count (rx_free_thresh) is at
least 2, which eliminates the extra overhead of supporting prefetch for
the case of only one packet being allocated into the queue at a time.

Used standard variable types in places to reduce the overhead of
non-standard variable sizes.

Added a second-level prefetch to get the packet address into cache 0
(L1) earlier, and eliminated the calculation inside the loop that
determined the end of the prefetch loop.
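
For reference, a standalone sketch of the two-deep prefetch refill
pattern (simplified stand-in types, a hypothetical refill() helper, and
__builtin_prefetch in place of rte_prefetch0; the real code is in
i40e_rx_alloc_bufs() in the diff below):

#include <stdint.h>

struct pkt   { uint64_t dma_addr; };
struct entry { struct pkt *mbuf; };
struct desc  { uint64_t pkt_addr; };

static void
refill(struct entry *rxep, struct desc *rxdp, unsigned n)
{
        struct pkt *pk, *npk;
        unsigned i, l;

        /* Prime the pipeline: prefetch the first two packets up front;
         * this is why the bulk-alloc path now requires n >= 2. */
        pk = rxep->mbuf;
        __builtin_prefetch(pk);
        rxep++;
        npk = rxep->mbuf;
        __builtin_prefetch(npk);
        rxep++;
        l = n - 2;      /* loop end computed once, outside the loop */

        for (i = 0; i < n; i++) {
                struct pkt *mb = pk;    /* prefetched two iterations ago */
                pk = npk;
                if (i < l) {            /* keep the prefetch two ahead */
                        npk = rxep->mbuf;
                        __builtin_prefetch(npk);
                        rxep++;
                }
                rxdp->pkt_addr = mb->dma_addr;  /* touch the prefetched mbuf */
                rxdp++;
        }
}

With n == 2 the loop end l is 0, so no further prefetches are issued;
anything below 2 would leave the pipeline unprimed, hence the new
precondition above.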

Used old-time C optimization methods: pointers instead of array
indexing, and reduced scope for some variables to improve the chances
of them being kept in registers instead of on the stack.
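
An illustrative before/after sketch of those two rewrites, using a
hypothetical descriptor-fill loop rather than the driver code:

#include <stdint.h>

struct desc { uint64_t pkt_addr; };

/* Before: index arithmetic on every iteration, dma_addr live for the
 * whole function. */
static void
fill_indexed(struct desc *ring, const uint64_t *addr, unsigned n)
{
        uint64_t dma_addr;
        unsigned i;

        for (i = 0; i < n; i++) {
                dma_addr = addr[i];
                ring[i].pkt_addr = dma_addr;
        }
}

/* After: walking pointers, dma_addr narrowed to the loop body so it is
 * more likely to live in a register. */
static void
fill_pointer(struct desc *ring, const uint64_t *addr, unsigned n)
{
        unsigned i;

        for (i = 0; i < n; i++) {
                uint64_t dma_addr = *addr++;
                ring->pkt_addr = dma_addr;
                ring++;
        }
}

Modern compilers often make these transformations themselves, but the
tighter scope keeps the intent explicit and the register allocation
more predictable.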

Signed-off-by: Mike A. Polehn <mike.a.polehn at intel.com>

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index ec62f75..2032e06 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -64,6 +64,7 @@
 #define DEFAULT_TX_FREE_THRESH 32
 #define I40E_MAX_PKT_TYPE      256
 #define I40E_RX_INPUT_BUF_MAX  256
+#define I40E_RX_FREE_THRESH_MIN  2

 #define I40E_TX_MAX_BURST  32

@@ -942,6 +943,12 @@ check_rx_burst_bulk_alloc_preconditions(__rte_unused struct i40e_rx_queue *rxq)
                             "rxq->rx_free_thresh=%d",
                             rxq->nb_rx_desc, rxq->rx_free_thresh);
                ret = -EINVAL;
+       } else if (rxq->rx_free_thresh < I40E_RX_FREE_THRESH_MIN) {
+               PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
+                               "rxq->rx_free_thresh=%d, "
+                               "I40E_RX_FREE_THRESH_MIN=%d",
+                               rxq->rx_free_thresh, I40E_RX_FREE_THRESH_MIN);
+               ret = -EINVAL;
        } else if (!(rxq->nb_rx_desc < (I40E_MAX_RING_DESC -
                                RTE_PMD_I40E_RX_MAX_BURST))) {
                PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
@@ -1058,9 +1065,8 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
 {
        volatile union i40e_rx_desc *rxdp;
        struct i40e_rx_entry *rxep;
-       struct rte_mbuf *mb;
-       unsigned alloc_idx, i;
-       uint64_t dma_addr;
+       struct rte_mbuf *pk, *npk;
+       unsigned alloc_idx, i, l;
        int diag;

        /* Allocate buffers in bulk */
@@ -1076,22 +1082,36 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
                return -ENOMEM;
        }

+       pk = rxep->mbuf;
+       rte_prefetch0(pk);
+       rxep++;
+       npk = rxep->mbuf;
+       rte_prefetch0(npk);
+       rxep++;
+       l = rxq->rx_free_thresh - 2;
+
        rxdp = &rxq->rx_ring[alloc_idx];
        for (i = 0; i < rxq->rx_free_thresh; i++) {
-               if (likely(i < (rxq->rx_free_thresh - 1)))
+               struct rte_mbuf *mb = pk;
+               pk = npk;
+               if (likely(i < l)) {
                        /* Prefetch next mbuf */
-                       rte_prefetch0(rxep[i + 1].mbuf);
-
-               mb = rxep[i].mbuf;
-               rte_mbuf_refcnt_set(mb, 1);
-               mb->next = NULL;
+                       npk = rxep->mbuf;
+                       rte_prefetch0(npk);
+                       rxep++;
+               }
                mb->data_off = RTE_PKTMBUF_HEADROOM;
+               rte_mbuf_refcnt_set(mb, 1);
                mb->nb_segs = 1;
                mb->port = rxq->port_id;
-               dma_addr = rte_cpu_to_le_64(\
-                       RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
-               rxdp[i].read.hdr_addr = 0;
-               rxdp[i].read.pkt_addr = dma_addr;
+               mb->next = NULL;
+               {
+                       uint64_t dma_addr = rte_cpu_to_le_64(
+                               RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
+                       rxdp->read.hdr_addr = dma_addr;
+                       rxdp->read.pkt_addr = dma_addr;
+               }
+               rxdp++;
        }

        rxq->rx_last_pos = alloc_idx + rxq->rx_free_thresh - 1;
