Added a check requiring a minimum packet allocation count (rx_free_thresh) of 2, eliminating the extra overhead the prefetch code would otherwise need to support allocating only one packet into the queue at a time.
Used standard-size variables to reduce the overhead of non-standard variable sizes. Added a second-level prefetch to get the packet address into cache level 0 earlier, and eliminated the per-iteration calculation of the prefetch loop end by computing it once before the loop. Applied long-standing C optimization techniques: pointers instead of array indexing, and reduced variable scope to improve the chances of the compiler using register variables instead of stack variables.

Signed-off-by: Mike A. Polehn <mike.a.polehn@intel.com>

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index ec62f75..2032e06 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -64,6 +64,7 @@
 #define DEFAULT_TX_FREE_THRESH 32
 #define I40E_MAX_PKT_TYPE 256
 #define I40E_RX_INPUT_BUF_MAX 256
+#define I40E_RX_FREE_THRESH_MIN 2
 
 #define I40E_TX_MAX_BURST 32
 
@@ -942,6 +943,12 @@ check_rx_burst_bulk_alloc_preconditions(__rte_unused struct i40e_rx_queue *rxq)
 			     "rxq->rx_free_thresh=%d",
 			     rxq->nb_rx_desc, rxq->rx_free_thresh);
 		ret = -EINVAL;
+	} else if (rxq->rx_free_thresh < I40E_RX_FREE_THRESH_MIN) {
+		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
+			     "rxq->rx_free_thresh=%d, "
+			     "I40E_RX_FREE_THRESH_MIN=%d",
+			     rxq->rx_free_thresh, I40E_RX_FREE_THRESH_MIN);
+		ret = -EINVAL;
 	} else if (!(rxq->nb_rx_desc < (I40E_MAX_RING_DESC -
 				RTE_PMD_I40E_RX_MAX_BURST))) {
 		PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
@@ -1058,9 +1065,8 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
 {
 	volatile union i40e_rx_desc *rxdp;
 	struct i40e_rx_entry *rxep;
-	struct rte_mbuf *mb;
-	unsigned alloc_idx, i;
-	uint64_t dma_addr;
+	struct rte_mbuf *pk, *npk;
+	unsigned alloc_idx, i, l;
 	int diag;
 
 	/* Allocate buffers in bulk */
@@ -1076,22 +1082,36 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
 		return -ENOMEM;
 	}
 
+	pk = rxep->mbuf;
+	rte_prefetch0(pk);
+	rxep++;
+	npk = rxep->mbuf;
+	rte_prefetch0(npk);
+	rxep++;
+	l = rxq->rx_free_thresh - 2;
+
 	rxdp = &rxq->rx_ring[alloc_idx];
 	for (i = 0; i < rxq->rx_free_thresh; i++) {
-		if (likely(i < (rxq->rx_free_thresh - 1)))
+		struct rte_mbuf *mb = pk;
+		pk = npk;
+		if (likely(i < l)) {
 			/* Prefetch next mbuf */
-			rte_prefetch0(rxep[i + 1].mbuf);
-
-		mb = rxep[i].mbuf;
-		rte_mbuf_refcnt_set(mb, 1);
-		mb->next = NULL;
+			npk = rxep->mbuf;
+			rte_prefetch0(npk);
+			rxep++;
+		}
 		mb->data_off = RTE_PKTMBUF_HEADROOM;
+		rte_mbuf_refcnt_set(mb, 1);
 		mb->nb_segs = 1;
 		mb->port = rxq->port_id;
-		dma_addr = rte_cpu_to_le_64(\
-			RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
-		rxdp[i].read.hdr_addr = 0;
-		rxdp[i].read.pkt_addr = dma_addr;
+		mb->next = NULL;
+		{
+			uint64_t dma_addr = rte_cpu_to_le_64(
+				RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
+			rxdp->read.hdr_addr = dma_addr;
+			rxdp->read.pkt_addr = dma_addr;
+		}
+		rxdp++;
 	}
 
 	rxq->rx_last_pos = alloc_idx + rxq->rx_free_thresh - 1;
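
For reviewers unfamiliar with the pattern, here is a minimal standalone
sketch (not part of the patch) of the same rolling, two-deep prefetch
with a precomputed loop end. All names here (struct item, prefetch,
init_all) are hypothetical; prefetch() stands in for rte_prefetch0()
using the GCC/clang __builtin_prefetch. Like the patch, it assumes a
count of at least 2, which is what the new I40E_RX_FREE_THRESH_MIN
precondition guarantees for rx_free_thresh.

/* Stand-in for rte_prefetch0(): read access, high temporal locality. */
static inline void prefetch(const void *p)
{
	__builtin_prefetch(p, 0, 3);
}

struct item { int v; };

/* Touch each of the n entries of tbl; n must be >= 2. */
static void init_all(struct item **tbl, unsigned n)
{
	struct item *pk, *npk;
	unsigned i, l;

	pk = *tbl++;		/* first level: entry for this iteration */
	prefetch(pk);
	npk = *tbl++;		/* second level: entry for the next one */
	prefetch(npk);
	l = n - 2;		/* loop end computed once, outside the loop */

	for (i = 0; i < n; i++) {
		struct item *it = pk;	/* narrow scope: likely a register */

		pk = npk;
		if (i < l) {		/* no per-iteration "n - 1" math */
			npk = *tbl++;	/* pointer walk, not tbl[i + 2] */
			prefetch(npk);
		}
		it->v = 0;		/* the actual per-entry work */
	}
}

Because the two leading prefetches consume two table entries before the
loop starts, a count below 2 would read past the table, which is the
overhead case the new minimum-threshold precondition removes.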