Following "i40e_tx_free_bufs_avx512", this patch accesses the mempool cache outside of the mempool API so that buffers can be freed into it directly. There are two changes compared with the previous version: 1. change txep from "i40e_tx_entry" to "i40e_vec_tx_entry" 2. take the cache out of the "mempool_bulk" API so that buffers are copied into it directly
Performance Test with l3fwd neon path: with this patch n1sdp: no performance change ampere-altra: +4.0% Suggested-by: Konstantin Ananyev <konstantin.v.anan...@yandex.ru> Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com> Signed-off-by: Feifei Wang <feifei.wa...@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_common.h | 36 ++++++++++++++++++++----- drivers/net/i40e/i40e_rxtx_vec_neon.c | 10 ++++--- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h index 959832ed6a..e418225b4e 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_common.h +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h @@ -81,7 +81,7 @@ reassemble_packets(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_bufs, static __rte_always_inline int i40e_tx_free_bufs(struct i40e_tx_queue *txq) { - struct i40e_tx_entry *txep; + struct i40e_vec_tx_entry *txep; uint32_t n; uint32_t i; int nb_free = 0; @@ -98,17 +98,39 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) /* first buffer to free from S/W ring is at index * tx_next_dd - (tx_rs_thresh-1) */ - txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; + txep = (void *)txq->sw_ring; + txep += txq->tx_next_dd - (n - 1); if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { - for (i = 0; i < n; i++) { - free[i] = txep[i].mbuf; - /* no need to reset txep[i].mbuf in vector path */ + struct rte_mempool *mp = txep[0].mbuf->pool; + void **cache_objs; + struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, + rte_lcore_id()); + + if (!cache || cache->len == 0) + goto normal; + + cache_objs = &cache->objs[cache->len]; + + if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { + rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); + goto done; + } + + rte_memcpy(cache_objs, txep, sizeof(void *) * n); + /* no need to reset txep[i].mbuf in vector path */ + cache->len += n; + + if (cache->len >= cache->flushthresh) { + rte_mempool_ops_enqueue_bulk + (mp, &cache->objs[cache->size], + cache->len - 
cache->size); + cache->len = cache->size; } - rte_mempool_put_bulk(free[0]->pool, (void **)free, n); goto done; } +normal: m = rte_pktmbuf_prefree_seg(txep[0].mbuf); if (likely(m != NULL)) { free[0] = m; @@ -147,7 +169,7 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) } static __rte_always_inline void -tx_backlog_entry(struct i40e_tx_entry *txep, +tx_backlog_entry(struct i40e_vec_tx_entry *txep, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) { int i; diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index 12e6f1cbcb..d2d61e8ef4 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -680,12 +680,15 @@ i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue, { struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue; volatile struct i40e_tx_desc *txdp; - struct i40e_tx_entry *txep; + struct i40e_vec_tx_entry *txep; uint16_t n, nb_commit, tx_id; uint64_t flags = I40E_TD_CMD; uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD; int i; + /* cross rx_thresh boundary is not allowed */ + nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh); + if (txq->nb_tx_free < txq->tx_free_thresh) i40e_tx_free_bufs(txq); @@ -695,7 +698,8 @@ i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue, tx_id = txq->tx_tail; txdp = &txq->tx_ring[tx_id]; - txep = &txq->sw_ring[tx_id]; + txep = (void *)txq->sw_ring; + txep += tx_id; txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts); @@ -715,7 +719,7 @@ i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue, /* avoid reach the end of ring */ txdp = &txq->tx_ring[tx_id]; - txep = &txq->sw_ring[tx_id]; + txep = (void *)txq->sw_ring; } tx_backlog_entry(txep, tx_pkts, nb_commit); -- 2.25.1