Following the approach of "i40e_tx_free_bufs_avx512", this patch accesses
the mempool cache directly, bypassing the mempool API, when freeing
buffers. There are two changes compared with the previous version:
1. change the type of txep from "i40e_tx_entry" to "i40e_vec_tx_entry"
2. bypass the "rte_mempool_put_bulk" API and copy buffers into the cache directly

Performance Test with l3fwd neon path:
                with this patch
n1sdp:          no performance change
ampere-altra:   +4.0%

Suggested-by: Konstantin Ananyev <konstantin.v.anan...@yandex.ru>
Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
Signed-off-by: Feifei Wang <feifei.wa...@arm.com>
---
 drivers/net/i40e/i40e_rxtx_vec_common.h | 36 ++++++++++++++++++++-----
 drivers/net/i40e/i40e_rxtx_vec_neon.c   | 10 ++++---
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h 
b/drivers/net/i40e/i40e_rxtx_vec_common.h
index 959832ed6a..e418225b4e 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_common.h
+++ b/drivers/net/i40e/i40e_rxtx_vec_common.h
@@ -81,7 +81,7 @@ reassemble_packets(struct i40e_rx_queue *rxq, struct rte_mbuf 
**rx_bufs,
 static __rte_always_inline int
 i40e_tx_free_bufs(struct i40e_tx_queue *txq)
 {
-       struct i40e_tx_entry *txep;
+       struct i40e_vec_tx_entry *txep;
        uint32_t n;
        uint32_t i;
        int nb_free = 0;
@@ -98,17 +98,39 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
         /* first buffer to free from S/W ring is at index
          * tx_next_dd - (tx_rs_thresh-1)
          */
-       txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+       txep = (void *)txq->sw_ring;
+       txep += txq->tx_next_dd - (n - 1);
 
        if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
-               for (i = 0; i < n; i++) {
-                       free[i] = txep[i].mbuf;
-                       /* no need to reset txep[i].mbuf in vector path */
+               struct rte_mempool *mp = txep[0].mbuf->pool;
+               void **cache_objs;
+               struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
+                               rte_lcore_id());
+
+               if (!cache || cache->len == 0)
+                       goto normal;
+
+               cache_objs = &cache->objs[cache->len];
+
+               if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+                       rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
+                       goto done;
+               }
+
+               rte_memcpy(cache_objs, txep, sizeof(void *) * n);
+               /* no need to reset txep[i].mbuf in vector path */
+               cache->len += n;
+
+               if (cache->len >= cache->flushthresh) {
+                       rte_mempool_ops_enqueue_bulk
+                               (mp, &cache->objs[cache->size],
+                               cache->len - cache->size);
+                       cache->len = cache->size;
                }
-               rte_mempool_put_bulk(free[0]->pool, (void **)free, n);
                goto done;
        }
 
+normal:
        m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
        if (likely(m != NULL)) {
                free[0] = m;
@@ -147,7 +169,7 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
 }
 
 static __rte_always_inline void
-tx_backlog_entry(struct i40e_tx_entry *txep,
+tx_backlog_entry(struct i40e_vec_tx_entry *txep,
                 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 {
        int i;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c 
b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 12e6f1cbcb..d2d61e8ef4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -680,12 +680,15 @@ i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue,
 {
        struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
        volatile struct i40e_tx_desc *txdp;
-       struct i40e_tx_entry *txep;
+       struct i40e_vec_tx_entry *txep;
        uint16_t n, nb_commit, tx_id;
        uint64_t flags = I40E_TD_CMD;
        uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
        int i;
 
+       /* crossing the tx_rs_thresh boundary is not allowed */
+       nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
        if (txq->nb_tx_free < txq->tx_free_thresh)
                i40e_tx_free_bufs(txq);
 
@@ -695,7 +698,8 @@ i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue,
 
        tx_id = txq->tx_tail;
        txdp = &txq->tx_ring[tx_id];
-       txep = &txq->sw_ring[tx_id];
+       txep = (void *)txq->sw_ring;
+       txep += tx_id;
 
        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
 
@@ -715,7 +719,7 @@ i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue,
 
                /* avoid reach the end of ring */
                txdp = &txq->tx_ring[tx_id];
-               txep = &txq->sw_ring[tx_id];
+               txep = (void *)txq->sw_ring;
        }
 
        tx_backlog_entry(txep, tx_pkts, nb_commit);
-- 
2.25.1

Reply via email to