+CC: i40e maintainers > From: Kamalakshitha Aligeri [mailto:kamalakshitha.alig...@arm.com] > Sent: Monday, 9 January 2023 15.58 > > Integrated zero-copy get and put API's in mempool cache in i40e PMD > > Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.alig...@arm.com> > --- > 1. I have replaced the rte_mempool_get_bulk and rte_mempool_put_bulk in > net/i40e with the zero-copy get and put API's > > drivers/net/i40e/i40e_rxtx_vec_avx512.c | 10 +--------- > drivers/net/i40e/i40e_rxtx_vec_common.h | 21 +++++++++++++-------- > drivers/net/i40e/i40e_rxtx_vec_neon.c | 16 ++++++++++++---- > 3 files changed, 26 insertions(+), 21 deletions(-) > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c > b/drivers/net/i40e/i40e_rxtx_vec_avx512.c > index 60c97d5331..736bd4650f 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
i40e_rxq_rearm() also accesses the cache directly, and thus needs rewriting to the new mempool cache API. > @@ -909,7 +909,7 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq) > if (!cache || cache->len == 0) This is not your doing, but I don't understand the reason for the cache->len == 0 comparison here. Why not store objects in the cache if it is empty? Maybe an old copy-paste bug? > goto normal; > > - cache_objs = &cache->objs[cache->len]; > + cache_objs = rte_mempool_cache_zc_put_bulk(cache, mp, n); > > if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) { This comparison should be (cache_objs == NULL) instead. > rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n); The comment block on lines 919-923 must be deleted too. > @@ -936,14 +936,6 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue > *txq) > _mm512_storeu_si512(&cache_objs[copied + 24], d); > copied += 32; > } > - cache->len += n; > - > - if (cache->len >= cache->flushthresh) { > - rte_mempool_ops_enqueue_bulk > - (mp, &cache->objs[cache->size], > - cache->len - cache->size); > - cache->len = cache->size; > - } > goto done; > } > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h > b/drivers/net/i40e/i40e_rxtx_vec_common.h > index fe1a6ec75e..4fc4aa0aec 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_common.h > +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h > @@ -89,23 +89,28 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) > > /* check DD bits on threshold descriptor */ > if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz & > - rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != > + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) != > rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) > return 0; > > n = txq->tx_rs_thresh; > > - /* first buffer to free from S/W ring is at index > - * tx_next_dd - (tx_rs_thresh-1) > - */ > + /* first buffer to free from S/W ring is at index > + * tx_next_dd - (tx_rs_thresh-1) > + */ > txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)]; > + struct rte_mempool *mp = txep[0].mbuf->pool; > + struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, > rte_lcore_id()); > + void **cache_objs; > + > + cache_objs = rte_mempool_cache_zc_put_bulk(cache, mp, n); These belong inside the "if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)" block. > > if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) { > for (i = 0; i < n; i++) { > - free[i] = txep[i].mbuf; > + rte_memcpy(&cache_objs[i], &txep->mbuf, sizeof(struct > rte_mbuf)); You must copy pointers to mbufs, not mbuf structures. I.e. instead of rte_memcpy(...) do this: + cache_objs[i] = txep->mbuf; > /* no need to reset txep[i].mbuf in vector path */ > + txep++; > } > - rte_mempool_put_bulk(free[0]->pool, (void **)free, n); > goto done; > } > > @@ -120,8 +125,8 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) > free[nb_free++] = m; > } else { > rte_mempool_put_bulk(free[0]->pool, > - (void *)free, > - nb_free); > + (void *)free, > + nb_free); > free[0] = m; > nb_free = 1; > } > diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c > b/drivers/net/i40e/i40e_rxtx_vec_neon.c > index 12e6f1cbcb..ebc2161b84 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c > @@ -28,15 +28,19 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > uint64x2_t dma_addr0, dma_addr1; > uint64x2_t zero = vdupq_n_u64(0); > uint64_t paddr; > + uint32_t index, n; No need for "index"; just reuse "i" instead. No need for "n"; just use RTE_I40E_RXQ_REARM_THRESH. > > + n = RTE_I40E_RXQ_REARM_THRESH; > rxdp = rxq->rx_ring + rxq->rxrearm_start; > + struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq- > >mp, rte_lcore_id()); > + void **cache_objs; You must add support for mempools without cache: if (cache == NULL) ... > + > + cache_objs = rte_mempool_cache_zc_get_bulk(cache, rxq->mp, n); > > /* Pull 'n' more MBUFs into the software ring */ > - if (unlikely(rte_mempool_get_bulk(rxq->mp, > - (void *)rxep, > - RTE_I40E_RXQ_REARM_THRESH) < 0)) { > + if (unlikely(!cache_objs)) { > if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= > - rxq->nb_rx_desc) { > + rxq->nb_rx_desc) { > for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { > rxep[i].mbuf = &rxq->fake_mbuf; > vst1q_u64((uint64_t *)&rxdp[i].read, zero); > @@ -46,6 +50,10 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > RTE_I40E_RXQ_REARM_THRESH; > return; > } > + for (index = 0; index < n; index++) { > + rxep->mbuf = cache_objs[index]; > + rxep++; > + } Please note that struct i40e_rx_entry is essentially the same as struct rte_mbuf [1]. This was taken advantage of in the rte_mempool_get_bulk() above. [1]: https://elixir.bootlin.com/dpdk/latest/source/drivers/net/i40e/i40e_rxtx.h#L77 It means that the loop that copies the mbuf pointers from the cache_objs[] array to the rxep[] array can be replaced by: rte_memcpy(rxep, cache_objs, RTE_I40E_RXQ_REARM_THRESH * sizeof(void *)); > > /* Initialize the mbufs in vector, process 2 mbufs in one loop */ > for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) { > -- > 2.25.1 >