rte_pktmbuf_free_bulk calls rte_mempool_put_bulk with the number of pending packets to return to the mempool. In contrast, rte_pktmbuf_free calls rte_mempool_put that calls rte_mempool_put_bulk with one object. An important performance related downside of adding one packet at a time to the mempool is that on each call, the per-core cache pointer needs to be read from tls while a single rte_mempool_put_bulk only reads from the tls once.
Signed-off-by: Balazs Nemeth <bnem...@redhat.com> Reviewed-by: Igor Russkikh <irussk...@marvell.com> --- drivers/net/qede/qede_rxtx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 9294f79eb..f439ee056 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -893,6 +893,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct rte_mbuf *mbuf; uint16_t nb_segs; uint16_t idx; + uint16_t first_idx; rte_compiler_barrier(); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); @@ -907,6 +908,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, remaining = hw_bd_cons - sw_tx_cons; txq->nb_tx_avail += remaining; + first_idx = idx; while (remaining) { mbuf = txq->sw_tx_ring[idx]; @@ -921,11 +923,19 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, nb_segs--; } - rte_pktmbuf_free(mbuf); idx = (idx + 1) & mask; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } txq->sw_tx_cons = idx; + + if (first_idx > idx) { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], + mask - first_idx + 1); + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[0], idx); + } else { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], + idx - first_idx); + } } static int qede_drain_txq(struct qede_dev *qdev, -- 2.30.2