rte_pktmbuf_free_bulk calls rte_mempool_put_bulk with the number of pending packets to return to the mempool. In contrast, rte_pktmbuf_free calls rte_mempool_put that calls rte_mempool_put_bulk with one object. An important performance related downside of adding one packet at a time to the mempool is that on each call, the per-core cache pointer needs to be read from tls while a single rte_mempool_put_bulk only reads from the tls once.
Signed-off-by: Balazs Nemeth <bnem...@redhat.com> --- drivers/net/qede/qede_rxtx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index 7ce4c00e3..e24a937f4 100644 --- a/drivers/net/qede/qede_rxtx.c +++ b/drivers/net/qede/qede_rxtx.c @@ -891,6 +891,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, struct rte_mbuf *mbuf; uint16_t nb_segs; uint16_t idx; + uint16_t first_idx; rte_compiler_barrier(); sw_tx_cons = ecore_chain_get_cons_idx(&txq->tx_pbl); @@ -905,6 +906,7 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, remaining = hw_bd_cons - sw_tx_cons; txq->nb_tx_avail += remaining; + first_idx = idx; while (remaining) { mbuf = txq->sw_tx_ring[idx]; @@ -919,11 +921,17 @@ qede_process_tx_compl(__rte_unused struct ecore_dev *edev, nb_segs--; } - rte_pktmbuf_free(mbuf); idx = (idx + 1) & mask; PMD_TX_LOG(DEBUG, txq, "Freed tx packet\n"); } txq->sw_tx_cons = idx; + + if (first_idx > idx) { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], mask - first_idx + 1); + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[0], idx); + } else { + rte_pktmbuf_free_bulk(&txq->sw_tx_ring[first_idx], idx - first_idx); + } } static int qede_drain_txq(struct qede_dev *qdev, -- 2.29.2