On Wed, Jan 31, 2024 at 07:45:50PM -0800, lon...@linuxonhyperv.com wrote:
> From: Long Li <lon...@microsoft.com>
>
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
>
> There are no measurable performance improvements in benchmarks. However,
> this patch should improve CPU cycles and reduce potential locking
> conflicts in real-world applications.
>
> Signed-off-by: Long Li <lon...@microsoft.com>
> ---
> Change in v2:
> use rte_calloc_socket() in place of rte_calloc()
>
> v3:
> add more comment explaining the benefit of doing alloc_bulk.
> free mbufs that are failed to post
>
>  drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
> index acad5e26cd..6112db2219 100644
> --- a/drivers/net/mana/rx.c
> +++ b/drivers/net/mana/rx.c
> @@ -2,6 +2,7 @@
>   * Copyright 2022 Microsoft Corporation
>   */
>  #include <ethdev_driver.h>
> +#include <rte_malloc.h>
>
>  #include <infiniband/verbs.h>
>  #include <infiniband/manadv.h>
> @@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
>  }
>
>  static int
> -mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
> +mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
>  {
> -	struct rte_mbuf *mbuf = NULL;
>  	struct gdma_sgl_element sgl[1];
>  	struct gdma_work_request request;
>  	uint32_t wqe_size_in_bu;
> @@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>  	int ret;
>  	struct mana_mr_cache *mr;
>
> -	mbuf = rte_pktmbuf_alloc(rxq->mp);
> -	if (!mbuf) {
> -		rxq->stats.nombuf++;
> -		return -ENOMEM;
> -	}
> -
>  	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
>  	if (!mr) {
>  		DP_LOG(ERR, "failed to register RX MR");
> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>   * Post work requests for a Rx queue.
>   */
>  static int
> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>  {
>  	int ret;
>  	uint32_t i;
> +	struct rte_mbuf **mbufs;
> +
> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
> +				  0, rxq->mp->socket_id);
> +	if (!mbufs)
> +		return -ENOMEM;
> +
> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> +	if (ret) {
> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> +		rxq->stats.nombuf += count;
> +		goto fail;
> +	}
>
>  #ifdef RTE_ARCH_32
>  	rxq->wqe_cnt_to_short_db = 0;
>  #endif
> -	for (i = 0; i < rxq->num_desc; i++) {
> -		ret = mana_alloc_and_post_rx_wqe(rxq);
> +	for (i = 0; i < count; i++) {
> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
>  		if (ret) {
>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> -			return ret;
> +			break;
>  		}
>
>  #ifdef RTE_ARCH_32
> @@ -144,8 +151,16 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
>  #endif
>  	}
>
> +	/* Free the remaining mbufs that are not posted */
> +	while (i < count) {
> +		rte_pktmbuf_free(mbufs[i]);
> +		i++;
> +	}
There is also rte_pktmbuf_free_bulk() that could be used here. It probably won't make any material difference to perf, though, so just an FYI.
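Something along these lines (untested sketch, reusing the mbufs/i/count
variables from the hunk above) would collapse the cleanup loop into one call:

	/* Free any mbufs that were allocated but not posted */
	if (i < count)
		rte_pktmbuf_free_bulk(&mbufs[i], count - i);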