On Wed, Jan 31, 2024 at 07:45:50PM -0800, lon...@linuxonhyperv.com wrote:
> From: Long Li <lon...@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
> 
> There are no measurable performance improvements in benchmarks. However,
> this patch should save CPU cycles and reduce potential locking
> conflicts in real-world applications.
> 
> Signed-off-by: Long Li <lon...@microsoft.com>
> ---
> Change in v2:
> use rte_calloc_socket() in place of rte_calloc()
> 
> v3:
> add more comments explaining the benefit of doing alloc_bulk.
> free mbufs that failed to be posted
> 
>  drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
> index acad5e26cd..6112db2219 100644
> --- a/drivers/net/mana/rx.c
> +++ b/drivers/net/mana/rx.c
> @@ -2,6 +2,7 @@
>   * Copyright 2022 Microsoft Corporation
>   */
>  #include <ethdev_driver.h>
> +#include <rte_malloc.h>
>  
>  #include <infiniband/verbs.h>
>  #include <infiniband/manadv.h>
> @@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
>  }
>  
>  static int
> -mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
> +mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
>  {
> -     struct rte_mbuf *mbuf = NULL;
>       struct gdma_sgl_element sgl[1];
>       struct gdma_work_request request;
>       uint32_t wqe_size_in_bu;
> @@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>       int ret;
>       struct mana_mr_cache *mr;
>  
> -     mbuf = rte_pktmbuf_alloc(rxq->mp);
> -     if (!mbuf) {
> -             rxq->stats.nombuf++;
> -             return -ENOMEM;
> -     }
> -
>       mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
>       if (!mr) {
>               DP_LOG(ERR, "failed to register RX MR");
> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>   * Post work requests for a Rx queue.
>   */
>  static int
> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>  {
>       int ret;
>       uint32_t i;
> +     struct rte_mbuf **mbufs;
> +
> +     mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct 
> rte_mbuf *),
> +                               0, rxq->mp->socket_id);
> +     if (!mbufs)
> +             return -ENOMEM;
> +
> +     ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> +     if (ret) {
> +             DP_LOG(ERR, "failed to allocate mbufs for RX");
> +             rxq->stats.nombuf += count;
> +             goto fail;
> +     }
>  
>  #ifdef RTE_ARCH_32
>       rxq->wqe_cnt_to_short_db = 0;
>  #endif
> -     for (i = 0; i < rxq->num_desc; i++) {
> -             ret = mana_alloc_and_post_rx_wqe(rxq);
> +     for (i = 0; i < count; i++) {
> +             ret = mana_post_rx_wqe(rxq, mbufs[i]);
>               if (ret) {
>                       DP_LOG(ERR, "failed to post RX ret = %d", ret);
> -                     return ret;
> +                     break;
>               }
>  
>  #ifdef RTE_ARCH_32
> @@ -144,8 +151,16 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
>  #endif
>       }
>  
> +     /* Free the remaining mbufs that are not posted */
> +     while (i < count) {
> +             rte_pktmbuf_free(mbufs[i]);
> +             i++;
> +     }

There is also rte_pktmbuf_free_bulk(), which could be used here instead of
freeing the remaining mbufs one by one. It probably won't make any material
difference to performance, though, so this is just an FYI.

Reply via email to