PING for review.

@Bruce, you seemed to acknowledge this, but never sent a formal Ack.

Med venlig hilsen / Kind regards,
-Morten Brørup

> -----Original Message-----
> From: Morten Brørup [mailto:m...@smartsharesystems.com]
> Sent: Wednesday, 26 February 2025 16.59
> To: Andrew Rybchenko; dev@dpdk.org
> Cc: Morten Brørup
> Subject: [PATCH] mempool: micro optimizations
> 
> The comparisons lcore_id < RTE_MAX_LCORE and lcore_id != LCORE_ID_ANY are
> equivalent, but the latter compiles to fewer bytes of code space.
> Similarly for lcore_id >= RTE_MAX_LCORE and lcore_id == LCORE_ID_ANY.
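> 
> For reference, a minimal standalone sketch of the equivalence (it assumes
> the usual DPDK invariant that rte_lcore_id() returns either a valid id
> below RTE_MAX_LCORE or LCORE_ID_ANY, i.e. UINT32_MAX; the helper name is
> invented purely for illustration):
> 
>     #include <rte_lcore.h>
> 
>     /* Hypothetical helper, illustration only. */
>     static inline int
>     lcore_id_is_valid(unsigned int lcore_id)
>     {
>         /*
>          * lcore_id never falls between RTE_MAX_LCORE and UINT32_MAX, so
>          * comparing against the single constant LCORE_ID_ANY accepts
>          * exactly the same values as the range check, while typically
>          * encoding to fewer instruction bytes.
>          */
>         return lcore_id != LCORE_ID_ANY;
>         /* equivalent here to: return lcore_id < RTE_MAX_LCORE; */
>     }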
> 
> The rte_mempool_get_ops() function is also used in the fast path, so
> RTE_VERIFY() was replaced by RTE_ASSERT().
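> 
> For context, the practical difference between the two macros (a rough
> paraphrase of their definitions in rte_debug.h, shown only to explain why
> the swap helps the fast path):
> 
>     /* RTE_VERIFY(exp) always evaluates exp and panics when it is false.
>      * RTE_ASSERT(exp) is the build-time switchable variant: */
>     #ifdef RTE_ENABLE_ASSERT
>     #define RTE_ASSERT(exp) RTE_VERIFY(exp)   /* debug builds: keep the check */
>     #else
>     #define RTE_ASSERT(exp) do {} while (0)   /* normal builds: no code emitted */
>     #endif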
> 
> Compilers implicitly consider comparisons of variable == 0 likely, so
> unlikely() was added to the check for no mempool cache
> (mp->cache_size == 0) in the rte_mempool_default_cache() function.
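> 
> As a reminder of what the annotation does, here is a rough, hypothetical
> reduction of the check (unlikely() wraps the compiler's
> __builtin_expect() hint; the function name is invented for illustration):
> 
>     #include <rte_branch_prediction.h>
>     #include <rte_lcore.h>
>     #include <rte_mempool.h>
> 
>     /* Sketch only: hint that "no per-lcore cache" is the cold path. */
>     static inline struct rte_mempool_cache *
>     default_cache_sketch(struct rte_mempool *mp, unsigned int lcore_id)
>     {
>         if (unlikely(mp->cache_size == 0))
>             return NULL;
>         if (unlikely(lcore_id == LCORE_ID_ANY))
>             return NULL;
>         return &mp->local_cache[lcore_id];
>     }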
> 
> The rte_mempool_do_generic_put() function for adding objects to a
> mempool was refactored as follows:
> - The comparison for the request itself being too big, which is
>   considered unlikely, was moved down and out of the code path where the
>   cache has sufficient room for the added objects, which is considered
>   the most likely code path.
> - Added __rte_assume() about the cache length, size and threshold, for
>   compiler optimization when "n" is a compile time constant.
> - Added __rte_assume() about "ret" being zero, so other functions using
>   the value returned by this function can be potentially optimized by
>   the compiler; especially when it merges multiple sequential code paths
>   of inlined code depending on the return value being either zero or
>   negative. (A sketch of this effect follows after this list.)
> - The refactored source code (with comments) made the separate comment
>   describing the cache flush/add algorithm superfluous, so it was
>   removed.
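> 
> A rough, hypothetical illustration of why the __rte_assume(ret == 0) hint
> can pay off once the surrounding function is inlined (the helper names
> are invented; __rte_assume() and rte_panic() are assumed to come from
> rte_common.h and rte_debug.h respectively):
> 
>     #include <rte_common.h>
>     #include <rte_debug.h>
> 
>     /* Hypothetical inlined producer whose only remaining path returns 0. */
>     static inline int
>     fill_objs(void **objs, unsigned int n)
>     {
>         int ret = 0;
>         for (unsigned int i = 0; i < n; i++)
>             objs[i] = NULL;          /* stand-in for real work */
>         __rte_assume(ret == 0);      /* promise to the compiler */
>         return ret;
>     }
> 
>     static inline void
>     caller(void **objs, unsigned int n)
>     {
>         /* After inlining, the compiler may delete this error branch. */
>         if (fill_objs(objs, n) < 0)
>             rte_panic("unreachable; folded away at compile time\n");
>     }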
> 
> A few more likely()/unlikely() were added.
> 
> A few comments were improved for readability.
> 
> Some assertions, RTE_ASSERT(), were added. Most importantly to assert
> that the return values of the mempool drivers' enqueue and dequeue
> operations are API compliant, i.e. 0 (for success) or negative (for
> failure), and never positive.
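> 
> For clarity, the asserted contract sketched as a hypothetical driver op
> (the backend_* helpers are invented; the signature follows the mempool
> driver enqueue prototype):
> 
>     #include <errno.h>
>     #include <stdbool.h>
>     #include <rte_mempool.h>
> 
>     /* Invented backend helpers, declared only to keep the sketch complete. */
>     static bool backend_has_room(struct rte_mempool *mp, unsigned int n);
>     static void backend_push(struct rte_mempool *mp, void * const *objs, unsigned int n);
> 
>     /* Sketch of a compliant enqueue op: 0 on success, negative errno on
>      * failure, never a positive count - which is exactly what the new
>      * RTE_ASSERT(ret <= 0) checks in debug builds. */
>     static int
>     sketch_enqueue(struct rte_mempool *mp, void * const *obj_table,
>             unsigned int n)
>     {
>         if (!backend_has_room(mp, n))
>             return -ENOBUFS;
>         backend_push(mp, obj_table, n);
>         return 0;
>     }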
> 
> Signed-off-by: Morten Brørup <m...@smartsharesystems.com>
> ---
>  lib/mempool/rte_mempool.h | 67 ++++++++++++++++++++++-----------------
>  1 file changed, 38 insertions(+), 29 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index c495cc012f..aedc100964 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -334,7 +334,7 @@ struct __rte_cache_aligned rte_mempool {
>  #ifdef RTE_LIBRTE_MEMPOOL_STATS
>  #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
>               unsigned int __lcore_id = rte_lcore_id();                          \
> -             if (likely(__lcore_id < RTE_MAX_LCORE))                            \
> +             if (likely(__lcore_id != LCORE_ID_ANY))                            \
>                       (mp)->stats[__lcore_id].name += (n);                       \
>               else                                                               \
>                       rte_atomic_fetch_add_explicit(&((mp)->stats[RTE_MAX_LCORE].name),  \
> @@ -751,7 +751,7 @@ extern struct rte_mempool_ops_table rte_mempool_ops_table;
>  static inline struct rte_mempool_ops *
>  rte_mempool_get_ops(int ops_index)
>  {
> -     RTE_VERIFY((ops_index >= 0) && (ops_index < RTE_MEMPOOL_MAX_OPS_IDX));
> +     RTE_ASSERT((ops_index >= 0) && (ops_index < RTE_MEMPOOL_MAX_OPS_IDX));
> 
>       return &rte_mempool_ops_table.ops[ops_index];
>  }
> @@ -791,7 +791,8 @@ rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
>       rte_mempool_trace_ops_dequeue_bulk(mp, obj_table, n);
>       ops = rte_mempool_get_ops(mp->ops_index);
>       ret = ops->dequeue(mp, obj_table, n);
> -     if (ret == 0) {
> +     RTE_ASSERT(ret <= 0);
> +     if (likely(ret == 0)) {
>               RTE_MEMPOOL_STAT_ADD(mp, get_common_pool_bulk, 1);
>               RTE_MEMPOOL_STAT_ADD(mp, get_common_pool_objs, n);
>       }
> @@ -816,11 +817,14 @@ rte_mempool_ops_dequeue_contig_blocks(struct rte_mempool *mp,
>               void **first_obj_table, unsigned int n)
>  {
>       struct rte_mempool_ops *ops;
> +     int ret;
> 
>       ops = rte_mempool_get_ops(mp->ops_index);
>       RTE_ASSERT(ops->dequeue_contig_blocks != NULL);
>       rte_mempool_trace_ops_dequeue_contig_blocks(mp, first_obj_table, n);
> -     return ops->dequeue_contig_blocks(mp, first_obj_table, n);
> +     ret = ops->dequeue_contig_blocks(mp, first_obj_table, n);
> +     RTE_ASSERT(ret <= 0);
> +     return ret;
>  }
> 
>  /**
> @@ -848,6 +852,7 @@ rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
>       rte_mempool_trace_ops_enqueue_bulk(mp, obj_table, n);
>       ops = rte_mempool_get_ops(mp->ops_index);
>       ret = ops->enqueue(mp, obj_table, n);
> +     RTE_ASSERT(ret <= 0);
>  #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
>       if (unlikely(ret < 0))
>               RTE_MEMPOOL_LOG(CRIT, "cannot enqueue %u objects to mempool %s",
> @@ -1333,10 +1338,10 @@ rte_mempool_cache_free(struct rte_mempool_cache *cache);
>  static __rte_always_inline struct rte_mempool_cache *
>  rte_mempool_default_cache(struct rte_mempool *mp, unsigned lcore_id)
>  {
> -     if (mp->cache_size == 0)
> +     if (unlikely(mp->cache_size == 0))
>               return NULL;
> 
> -     if (lcore_id >= RTE_MAX_LCORE)
> +     if (unlikely(lcore_id == LCORE_ID_ANY))
>               return NULL;
> 
>       rte_mempool_trace_default_cache(mp, lcore_id,
> @@ -1383,32 +1388,33 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  {
>       void **cache_objs;
> 
> -     /* No cache provided */
> +     /* No cache provided? */
>       if (unlikely(cache == NULL))
>               goto driver_enqueue;
> 
> -     /* increment stat now, adding in mempool always success */
> +     /* Increment stats now, adding in mempool always succeeds. */
>       RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
>       RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> 
> -     /* The request itself is too big for the cache */
> -     if (unlikely(n > cache->flushthresh))
> -             goto driver_enqueue_stats_incremented;
> -
> -     /*
> -      * The cache follows the following algorithm:
> -      *   1. If the objects cannot be added to the cache without crossing
> -      *      the flush threshold, flush the cache to the backend.
> -      *   2. Add the objects to the cache.
> -      */
> -
> -     if (cache->len + n <= cache->flushthresh) {
> +     __rte_assume(cache->flushthresh <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> +     __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> +     __rte_assume(cache->len <= cache->flushthresh);
> +     if (likely(cache->len + n <= cache->flushthresh)) {
> +             /* Sufficient room in the cache for the objects. */
>               cache_objs = &cache->objs[cache->len];
>               cache->len += n;
> -     } else {
> +     } else if (n <= cache->flushthresh) {
> +             /*
> +              * The cache is big enough for the objects, but - as detected by
> +              * the comparison above - has insufficient room for them.
> +              * Flush the cache to make room for the objects.
> +              */
>               cache_objs = &cache->objs[0];
>               rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
>               cache->len = n;
> +     } else {
> +             /* The request itself is too big for the cache. */
> +             goto driver_enqueue_stats_incremented;
>       }
> 
>       /* Add the objects to the cache. */
> @@ -1515,7 +1521,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>       uint32_t index, len;
>       void **cache_objs;
> 
> -     /* No cache provided */
> +     /* No cache provided? */
>       if (unlikely(cache == NULL)) {
>               remaining = n;
>               goto driver_dequeue;
> @@ -1524,6 +1530,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>       /* The cache is a stack, so copy will be in reverse order. */
>       cache_objs = &cache->objs[cache->len];
> 
> +     __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
>       if (__rte_constant(n) && n <= cache->len) {
>               /*
>                * The request size is known at build time, and
> @@ -1556,7 +1563,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>        * where the entire request can be satisfied from the cache
>        * has already been handled above, so omit handling it here.
>        */
> -     if (!__rte_constant(n) && remaining == 0) {
> +     if (!__rte_constant(n) && likely(remaining == 0)) {
>               /* The entire request is satisfied from the cache. */
> 
>               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> @@ -1565,7 +1572,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>               return 0;
>       }
> 
> -     /* if dequeue below would overflow mem allocated for cache */
> +     /* Dequeue below would overflow mem allocated for cache? */
>       if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
>               goto driver_dequeue;
> 
> @@ -1574,8 +1581,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>                       cache->size + remaining);
>       if (unlikely(ret < 0)) {
>               /*
> -              * We are buffer constrained, and not able to allocate
> -              * cache + remaining.
> +              * We are buffer constrained, and not able to fetch all that.
>                * Do not fill the cache, just satisfy the remaining part of
>                * the request directly from the backend.
>                */
> @@ -1583,6 +1589,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>       }
> 
>       /* Satisfy the remaining part of the request from the filled cache. */
> +     __rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> +     __rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
>       cache_objs = &cache->objs[cache->size + remaining];
>       for (index = 0; index < remaining; index++)
>               *obj_table++ = *--cache_objs;
> @@ -1599,7 +1607,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>       /* Get remaining objects directly from the backend. */
>       ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> 
> -     if (ret < 0) {
> +     if (unlikely(ret < 0)) {
>               if (likely(cache != NULL)) {
>                       cache->len = n - remaining;
>                       /*
> @@ -1619,6 +1627,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>                       RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
>                       RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
>               }
> +             __rte_assume(ret == 0);
>       }
> 
>       return ret;
> @@ -1650,7 +1659,7 @@ rte_mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>  {
>       int ret;
>       ret = rte_mempool_do_generic_get(mp, obj_table, n, cache);
> -     if (ret == 0)
> +     if (likely(ret == 0))
>               RTE_MEMPOOL_CHECK_COOKIES(mp, obj_table, n, 1);
>       rte_mempool_trace_generic_get(mp, obj_table, n, cache);
>       return ret;
> @@ -1741,7 +1750,7 @@ rte_mempool_get_contig_blocks(struct rte_mempool *mp,
>       int ret;
> 
>       ret = rte_mempool_ops_dequeue_contig_blocks(mp, first_obj_table, n);
> -     if (ret == 0) {
> +     if (likely(ret == 0)) {
>               RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
>               RTE_MEMPOOL_STAT_ADD(mp, get_success_blks, n);
>               RTE_MEMPOOL_CONTIG_BLOCKS_CHECK_COOKIES(mp, first_obj_table, n,
> --
> 2.43.0
