On Sun, Apr 19, 2026 at 09:55:26AM +0000, Morten Brørup wrote:
> This patch refactors the mempool cache to eliminate some unexpected
> behaviour and reduce the mempool cache miss rate.
> 

Agree in principle with most of these changes. As we dicussed at the DPDK
summit conference, only issue I really have is with the threshold limits
here - allocating and freeing only half the cache at a time seems overly
conservative. Thinking about use-cases:

1 for apps where alloc + free (generally Rx+Tx) is on the same core(s),
  then we should run (almost) entirely out of cache.
2 for apps where we have alloc and free on different cores, then we have
  some caches always being filled and others always being emptied

For case #1, we only need worry about the thresholds for the odd case where
we have a burst that causes us to overflow our cache (and we can't increase
our cache size to cope and avoid that). Otherwise the thresholds don't
matter. However, for case #2, the thresholds are constantly involved as we
always are going to backing store. In this case, we really want to have the
allocs *always* fill the cache completely, and the frees completely empty
the cache.

Because of this, while we want to avoid cases where we fill the cache
completely only to have a further free causing it to be flushed, because of
case #2 we cannot be overly conservative in how much we free/empty.
Ideally, we want to fill to full less a single burst, and empty leaving
only a single burst in the cache. Unfortunately, we don't know what those
burst limits are, so we have to try and guess the best behaviour from
everything else.

All that said, commits with specific suggestions inline.

/Bruce

<snip>

> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 2e54fc4466..432c43ab15 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -89,7 +89,7 @@ struct __rte_cache_aligned rte_mempool_debug_stats {
>   */
>  struct __rte_cache_aligned rte_mempool_cache {
>       uint32_t size;        /**< Size of the cache */
> -     uint32_t flushthresh; /**< Threshold before we flush excess elements */
> +     uint32_t flushthresh; /**< Obsolete; for API/ABI compatibility purposes 
> only */
>       uint32_t len;         /**< Current cache count */
>  #ifdef RTE_LIBRTE_MEMPOOL_STATS
>       uint32_t unused;
> @@ -107,8 +107,10 @@ struct __rte_cache_aligned rte_mempool_cache {
>       /**
>        * Cache objects
>        *
> -      * Cache is allocated to this size to allow it to overflow in certain
> -      * cases to avoid needless emptying of cache.
> +      * Note:
> +      * Cache is allocated at double size for API/ABI compatibility purposes 
> only.
> +      * When reducing its size at an API/ABI breaking release,
> +      * remember to add a cache guard after it.
>        */
>       alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
>  };
> @@ -1046,12 +1048,17 @@ rte_mempool_free(struct rte_mempool *mp);
>   * @param cache_size
>   *   If cache_size is non-zero, the rte_mempool library will try to
>   *   limit the accesses to the common lockless pool, by maintaining a
> - *   per-lcore object cache. This argument must be lower or equal to
> - *   RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5.
> + *   per-lcore object cache. This argument must be an even number,
> + *   lower or equal to RTE_MEMPOOL_CACHE_MAX_SIZE and n.
>   *   The access to the per-lcore table is of course
>   *   faster than the multi-producer/consumer pool. The cache can be
>   *   disabled if the cache_size argument is set to 0; it can be useful to
>   *   avoid losing objects in cache.
> + *   Note:
> + *   Mempool put/get requests of more than cache_size / 2 objects may be
> + *   partially or fully served directly by the multi-producer/consumer
> + *   pool, to avoid the overhead of copying the objects twice (instead of
> + *   once) when using the cache as a bounce buffer.
>   * @param private_data_size
>   *   The size of the private data appended after the mempool
>   *   structure. This is useful for storing some private data after the
> @@ -1390,24 +1397,32 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, 
> void * const *obj_table,
>       RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
>       RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
>  
> -     __rte_assume(cache->flushthresh <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> -     __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> -     __rte_assume(cache->len <= cache->flushthresh);
> -     if (likely(cache->len + n <= cache->flushthresh)) {
> +     __rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> +     __rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +     __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> +     __rte_assume(cache->len <= cache->size);
> +     if (likely(cache->len + n <= cache->size)) {
>               /* Sufficient room in the cache for the objects. */
>               cache_objs = &cache->objs[cache->len];
>               cache->len += n;
> -     } else if (n <= cache->flushthresh) {
> +     } else if (n <= cache->size / 2) {
>               /*
> -              * The cache is big enough for the objects, but - as detected by
> -              * the comparison above - has insufficient room for them.
> -              * Flush the cache to make room for the objects.
> +              * The number of objects is within the cache bounce buffer 
> limit,
> +              * but - as detected by the comparison above - the cache has
> +              * insufficient room for them.
> +              * Flush the cache to the backend to make room for the objects;
> +              * flush (size / 2) objects from the bottom of the cache, where
> +              * objects are less hot, and move down the remaining objects, 
> which
> +              * are more hot, from the upper half of the cache.
>                */
> -             cache_objs = &cache->objs[0];
> -             rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> -             cache->len = n;
> +             __rte_assume(cache->len > cache->size / 2);
> +             rte_mempool_ops_enqueue_bulk(mp, &cache->objs[0], cache->size / 
> 2);
> +             rte_memcpy(&cache->objs[0], &cache->objs[cache->size / 2],
> +                             sizeof(void *) * (cache->len - cache->size / 
> 2));
> +             cache_objs = &cache->objs[cache->len - cache->size / 2];
> +             cache->len = cache->len - cache->size / 2 + n;

The flushing of only half the cache I'm not so certain about. I agree that
we want to not flush to empty, but I also think that we want to do more
than a half-flush, especially since we do an enqueue to the cache
immediately afterwards. Consider the case where we have a cache size of
128, and we do an enqueue of 32, with the cache currently full. In that
case we only flush 64, reducing the cache to 64, but then immediately
bringing it back up to 96. For cases where we have pipelines with all alloc
on one core and all free on another, this half-flush would be inefficient.

Instead, I would look to have a lower target threshold post-flush, and I
would suggest 25% - taking into account the newly freed buffers. For
example:

        /* if n > our target of 1/4 full, flush everything,
         * else flush so that we end up with 1/4 full after n added.
         */
        flush_count = n > cache->size/4 ? cache->len :
                        (cache->len + n) - cache->size/4;


>       } else {
> -             /* The request itself is too big for the cache. */
> +             /* The request itself is too big. */
>               goto driver_enqueue_stats_incremented;

I think original comment is better. The request itself is not too big for
the whole mempool, just for the cache.

>       }
>  
> @@ -1524,7 +1539,7 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void 
> **obj_table,
>       /* The cache is a stack, so copy will be in reverse order. */
>       cache_objs = &cache->objs[cache->len];
>  
> -     __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> +     __rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE);
>       if (likely(n <= cache->len)) {
>               /* The entire request can be satisfied from the cache. */
>               RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> @@ -1548,13 +1563,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, 
> void **obj_table,
>       for (index = 0; index < len; index++)
>               *obj_table++ = *--cache_objs;
>  
> -     /* Dequeue below would overflow mem allocated for cache? */
> -     if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> +     /* Dequeue below would exceed the cache bounce buffer limit? */
> +     __rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +     if (unlikely(remaining > cache->size / 2))
>               goto driver_dequeue;
>  
> -     /* Fill the cache from the backend; fetch size + remaining objects. */
> -     ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> -                     cache->size + remaining);
> +     /* Fill the cache from the backend; fetch (size / 2) objects. */
> +     ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs, cache->size / 2);

Again, the cache->size / 2 doesn't seem right here. We at most half-fill
the cache and then take some objects from that, meaning that have just done
a re-fill of cache but end the function with it less than half full. Since
we take from this value, I'd suggest just filling the cache completely.

>       if (unlikely(ret < 0)) {
>               /*
>                * We are buffer constrained, and not able to fetch all that.
> @@ -1568,10 +1583,11 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, 
> void **obj_table,
>       RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
>       RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
>  
> -     __rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> -     __rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> -     cache_objs = &cache->objs[cache->size + remaining];
> -     cache->len = cache->size;
> +     __rte_assume(cache->size / 2 <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +     __rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE / 2);
> +     __rte_assume(remaining <= cache->size / 2);
> +     cache_objs = &cache->objs[cache->size / 2];
> +     cache->len = cache->size / 2 - remaining;
>       for (index = 0; index < remaining; index++)
>               *obj_table++ = *--cache_objs;
>  
> -- 
> 2.43.0
> 

Reply via email to