> The current mempool per-core cache implementation stores pointers to mbufs.
> On 64-bit architectures, each pointer consumes 8 bytes.
> This patch replaces it with an index-based implementation,
> in which each buffer is addressed as (pool base address + index).
> This reduces the amount of memory/cache required for the per-core cache.
>
> L3Fwd performance testing reveals minor improvements in cache
> performance (L1 and L2 misses reduced by 0.60%),
> with no change in throughput.
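For context, the scheme described above amounts to storing a 32-bit offset from a per-pool base address in the per-core cache, instead of a full 8-byte pointer. A minimal sketch of that conversion, with illustrative names rather than the actual patch code:

#include <stdint.h>

/* Object pointer -> 32-bit cache entry: valid only while the object
 * lies within 4GB above the pool base address.
 */
static inline uint32_t
obj_to_cache_entry(const void *pool_base, const void *obj)
{
        return (uint32_t)((uintptr_t)obj - (uintptr_t)pool_base);
}

/* 32-bit cache entry -> object pointer. */
static inline void *
cache_entry_to_obj(void *pool_base, uint32_t entry)
{
        return (void *)((uintptr_t)pool_base + entry);
}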
I'm really sceptical about this patch and the whole idea in general:
- From what I read above, there is no real performance improvement observed.
  (In fact, on my IA boxes mempool_perf_autotest reports a ~20% slowdown;
  see below for more details.)
- The space utilization difference looks negligible too.
- The change introduces a new build-time config option with a major limitation:
  all memzones in a pool have to be within the same 4GB boundary (see the
  sketch further below). To address that properly, extra changes would be
  required in the init/populate part of the code.
All of that will complicate the mempool code, making it more error-prone and
harder to maintain.
But as there is no real gain in return, there is no point in adding such extra
complexity at all.
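To spell out that limitation: a 32-bit cache entry can only encode an offset of up to 4GB from pool_base_value, so every populated object must lie within that window, and the init/populate path would have to verify this explicitly. A hypothetical check (mine, not part of the patch) could look like:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper: an object is representable as a 32-bit offset
 * only if it lies within 4GB above the pool base address.
 */
static inline bool
obj_within_index_range(const void *pool_base, const void *obj)
{
        uintptr_t base = (uintptr_t)pool_base;
        uintptr_t addr = (uintptr_t)obj;

        return addr >= base && (addr - base) <= UINT32_MAX;
}

And the space saving that a 4B index buys over an 8B pointer is 4 bytes per cache entry, i.e. roughly 2KB per lcore for a 512-entry cache, which is why the space argument looks negligible to me.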
Konstantin
CSX 2.1 GHz
===========
echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci
params : rate_persec (normal/index-based/diff %)
(with cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 :
740989337.00/504116019.00/-31.97
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 :
756495155.00/615002931.00/-18.70
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 :
1483499110.00/1007248997.00/-32.10
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 :
1512439807.00/1229927218.00/-18.68
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 :
5933668757.00/4029048421.00/-32.10
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 :
6049234942.00/4921111344.00/-18.65
(with user-owned cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 :
630600499.00/504312627.00/-20.03
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 :
756259225.00/615042252.00/-18.67
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 :
1262052966.00/1007039283.00/-20.21
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 :
1517853081.00/1230818508.00/-18.91
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 :
5054529533.00/4028052273.00/-20.31
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 :
6059340592.00/4912893129.00/-18.92
>
> Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
> Signed-off-by: Dharmik Thakkar <dharmik.thak...@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com>
> ---
> lib/mempool/rte_mempool.h | 114 +++++++++++++++++++++++++-
> lib/mempool/rte_mempool_ops_default.c | 7 ++
> 2 files changed, 119 insertions(+), 2 deletions(-)
>
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1e7a3c15273c..4fabd3b1920b 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -50,6 +50,10 @@
> #include <rte_memcpy.h>
> #include <rte_common.h>
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#include <rte_vect.h>
> +#endif
> +
> #include "rte_mempool_trace_fp.h"
>
> #ifdef __cplusplus
> @@ -239,6 +243,9 @@ struct rte_mempool {
> int32_t ops_index;
>
> struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + void *pool_base_value; /**< Base value to calculate indices */
> +#endif
>
> uint32_t populated_size; /**< Number of populated objects. */
> struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
> if (cache == NULL || cache->len == 0)
> return;
> rte_mempool_trace_cache_flush(cache, mp);
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + unsigned int i;
> + unsigned int cache_len = cache->len;
> + void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> + void *base_value = mp->pool_base_value;
> + uint32_t *cache_objs = (uint32_t *) cache->objs;
> + for (i = 0; i < cache_len; i++)
> + obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> + rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> +#else
> rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +#endif
> cache->len = 0;
> }
>
> @@ -1334,8 +1353,13 @@ static __rte_always_inline void
> rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> unsigned int n, struct rte_mempool_cache *cache)
> {
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + uint32_t *cache_objs;
> + void *base_value;
> + uint32_t i;
> +#else
> void **cache_objs;
> -
> +#endif
> /* increment stat now, adding in mempool always success */
> RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> @@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
> goto ring_enqueue;
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + cache_objs = (uint32_t *) cache->objs;
> + cache_objs = &cache_objs[cache->len];
> + base_value = mp->pool_base_value;
> +#else
> cache_objs = &cache->objs[cache->len];
> +#endif
>
> /*
> * The cache follows the following algorithm
> @@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> */
>
> /* Add elements back into the cache */
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#if defined __ARM_NEON
> + uint64x2_t v_obj_table;
> + uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> + uint32x2_t v_cache_objs;
> +
> + for (i = 0; i < (n & ~0x1); i += 2) {
> + v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> + v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> + vst1_u32(cache_objs + i, v_cache_objs);
> + }
> + if (n & 0x1) {
> + cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> + }
> +#else
> + for (i = 0; i < n; i++) {
> + cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> + }
> +#endif
> +#else
> rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +#endif
>
> cache->len += n;
>
> if (cache->len >= cache->flushthresh) {
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
> + cache->len - cache->size);
> +#else
> rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> cache->len - cache->size);
> +#endif
> cache->len = cache->size;
> }
>
> @@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> {
> int ret;
> uint32_t index, len;
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + uint32_t i;
> + uint32_t *cache_objs;
> +#else
> void **cache_objs;
> -
> +#endif
> /* No cache provided or cannot be satisfied from cache */
> if (unlikely(cache == NULL || n >= cache->size))
> goto ring_dequeue;
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + void *base_value = mp->pool_base_value;
> + cache_objs = (uint32_t *) cache->objs;
> +#else
> cache_objs = cache->objs;
> +#endif
>
> /* Can this be satisfied from the cache? */
> if (cache->len < n) {
> @@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> uint32_t req = n + (cache->size - cache->len);
>
> /* How many do we require i.e. number to fill the cache + the request */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> + ret = rte_mempool_ops_dequeue_bulk(mp,
> + temp_objs, req);
> +#else
> ret = rte_mempool_ops_dequeue_bulk(mp,
> &cache->objs[cache->len], req);
> +#endif
> if (unlikely(ret < 0)) {
> /*
> * In the off chance that we are buffer constrained,
> @@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> goto ring_dequeue;
> }
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + len = cache->len;
> + for (i = 0; i < req; ++i, ++len) {
> + cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> + base_value);
> + }
> +#endif
> cache->len += req;
> }
>
> /* Now fill in the response ... */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#if defined __ARM_NEON
> + uint64x2_t v_obj_table;
> + uint64x2_t v_cache_objs;
> + uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
> + for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
> + len -= 4, obj_table += 4) {
> + v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> + v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> + vst1q_u64((uint64_t *)obj_table, v_obj_table);
> + v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
> + v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> + vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
> + }
> + switch (n & 0x3) {
> + case 3:
> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> + /* fallthrough */
> + case 2:
> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> + /* fallthrough */
> + case 1:
> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> + }
> +#else
> + for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> + *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
> +#else
> for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> *obj_table = cache_objs[len];
> +#endif
>
> cache->len -= n;
>
> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
> index 22fccf9d7619..3543cad9d4ce 100644
> --- a/lib/mempool/rte_mempool_ops_default.c
> +++ b/lib/mempool/rte_mempool_ops_default.c
> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
> obj = va + off;
> obj_cb(mp, obj_cb_arg, obj,
> (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + /* Store pool base value to calculate indices for index-based
> + * lcore cache implementation
> + */
> + if (i == 0)
> + mp->pool_base_value = obj;
> +#endif
> rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> off += mp->elt_size + mp->trailer_size;
> }
> --
> 2.25.1