> From: Morten Brørup [mailto:m...@smartsharesystems.com] > Sent: Friday, 28 October 2022 08.42 > > When built with debug enabled (RTE_LIBRTE_MEMPOOL_DEBUG defined), the > performance of mempools with caches is improved as follows. > > Accessing objects in the mempool is likely to increment either the > put_bulk and put_objs or the get_success_bulk and get_success_objs > debug statistics counters. > > By adding an alternative set of these counters to the mempool cache > structure, accessing the dedicated debug statistics structure is > avoided in > the likely cases where these counters are incremented. > > The trick here is that the cache line holding the mempool cache > structure > is accessed anyway, in order to update the "len" field. Updating some > debug statistics counters in the same cache line has lower performance > cost than accessing the debug statistics counters in the dedicated > debug > statistics structure, i.e. in another cache line. > > Running mempool_perf_autotest on a VMware virtual server shows an avg. > increase of 6.4 % in rate_persec for the tests with cache. (Only when > built with debug enabled, obviously!) > > For the tests without cache, the avg. increase in rate_persec is 0.8 %. > I > assume this is noise from the test environment. > > v4: > * Fix spelling and repeated word in commit message, caught by > checkpatch. > v3: > * Try to fix git reference by making part of a series. > * Add --in-reply-to v1 when sending email. > v2: > * Fix spelling and repeated word in commit message, caught by > checkpatch. 
> > Signed-off-by: Morten Brørup <m...@smartsharesystems.com> > --- > lib/mempool/rte_mempool.c | 7 +++++ > lib/mempool/rte_mempool.h | 55 +++++++++++++++++++++++++++++++-------- > 2 files changed, 51 insertions(+), 11 deletions(-) > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c > index 21c94a2b9f..7b8c00a022 100644 > --- a/lib/mempool/rte_mempool.c > +++ b/lib/mempool/rte_mempool.c > @@ -1285,6 +1285,13 @@ rte_mempool_dump(FILE *f, struct rte_mempool > *mp) > sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs; > sum.get_success_blks += mp- > >stats[lcore_id].get_success_blks; > sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks; > + /* Add the fast access statistics, if local caches exist */ > + if (mp->cache_size != 0) { > + sum.put_bulk += mp->local_cache[lcore_id].put_bulk; > + sum.put_objs += mp->local_cache[lcore_id].put_objs; > + sum.get_success_bulk += mp- > >local_cache[lcore_id].get_success_bulk; > + sum.get_success_objs += mp- > >local_cache[lcore_id].get_success_objs; > + } > } > fprintf(f, " stats:\n"); > fprintf(f, " put_bulk=%"PRIu64"\n", sum.put_bulk); > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h > index 3725a72951..d84087bc92 100644 > --- a/lib/mempool/rte_mempool.h > +++ b/lib/mempool/rte_mempool.h > @@ -86,6 +86,14 @@ struct rte_mempool_cache { > uint32_t size; /**< Size of the cache */ > uint32_t flushthresh; /**< Threshold before we flush excess > elements */ > uint32_t len; /**< Current cache count */ > +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG > + uint32_t unused; > + /* Fast access statistics, only for likely events */ > + uint64_t put_bulk; /**< Number of puts. */ > + uint64_t put_objs; /**< Number of objects > successfully put. */ > + uint64_t get_success_bulk; /**< Successful allocation number. > */ > + uint64_t get_success_objs; /**< Objects successfully > allocated. 
*/ > +#endif > /** > * Cache objects > * > @@ -1327,13 +1335,19 @@ rte_mempool_do_generic_put(struct rte_mempool > *mp, void * const *obj_table, > { > void **cache_objs; > > + /* No cache provided */ > + if (unlikely(cache == NULL)) > + goto driver_enqueue; > + > +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG > /* increment stat now, adding in mempool always success */ > - RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1); > - RTE_MEMPOOL_STAT_ADD(mp, put_objs, n); > + cache->put_bulk += 1; > + cache->put_objs += n; > +#endif > > - /* No cache provided or the request itself is too big for the > cache */ > - if (unlikely(cache == NULL || n > cache->flushthresh)) > - goto driver_enqueue; > + /* The request is too big for the cache */ > + if (unlikely(n > cache->flushthresh)) > + goto driver_enqueue_stats_incremented; > > /* > * The cache follows the following algorithm: > @@ -1358,6 +1372,12 @@ rte_mempool_do_generic_put(struct rte_mempool > *mp, void * const *obj_table, > > driver_enqueue: > > + /* increment stat now, adding in mempool always success */ > + RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1); > + RTE_MEMPOOL_STAT_ADD(mp, put_objs, n); > + > +driver_enqueue_stats_incremented: > + > /* push objects to the backend */ > rte_mempool_ops_enqueue_bulk(mp, obj_table, n); > } > @@ -1464,8 +1484,10 @@ rte_mempool_do_generic_get(struct rte_mempool > *mp, void **obj_table, > if (remaining == 0) { > /* The entire request is satisfied from the cache. 
*/ > > - RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1); > - RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n); > +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG > + cache->get_success_bulk += 1; > + cache->get_success_objs += n; > +#endif > > return 0; > } > @@ -1494,8 +1516,10 @@ rte_mempool_do_generic_get(struct rte_mempool > *mp, void **obj_table, > > cache->len = cache->size; > > - RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1); > - RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n); > +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG > + cache->get_success_bulk += 1; > + cache->get_success_objs += n; > +#endif > > return 0; > > @@ -1517,8 +1541,17 @@ rte_mempool_do_generic_get(struct rte_mempool > *mp, void **obj_table, > RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1); > RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n); > } else { > +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG > + if (likely(cache != NULL)) { > + cache->get_success_bulk += 1; > + cache->get_success_objs += n; > + } else { > +#endif > + RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1); > + RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n); > +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG > + } > +#endif > } > > return ret; > -- > 2.17.1
I am retracting this second part of the patch series, and reopening the original patch instead. This second part is probably not going to make it to 22.11 anyway. Instead, I am going to provide another patch series (after 22.11) to split the current RTE_LIBRTE_MEMPOOL_DEBUG define in two: RTE_LIBRTE_MEMPOOL_STATS for statistics, and RTE_LIBRTE_MEMPOOL_DEBUG for debugging. And then this patch can be added under the RTE_LIBRTE_MEMPOOL_STATS define. -Morten