PING (again) for review.

Many applications use bursts of more than 32 packets, and some
applications buffer more than 512 packets. This patch updates the
mempool perf test accordingly.

-Morten

> From: Morten Brørup [mailto:m...@smartsharesystems.com]
> Sent: Thursday, 4 April 2024 11.27
>
> PING for review. This patch is relatively trivial.
>
> > From: Morten Brørup [mailto:m...@smartsharesystems.com]
> > Sent: Saturday, 2 March 2024 21.04
> >
> > Bursts of up to 64, 128 and 256 packets are not uncommon, so increase
> > the maximum tested get and put burst sizes from 32 to 256.
> > For convenience, also test get and put burst sizes of
> > RTE_MEMPOOL_CACHE_MAX_SIZE.
> >
> > Some applications keep more than 512 objects, so increase the maximum
> > number of kept objects from 512 to 32768, still in jumps of factor
> > four. This exceeds the typical mempool cache size of 512 objects, so
> > the test also exercises the mempool driver.
> >
> > Increased the precision of the rate_persec calculation by timing the
> > actual duration of the test, instead of assuming it took exactly 5
> > seconds.
> >
> > Added a cache guard to the per-lcore stats structure.
> >
> > Signed-off-by: Morten Brørup <m...@smartsharesystems.com>
> > Acked-by: Chengwen Feng <fengcheng...@huawei.com>
> > ---
> >
> > v7:
> > * Increase max burst size to 256. (Inspired by Honnappa)
> > v6:
> > * Do not test with more lcores than available. (Thomas)
> > v5:
> > * Increased N, to reduce measurement overhead with large numbers of
> >   kept objects.
> > * Increased precision of the rate_persec calculation.
> > * Added missing cache guard to the per-lcore stats structure.
> > v4:
> > * v3 failed to apply; I had messed up something with git.
> > * Added ACK from Chengwen Feng.
> > v3:
> > * Increased the max number of kept objects to 32768.
> > * Added get and put burst sizes of RTE_MEMPOOL_CACHE_MAX_SIZE objects.
> > * Print an error if unable to allocate a mempool.
> > * Initialize use_external_cache with each test.
> >   A previous version of this patch had a bug, where all test runs
> >   following the first would use the external cache. (Chengwen Feng)
> > v2: Addressed feedback by Chengwen Feng
> > * Added get and put burst sizes of 64 objects, which is probably also
> >   not an uncommon packet burst size.
> > * Fixed the list of numbers of kept objects, so the list remains in
> >   jumps of factor four.
> > * Added three derivative test cases, for faster testing.
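[Editorial note on the cache guard item above, for readers unfamiliar
with the macro: each element of the per-lcore stats array is written by
exactly one lcore. __rte_cache_aligned alone places each element on its
own cache line(s), but next-line hardware prefetchers can still pull a
neighbouring element's line into a writer's cache and cause false
sharing; RTE_CACHE_GUARD appends empty guard line(s) to insulate
against that. A minimal sketch of the pattern, independent of the patch
below (the struct and field names here are illustrative):

#include <stdint.h>
#include <rte_common.h>	/* RTE_CACHE_GUARD, __rte_cache_aligned */
#include <rte_lcore.h>	/* RTE_MAX_LCORE */

/* One statistics element per lcore; each lcore writes only its own. */
struct lcore_stats {
	uint64_t enq_count;		/* completed enqueues/dequeues */
	uint64_t duration_cycles;	/* measured duration, in TSC cycles */
	RTE_CACHE_GUARD;		/* empty guard line(s) against false
					 * sharing via adjacent-line
					 * hardware prefetching */
} __rte_cache_aligned;

static struct lcore_stats stats[RTE_MAX_LCORE];

The patch applies this same pattern to struct mempool_test_stats in the
diff below.]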
> > ---
> >  app/test/test_mempool_perf.c | 144 +++++++++++++++++++++++------------
> >  1 file changed, 96 insertions(+), 48 deletions(-)
> >
> > diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
> > index 96de347f04..bb40d1d911 100644
> > --- a/app/test/test_mempool_perf.c
> > +++ b/app/test/test_mempool_perf.c
> > @@ -54,22 +54,25 @@
> >   *
> >   * - Bulk size (*n_get_bulk*, *n_put_bulk*)
> >   *
> > - *   - Bulk get from 1 to 32
> > - *   - Bulk put from 1 to 32
> > - *   - Bulk get and put from 1 to 32, compile time constant
> > + *   - Bulk get from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
> > + *   - Bulk put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
> > + *   - Bulk get and put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE, compile time constant
> >   *
> >   * - Number of kept objects (*n_keep*)
> >   *
> >   *   - 32
> >   *   - 128
> >   *   - 512
> > + *   - 2048
> > + *   - 8192
> > + *   - 32768
> >   */
> >
> > -#define N 65536
> >  #define TIME_S 5
> >  #define MEMPOOL_ELT_SIZE 2048
> > -#define MAX_KEEP 512
> > -#define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
> > +#define MAX_KEEP 32768
> > +#define N (128 * MAX_KEEP)
> > +#define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE*2))-1)
> >
> >  /* Number of pointers fitting into one cache line. */
> >  #define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))
> > @@ -100,9 +103,11 @@ static unsigned n_keep;
> >  /* true if we want to test with constant n_get_bulk and n_put_bulk */
> >  static int use_constant_values;
> >
> > -/* number of enqueues / dequeues */
> > +/* number of enqueues / dequeues, and time used */
> >  struct mempool_test_stats {
> >  	uint64_t enq_count;
> > +	uint64_t duration_cycles;
> > +	RTE_CACHE_GUARD;
> >  } __rte_cache_aligned;
> >
> >  static struct mempool_test_stats stats[RTE_MAX_LCORE];
> > @@ -185,6 +190,7 @@ per_lcore_mempool_test(void *arg)
> >  		GOTO_ERR(ret, out);
> >
> >  	stats[lcore_id].enq_count = 0;
> > +	stats[lcore_id].duration_cycles = 0;
> >
> >  	/* wait synchro for workers */
> >  	if (lcore_id != rte_get_main_lcore())
> > @@ -204,6 +210,15 @@
> >  					CACHE_LINE_BURST, CACHE_LINE_BURST);
> >  		else if (n_get_bulk == 32)
> >  			ret = test_loop(mp, cache, n_keep, 32, 32);
> > +		else if (n_get_bulk == 64)
> > +			ret = test_loop(mp, cache, n_keep, 64, 64);
> > +		else if (n_get_bulk == 128)
> > +			ret = test_loop(mp, cache, n_keep, 128, 128);
> > +		else if (n_get_bulk == 256)
> > +			ret = test_loop(mp, cache, n_keep, 256, 256);
> > +		else if (n_get_bulk == RTE_MEMPOOL_CACHE_MAX_SIZE)
> > +			ret = test_loop(mp, cache, n_keep,
> > +					RTE_MEMPOOL_CACHE_MAX_SIZE, RTE_MEMPOOL_CACHE_MAX_SIZE);
> >  		else
> >  			ret = -1;
> >
> > @@ -215,6 +230,8 @@
> >  		stats[lcore_id].enq_count += N;
> >  	}
> >
> > +	stats[lcore_id].duration_cycles = time_diff;
> > +
> >  out:
> >  	if (use_external_cache) {
> >  		rte_mempool_cache_flush(cache, mp);
> > @@ -232,6 +249,7 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >  	uint64_t rate;
> >  	int ret;
> >  	unsigned cores_save = cores;
> > +	double hz = rte_get_timer_hz();
> >
> >  	__atomic_store_n(&synchro, 0, __ATOMIC_RELAXED);
> >
> > @@ -278,7 +296,9 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >
> >  	rate = 0;
> >  	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > -		rate += (stats[lcore_id].enq_count / TIME_S);
> > +		if (stats[lcore_id].duration_cycles != 0)
> > +			rate += (double)stats[lcore_id].enq_count * hz /
> > +					(double)stats[lcore_id].duration_cycles;
> >
> >  	printf("rate_persec=%" PRIu64 "\n", rate);
> >
> > @@ -287,11 +307,13 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >
> >  /* for a given number of core, launch all test cases */
> >  static int
> > -do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
> > +do_one_mempool_test(struct rte_mempool *mp, unsigned int cores, int external_cache)
> >  {
> > -	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> > -	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> > -	unsigned int keep_tab[] = { 32, 128, 512, 0 };
> > +	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
> > +			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
> > +	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
> > +			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
> > +	unsigned int keep_tab[] = { 32, 128, 512, 2048, 8192, 32768, 0 };
> >  	unsigned *get_bulk_ptr;
> >  	unsigned *put_bulk_ptr;
> >  	unsigned *keep_ptr;
> > @@ -301,6 +323,10 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
> >  		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
> >  			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
> >
> > +				if (*keep_ptr < *get_bulk_ptr || *keep_ptr < *put_bulk_ptr)
> > +					continue;
> > +
> > +				use_external_cache = external_cache;
> >  				use_constant_values = 0;
> >  				n_get_bulk = *get_bulk_ptr;
> >  				n_put_bulk = *put_bulk_ptr;
> > @@ -323,7 +349,7 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
> >  }
> >
> >  static int
> > -test_mempool_perf(void)
> > +do_all_mempool_perf_tests(unsigned int cores)
> >  {
> >  	struct rte_mempool *mp_cache = NULL;
> >  	struct rte_mempool *mp_nocache = NULL;
> > @@ -337,8 +363,10 @@ test_mempool_perf(void)
> >  				NULL, NULL,
> >  				my_obj_init, NULL,
> >  				SOCKET_ID_ANY, 0);
> > -	if (mp_nocache == NULL)
> > +	if (mp_nocache == NULL) {
> > +		printf("cannot allocate mempool (without cache)\n");
> >  		goto err;
> > +	}
> >
> >  	/* create a mempool (with cache) */
> >  	mp_cache = rte_mempool_create("perf_test_cache", MEMPOOL_SIZE,
> > @@ -347,8 +375,10 @@ test_mempool_perf(void)
> >  				NULL, NULL,
> >  				my_obj_init, NULL,
> >  				SOCKET_ID_ANY, 0);
> > -	if (mp_cache == NULL)
> > +	if (mp_cache == NULL) {
> > +		printf("cannot allocate mempool (with cache)\n");
> >  		goto err;
> > +	}
> >
> >  	default_pool_ops = rte_mbuf_best_mempool_ops();
> >  	/* Create a mempool based on Default handler */
> > @@ -376,65 +406,83 @@ test_mempool_perf(void)
> >
> >  	rte_mempool_obj_iter(default_pool, my_obj_init, NULL);
> >
> > -	/* performance test with 1, 2 and max cores */
> >  	printf("start performance test (without cache)\n");
> > -
> > -	if (do_one_mempool_test(mp_nocache, 1) < 0)
> > -		goto err;
> > -
> > -	if (do_one_mempool_test(mp_nocache, 2) < 0)
> > +	if (do_one_mempool_test(mp_nocache, cores, 0) < 0)
> >  		goto err;
> >
> > -	if (do_one_mempool_test(mp_nocache, rte_lcore_count()) < 0)
> > -		goto err;
> > -
> > -	/* performance test with 1, 2 and max cores */
> >  	printf("start performance test for %s (without cache)\n",
> >  			default_pool_ops);
> > -
> > -	if (do_one_mempool_test(default_pool, 1) < 0)
> > +	if (do_one_mempool_test(default_pool, cores, 0) < 0)
> >  		goto err;
> >
> > -	if (do_one_mempool_test(default_pool, 2) < 0)
> > +	printf("start performance test (with cache)\n");
> > +	if (do_one_mempool_test(mp_cache, cores, 0) < 0)
> >  		goto err;
> >
> > -	if (do_one_mempool_test(default_pool, rte_lcore_count()) < 0)
> > +	printf("start performance test (with user-owned cache)\n");
> > +	if (do_one_mempool_test(mp_nocache, cores, 1) < 0)
> >  		goto err;
> >
> > -	/* performance test with 1, 2 and max cores */
> > -	printf("start performance test (with cache)\n");
> > +	rte_mempool_list_dump(stdout);
> >
> > -	if (do_one_mempool_test(mp_cache, 1) < 0)
> > -		goto err;
> > +	ret = 0;
> >
> > -	if (do_one_mempool_test(mp_cache, 2) < 0)
> > -		goto err;
> > +err:
> > +	rte_mempool_free(mp_cache);
> > +	rte_mempool_free(mp_nocache);
> > +	rte_mempool_free(default_pool);
> > +	return ret;
> > +}
> >
> > -	if (do_one_mempool_test(mp_cache, rte_lcore_count()) < 0)
> > -		goto err;
> > +static int
> > +test_mempool_perf_1core(void)
> > +{
> > +	return do_all_mempool_perf_tests(1);
> > +}
> >
> > -	/* performance test with 1, 2 and max cores */
> > -	printf("start performance test (with user-owned cache)\n");
> > -	use_external_cache = 1;
> > +static int
> > +test_mempool_perf_2cores(void)
> > +{
> > +	if (rte_lcore_count() < 2) {
> > +		printf("not enough lcores\n");
> > +		return -1;
> > +	}
> > +	return do_all_mempool_perf_tests(2);
> > +}
> >
> > -	if (do_one_mempool_test(mp_nocache, 1) < 0)
> > -		goto err;
> > +static int
> > +test_mempool_perf_allcores(void)
> > +{
> > +	return do_all_mempool_perf_tests(rte_lcore_count());
> > +}
> > +
> > +static int
> > +test_mempool_perf(void)
> > +{
> > +	int ret = -1;
> >
> > -	if (do_one_mempool_test(mp_nocache, 2) < 0)
> > +	/* performance test with 1, 2 and max cores */
> > +	if (do_all_mempool_perf_tests(1) < 0)
> >  		goto err;
> > +	if (rte_lcore_count() == 1)
> > +		goto done;
> >
> > -	if (do_one_mempool_test(mp_nocache, rte_lcore_count()) < 0)
> > +	if (do_all_mempool_perf_tests(2) < 0)
> >  		goto err;
> > +	if (rte_lcore_count() == 2)
> > +		goto done;
> >
> > -	rte_mempool_list_dump(stdout);
> > +	if (do_all_mempool_perf_tests(rte_lcore_count()) < 0)
> > +		goto err;
> >
> > +done:
> >  	ret = 0;
> >
> >  err:
> > -	rte_mempool_free(mp_cache);
> > -	rte_mempool_free(mp_nocache);
> > -	rte_mempool_free(default_pool);
> >  	return ret;
> >  }
> >
> >  REGISTER_PERF_TEST(mempool_perf_autotest, test_mempool_perf);
> > +REGISTER_PERF_TEST(mempool_perf_autotest_1core, test_mempool_perf_1core);
> > +REGISTER_PERF_TEST(mempool_perf_autotest_2cores, test_mempool_perf_2cores);
> > +REGISTER_PERF_TEST(mempool_perf_autotest_allcores, test_mempool_perf_allcores);
> > --
> > 2.17.1
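[Editorial note on the rate_persec change: the reworked calculation
converts a measured TSC cycle count into operations per second instead
of dividing by the nominal TIME_S. As a stand-alone sketch of that
arithmetic (the helper name is illustrative, not taken from the patch;
rte_rdtsc() and rte_get_timer_hz() are the standard rte_cycles.h APIs):

#include <stdint.h>
#include <rte_cycles.h>	/* rte_rdtsc(), rte_get_timer_hz() */

/* ops/s = ops * (cycles per second) / (cycles elapsed).
 * The caller must ensure duration_cycles != 0, as the patch does
 * by skipping lcores whose duration_cycles is still zero. */
static uint64_t
ops_per_sec(uint64_t op_count, uint64_t duration_cycles)
{
	return (double)op_count * rte_get_timer_hz() /
			(double)duration_cycles;
}

For example, 1e8 operations measured over 1.5e9 cycles on a 3 GHz TSC
give 1e8 * 3e9 / 1.5e9 = 2e8 ops/s; the old enq_count / TIME_S
expression was only accurate if the loop ran for exactly 5 seconds.
With the three new REGISTER_PERF_TEST entries, the 1-core, 2-core and
all-core variants should also be runnable individually by name from the
dpdk-test harness, like any other registered perf test.]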