PING (again) for review.

Many applications use bursts of more than 32 packets,
and some applications buffer more than 512 packets.

This patch updates the mempool perf test accordingly.

-Morten

> From: Morten Brørup [mailto:m...@smartsharesystems.com]
> Sent: Thursday, 4 April 2024 11.27
> 
> PING for review. This patch is relatively trivial.
> 
> > From: Morten Brørup [mailto:m...@smartsharesystems.com]
> > Sent: Saturday, 2 March 2024 21.04
> >
> > Bursts of up to 64, 128 and 256 packets are not uncommon, so increase the
> > maximum tested get and put burst sizes from 32 to 256.
> > For convenience, also test get and put burst sizes of
> > RTE_MEMPOOL_CACHE_MAX_SIZE.
> >
> > Some applications keep more than 512 objects, so increase the maximum
> > number of kept objects from 512 to 32768, still in jumps of a factor of four.
> > This exceeds the typical mempool cache size of 512 objects, so the test
> > also exercises the mempool driver.
> >
> > Increased the precision of rate_persec calculation by timing the actual
> > duration of the test, instead of assuming it took exactly 5 seconds.
> >
> > Added cache guard to per-lcore stats structure.
> >
> > Signed-off-by: Morten Brørup <m...@smartsharesystems.com>
> > Acked-by: Chengwen Feng <fengcheng...@huawei.com>
> > ---
> >
> > v7:
> > * Increase max burst size to 256. (Inspired by Honnappa)
> > v6:
> > * Do not test with more lcores than available. (Thomas)
> > v5:
> > * Increased N to reduce measurement overhead with large numbers of kept
> >   objects.
> > * Increased precision of rate_persec calculation.
> > * Added missing cache guard to per-lcore stats structure.
> > v4:
> > * v3 failed to apply; I had messed up something with git.
> > * Added ACK from Chengwen Feng.
> > v3:
> > * Increased max number of kept objects to 32768.
> > * Added get and put burst sizes of RTE_MEMPOOL_CACHE_MAX_SIZE objects.
> > * Print error if unable to allocate mempool.
> > * Initialize use_external_cache with each test.
> >   A previous version of this patch had a bug, where all test runs
> >   following the first would use external cache. (Chengwen Feng)
> > v2: Addressed feedback by Chengwen Feng
> > * Added get and put burst sizes of 64 objects, which is probably also not
> >   an uncommon packet burst size.
> > * Fixed list of number of kept objects so list remains in jumps of factor
> >   four.
> > * Added three derivative test cases, for faster testing.
> > ---
> >  app/test/test_mempool_perf.c | 144 +++++++++++++++++++++++------------
> >  1 file changed, 96 insertions(+), 48 deletions(-)
> >
> > diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
> > index 96de347f04..bb40d1d911 100644
> > --- a/app/test/test_mempool_perf.c
> > +++ b/app/test/test_mempool_perf.c
> > @@ -54,22 +54,25 @@
> >   *
> >   *    - Bulk size (*n_get_bulk*, *n_put_bulk*)
> >   *
> > - *      - Bulk get from 1 to 32
> > - *      - Bulk put from 1 to 32
> > - *      - Bulk get and put from 1 to 32, compile time constant
> > + *      - Bulk get from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
> > + *      - Bulk put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
> > + *      - Bulk get and put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE,
> > compile time constant
> >   *
> >   *    - Number of kept objects (*n_keep*)
> >   *
> >   *      - 32
> >   *      - 128
> >   *      - 512
> > + *      - 2048
> > + *      - 8192
> > + *      - 32768
> >   */
> >
> > -#define N 65536
> >  #define TIME_S 5
> >  #define MEMPOOL_ELT_SIZE 2048
> > -#define MAX_KEEP 512
> > -#define MEMPOOL_SIZE
> > ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE))-1)
> > +#define MAX_KEEP 32768
> > +#define N (128 * MAX_KEEP)
> > +#define MEMPOOL_SIZE
> > ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE*2))-1)
> >
> >  /* Number of pointers fitting into one cache line. */
> >  #define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))
> > @@ -100,9 +103,11 @@ static unsigned n_keep;
> >  /* true if we want to test with constant n_get_bulk and n_put_bulk */
> >  static int use_constant_values;
> >
> > -/* number of enqueues / dequeues */
> > +/* number of enqueues / dequeues, and time used */
> >  struct mempool_test_stats {
> >     uint64_t enq_count;
> > +   uint64_t duration_cycles;
> > +   RTE_CACHE_GUARD;
> >  } __rte_cache_aligned;
> >
> >  static struct mempool_test_stats stats[RTE_MAX_LCORE];
> > @@ -185,6 +190,7 @@ per_lcore_mempool_test(void *arg)
> >             GOTO_ERR(ret, out);
> >
> >     stats[lcore_id].enq_count = 0;
> > +   stats[lcore_id].duration_cycles = 0;
> >
> >     /* wait synchro for workers */
> >     if (lcore_id != rte_get_main_lcore())
> > @@ -204,6 +210,15 @@ per_lcore_mempool_test(void *arg)
> >                                     CACHE_LINE_BURST, CACHE_LINE_BURST);
> >             else if (n_get_bulk == 32)
> >                     ret = test_loop(mp, cache, n_keep, 32, 32);
> > +           else if (n_get_bulk == 64)
> > +                   ret = test_loop(mp, cache, n_keep, 64, 64);
> > +           else if (n_get_bulk == 128)
> > +                   ret = test_loop(mp, cache, n_keep, 128, 128);
> > +           else if (n_get_bulk == 256)
> > +                   ret = test_loop(mp, cache, n_keep, 256, 256);
> > +           else if (n_get_bulk == RTE_MEMPOOL_CACHE_MAX_SIZE)
> > +                   ret = test_loop(mp, cache, n_keep,
> > +                                   RTE_MEMPOOL_CACHE_MAX_SIZE,
> > RTE_MEMPOOL_CACHE_MAX_SIZE);
> >             else
> >                     ret = -1;
> >
> > @@ -215,6 +230,8 @@ per_lcore_mempool_test(void *arg)
> >             stats[lcore_id].enq_count += N;
> >     }
> >
> > +   stats[lcore_id].duration_cycles = time_diff;
> > +
> >  out:
> >     if (use_external_cache) {
> >             rte_mempool_cache_flush(cache, mp);
> > @@ -232,6 +249,7 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >     uint64_t rate;
> >     int ret;
> >     unsigned cores_save = cores;
> > +   double hz = rte_get_timer_hz();
> >
> >     __atomic_store_n(&synchro, 0, __ATOMIC_RELAXED);
> >
> > @@ -278,7 +296,9 @@ launch_cores(struct rte_mempool *mp, unsigned int cores)
> >
> >     rate = 0;
> >     for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > -           rate += (stats[lcore_id].enq_count / TIME_S);
> > +           if (stats[lcore_id].duration_cycles != 0)
> > +                   rate += (double)stats[lcore_id].enq_count * hz /
> > +                                   (double)stats[lcore_id].duration_cycles;
> >
> >     printf("rate_persec=%" PRIu64 "\n", rate);
> >
> > @@ -287,11 +307,13 @@ launch_cores(struct rte_mempool *mp, unsigned int
> cores)
> >
> >  /* for a given number of core, launch all test cases */
> >  static int
> > -do_one_mempool_test(struct rte_mempool *mp, unsigned int cores)
> > +do_one_mempool_test(struct rte_mempool *mp, unsigned int cores, int
> > external_cache)
> >  {
> > -   unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> > -   unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 0 };
> > -   unsigned int keep_tab[] = { 32, 128, 512, 0 };
> > +   unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128,
> > 256,
> > +                   RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
> > +   unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128,
> > 256,
> > +                   RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
> > +   unsigned int keep_tab[] = { 32, 128, 512, 2048, 8192, 32768, 0 };
> >     unsigned *get_bulk_ptr;
> >     unsigned *put_bulk_ptr;
> >     unsigned *keep_ptr;
> > @@ -301,6 +323,10 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned
> int
> > cores)
> >             for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++)
> > {
> >                     for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {
> >
> > +                           if (*keep_ptr < *get_bulk_ptr || *keep_ptr <
> > *put_bulk_ptr)
> > +                                   continue;
> > +
> > +                           use_external_cache = external_cache;
> >                             use_constant_values = 0;
> >                             n_get_bulk = *get_bulk_ptr;
> >                             n_put_bulk = *put_bulk_ptr;
> > @@ -323,7 +349,7 @@ do_one_mempool_test(struct rte_mempool *mp, unsigned int
> > cores)
> >  }
> >
> >  static int
> > -test_mempool_perf(void)
> > +do_all_mempool_perf_tests(unsigned int cores)
> >  {
> >     struct rte_mempool *mp_cache = NULL;
> >     struct rte_mempool *mp_nocache = NULL;
> > @@ -337,8 +363,10 @@ test_mempool_perf(void)
> >                                     NULL, NULL,
> >                                     my_obj_init, NULL,
> >                                     SOCKET_ID_ANY, 0);
> > -   if (mp_nocache == NULL)
> > +   if (mp_nocache == NULL) {
> > +           printf("cannot allocate mempool (without cache)\n");
> >             goto err;
> > +   }
> >
> >     /* create a mempool (with cache) */
> >     mp_cache = rte_mempool_create("perf_test_cache", MEMPOOL_SIZE,
> > @@ -347,8 +375,10 @@ test_mempool_perf(void)
> >                                   NULL, NULL,
> >                                   my_obj_init, NULL,
> >                                   SOCKET_ID_ANY, 0);
> > -   if (mp_cache == NULL)
> > +   if (mp_cache == NULL) {
> > +           printf("cannot allocate mempool (with cache)\n");
> >             goto err;
> > +   }
> >
> >     default_pool_ops = rte_mbuf_best_mempool_ops();
> >     /* Create a mempool based on Default handler */
> > @@ -376,65 +406,83 @@ test_mempool_perf(void)
> >
> >     rte_mempool_obj_iter(default_pool, my_obj_init, NULL);
> >
> > -   /* performance test with 1, 2 and max cores */
> >     printf("start performance test (without cache)\n");
> > -
> > -   if (do_one_mempool_test(mp_nocache, 1) < 0)
> > -           goto err;
> > -
> > -   if (do_one_mempool_test(mp_nocache, 2) < 0)
> > +   if (do_one_mempool_test(mp_nocache, cores, 0) < 0)
> >             goto err;
> >
> > -   if (do_one_mempool_test(mp_nocache, rte_lcore_count()) < 0)
> > -           goto err;
> > -
> > -   /* performance test with 1, 2 and max cores */
> >     printf("start performance test for %s (without cache)\n",
> >            default_pool_ops);
> > -
> > -   if (do_one_mempool_test(default_pool, 1) < 0)
> > +   if (do_one_mempool_test(default_pool, cores, 0) < 0)
> >             goto err;
> >
> > -   if (do_one_mempool_test(default_pool, 2) < 0)
> > +   printf("start performance test (with cache)\n");
> > +   if (do_one_mempool_test(mp_cache, cores, 0) < 0)
> >             goto err;
> >
> > -   if (do_one_mempool_test(default_pool, rte_lcore_count()) < 0)
> > +   printf("start performance test (with user-owned cache)\n");
> > +   if (do_one_mempool_test(mp_nocache, cores, 1) < 0)
> >             goto err;
> >
> > -   /* performance test with 1, 2 and max cores */
> > -   printf("start performance test (with cache)\n");
> > +   rte_mempool_list_dump(stdout);
> >
> > -   if (do_one_mempool_test(mp_cache, 1) < 0)
> > -           goto err;
> > +   ret = 0;
> >
> > -   if (do_one_mempool_test(mp_cache, 2) < 0)
> > -           goto err;
> > +err:
> > +   rte_mempool_free(mp_cache);
> > +   rte_mempool_free(mp_nocache);
> > +   rte_mempool_free(default_pool);
> > +   return ret;
> > +}
> >
> > -   if (do_one_mempool_test(mp_cache, rte_lcore_count()) < 0)
> > -           goto err;
> > +static int
> > +test_mempool_perf_1core(void)
> > +{
> > +   return do_all_mempool_perf_tests(1);
> > +}
> >
> > -   /* performance test with 1, 2 and max cores */
> > -   printf("start performance test (with user-owned cache)\n");
> > -   use_external_cache = 1;
> > +static int
> > +test_mempool_perf_2cores(void)
> > +{
> > +   if (rte_lcore_count() < 2) {
> > +           printf("not enough lcores\n");
> > +           return -1;
> > +   }
> > +   return do_all_mempool_perf_tests(2);
> > +}
> >
> > -   if (do_one_mempool_test(mp_nocache, 1) < 0)
> > -           goto err;
> > +static int
> > +test_mempool_perf_allcores(void)
> > +{
> > +   return do_all_mempool_perf_tests(rte_lcore_count());
> > +}
> > +
> > +static int
> > +test_mempool_perf(void)
> > +{
> > +   int ret = -1;
> >
> > -   if (do_one_mempool_test(mp_nocache, 2) < 0)
> > +   /* performance test with 1, 2 and max cores */
> > +   if (do_all_mempool_perf_tests(1) < 0)
> >             goto err;
> > +   if (rte_lcore_count() == 1)
> > +           goto done;
> >
> > -   if (do_one_mempool_test(mp_nocache, rte_lcore_count()) < 0)
> > +   if (do_all_mempool_perf_tests(2) < 0)
> >             goto err;
> > +   if (rte_lcore_count() == 2)
> > +           goto done;
> >
> > -   rte_mempool_list_dump(stdout);
> > +   if (do_all_mempool_perf_tests(rte_lcore_count()) < 0)
> > +           goto err;
> >
> > +done:
> >     ret = 0;
> >
> >  err:
> > -   rte_mempool_free(mp_cache);
> > -   rte_mempool_free(mp_nocache);
> > -   rte_mempool_free(default_pool);
> >     return ret;
> >  }
> >
> >  REGISTER_PERF_TEST(mempool_perf_autotest, test_mempool_perf);
> > +REGISTER_PERF_TEST(mempool_perf_autotest_1core, test_mempool_perf_1core);
> > +REGISTER_PERF_TEST(mempool_perf_autotest_2cores, test_mempool_perf_2cores);
> > +REGISTER_PERF_TEST(mempool_perf_autotest_allcores,
> > test_mempool_perf_allcores);
> > --
> > 2.17.1

Reply via email to