Dmitry Kozlyuk <dkozl...@nvidia.com> writes: > Memory allocator performance is crucial to applications that deal > with large amounts of memory or allocate frequently. DPDK allocator > performance is affected by EAL options, API used and, at least, > allocation size. New autotest is intended to be run with different > EAL options. It measures performance with a range of sizes > for different APIs: rte_malloc, rte_zmalloc, and rte_memzone_reserve. > > Work distribution between allocation and deallocation depends on EAL > options. The test prints both times and total time to ease comparison. > > Memory can be filled with zeroes at different points of allocation path, > but it always takes a considerable fraction of overall timing. This is why > the test measures filling speed and prints how long clearing takes > for each size as a reference (for rte_memzone_reserve estimations > are printed). > > Signed-off-by: Dmitry Kozlyuk <dkozl...@nvidia.com> > Reviewed-by: Viacheslav Ovsiienko <viachesl...@nvidia.com> > ---
Thanks for making the changes. Acked-by: Aaron Conole <acon...@redhat.com> > app/test/meson.build | 2 + > app/test/test_malloc_perf.c | 174 ++++++++++++++++++++++++++++++++++++ > 2 files changed, 176 insertions(+) > create mode 100644 app/test/test_malloc_perf.c > > diff --git a/app/test/meson.build b/app/test/meson.build > index 344a609a4d..50cf2602a9 100644 > --- a/app/test/meson.build > +++ b/app/test/meson.build > @@ -88,6 +88,7 @@ test_sources = files( > 'test_lpm6_perf.c', > 'test_lpm_perf.c', > 'test_malloc.c', > + 'test_malloc_perf.c', > 'test_mbuf.c', > 'test_member.c', > 'test_member_perf.c', > @@ -295,6 +296,7 @@ extra_test_names = [ > > perf_test_names = [ > 'ring_perf_autotest', > + 'malloc_perf_autotest', > 'mempool_perf_autotest', > 'memcpy_perf_autotest', > 'hash_perf_autotest', > diff --git a/app/test/test_malloc_perf.c b/app/test/test_malloc_perf.c > new file mode 100644 > index 0000000000..9686fc8af5 > --- /dev/null > +++ b/app/test/test_malloc_perf.c > @@ -0,0 +1,174 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright (c) 2021 NVIDIA Corporation & Affiliates > + */ > + > +#include <inttypes.h> > +#include <string.h> > +#include <rte_cycles.h> > +#include <rte_errno.h> > +#include <rte_malloc.h> > +#include <rte_memzone.h> > + > +#include "test.h" > + > +#define TEST_LOG(level, ...) 
RTE_LOG(level, USER1, __VA_ARGS__) > + > +typedef void * (alloc_t)(const char *name, size_t size, unsigned int align); > +typedef void (free_t)(void *addr); > +typedef void * (memset_t)(void *addr, int value, size_t size); > + > +static const uint64_t KB = 1 << 10; > +static const uint64_t GB = 1 << 30; > + > +static double > +tsc_to_us(uint64_t tsc, size_t runs) > +{ > + return (double)tsc / rte_get_tsc_hz() * US_PER_S / runs; > +} > + > +static int > +test_memset_perf(double *us_per_gb) > +{ > + static const size_t RUNS = 20; > + > + void *ptr; > + size_t i; > + uint64_t tsc; > + > + TEST_LOG(INFO, "Reference: memset\n"); > + > + ptr = rte_malloc(NULL, GB, 0); > + if (ptr == NULL) { > + TEST_LOG(ERR, "rte_malloc(size=%"PRIx64") failed\n", GB); > + return -1; > + } > + > + tsc = rte_rdtsc_precise(); > + for (i = 0; i < RUNS; i++) > + memset(ptr, 0, GB); > + tsc = rte_rdtsc_precise() - tsc; > + > + *us_per_gb = tsc_to_us(tsc, RUNS); > + TEST_LOG(INFO, "Result: %f.3 GiB/s <=> %.2f us/MiB\n", > + US_PER_S / *us_per_gb, *us_per_gb / KB); > + > + rte_free(ptr); > + TEST_LOG(INFO, "\n"); > + return 0; > +} > + > +static int > +test_alloc_perf(const char *name, alloc_t *alloc_fn, free_t *free_fn, > + memset_t *memset_fn, double memset_gb_us, size_t max_runs) > +{ > + static const size_t SIZES[] = { > + 1 << 6, 1 << 7, 1 << 10, 1 << 12, 1 << 16, 1 << 20, > + 1 << 21, 1 << 22, 1 << 24, 1 << 30 }; > + > + size_t i, j; > + void **ptrs; > + > + TEST_LOG(INFO, "Performance: %s\n", name); > + > + ptrs = calloc(max_runs, sizeof(ptrs[0])); > + if (ptrs == NULL) { > + TEST_LOG(ERR, "Cannot allocate memory for pointers"); > + return -1; > + } > + > + TEST_LOG(INFO, "%12s%8s%12s%12s%12s%17s\n", "Size (B)", "Runs", > + "Alloc (us)", "Free (us)", "Total (us)", > + memset_fn != NULL ? 
"memset (us)" : "est.memset (us)"); > + for (i = 0; i < RTE_DIM(SIZES); i++) { > + size_t size = SIZES[i]; > + size_t runs_done; > + uint64_t tsc_start, tsc_alloc, tsc_memset = 0, tsc_free; > + double alloc_time, free_time, memset_time; > + > + tsc_start = rte_rdtsc_precise(); > + for (j = 0; j < max_runs; j++) { > + ptrs[j] = alloc_fn(NULL, size, 0); > + if (ptrs[j] == NULL) > + break; > + } > + tsc_alloc = rte_rdtsc_precise() - tsc_start; > + > + if (j == 0) { > + TEST_LOG(INFO, "%12zu Interrupted: out of memory.\n", > + size); > + break; > + } > + runs_done = j; > + > + if (memset_fn != NULL) { > + tsc_start = rte_rdtsc_precise(); > + for (j = 0; j < runs_done && ptrs[j] != NULL; j++) > + memset_fn(ptrs[j], 0, size); > + tsc_memset = rte_rdtsc_precise() - tsc_start; > + } > + > + tsc_start = rte_rdtsc_precise(); > + for (j = 0; j < runs_done && ptrs[j] != NULL; j++) > + free_fn(ptrs[j]); > + tsc_free = rte_rdtsc_precise() - tsc_start; > + > + alloc_time = tsc_to_us(tsc_alloc, runs_done); > + free_time = tsc_to_us(tsc_free, runs_done); > + memset_time = memset_fn != NULL ? 
> + tsc_to_us(tsc_memset, runs_done) : > + memset_gb_us * size / GB; > + TEST_LOG(INFO, "%12zu%8zu%12.2f%12.2f%12.2f%17.2f\n", > + size, runs_done, alloc_time, free_time, > + alloc_time + free_time, memset_time); > + > + memset(ptrs, 0, max_runs * sizeof(ptrs[0])); > + } > + > + free(ptrs); > + TEST_LOG(INFO, "\n"); > + return 0; > +} > + > +static void * > +memzone_alloc(const char *name __rte_unused, size_t size, unsigned int align) > +{ > + const struct rte_memzone *mz; > + char gen_name[RTE_MEMZONE_NAMESIZE]; > + > + snprintf(gen_name, sizeof(gen_name), "test-mz-%"PRIx64, rte_rdtsc()); > + mz = rte_memzone_reserve_aligned(gen_name, size, SOCKET_ID_ANY, > + RTE_MEMZONE_1GB | RTE_MEMZONE_SIZE_HINT_ONLY, align); > + return (void *)(uintptr_t)mz; > +} > + > +static void > +memzone_free(void *addr) > +{ > + rte_memzone_free((struct rte_memzone *)addr); > +} > + > +static int > +test_malloc_perf(void) > +{ > + static const size_t MAX_RUNS = 10000; > + > + double memset_us_gb; > + > + if (test_memset_perf(&memset_us_gb) < 0) > + return -1; > + > + if (test_alloc_perf("rte_malloc", rte_malloc, rte_free, memset, > + memset_us_gb, MAX_RUNS) < 0) > + return -1; > + if (test_alloc_perf("rte_zmalloc", rte_zmalloc, rte_free, memset, > + memset_us_gb, MAX_RUNS) < 0) > + return -1; > + > + if (test_alloc_perf("rte_memzone_reserve", memzone_alloc, memzone_free, > + NULL, memset_us_gb, RTE_MAX_MEMZONE - 1) < 0) > + return -1; > + > + return 0; > +} > + > +REGISTER_TEST_COMMAND(malloc_perf_autotest, test_malloc_perf);