Hi Mattias, Please see few comments below.
On Fri, Jul 08, 2022 at 02:56:07PM +0200, Mattias Rönnblom wrote: > Add performance test for the rte_raw_cksum() function, which delegates > the actual work to __rte_raw_cksum(), which in turn is used by other > functions in need of Internet checksum calculation. > > Signed-off-by: Mattias Rönnblom <mattias.ronnb...@ericsson.com> > > --- > > v2: > * Added __rte_unused to unused volatile variable, to keep the Intel > compiler happy. > --- > MAINTAINERS | 1 + > app/test/meson.build | 1 + > app/test/test_cksum_perf.c | 118 +++++++++++++++++++++++++++++++++++++ > 3 files changed, 120 insertions(+) > create mode 100644 app/test/test_cksum_perf.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index c923712946..2a4c99e05a 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -1414,6 +1414,7 @@ Network headers > M: Olivier Matz <olivier.m...@6wind.com> > F: lib/net/ > F: app/test/test_cksum.c > +F: app/test/test_cksum_perf.c > > Packet CRC > M: Jasvinder Singh <jasvinder.si...@intel.com> > diff --git a/app/test/meson.build b/app/test/meson.build > index 431c5bd318..191db03d1d 100644 > --- a/app/test/meson.build > +++ b/app/test/meson.build > @@ -18,6 +18,7 @@ test_sources = files( > 'test_bpf.c', > 'test_byteorder.c', > 'test_cksum.c', > + 'test_cksum_perf.c', > 'test_cmdline.c', > 'test_cmdline_cirbuf.c', > 'test_cmdline_etheraddr.c', > diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c > new file mode 100644 > index 0000000000..bff73cb3bb > --- /dev/null > +++ b/app/test/test_cksum_perf.c > @@ -0,0 +1,118 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2022 Ericsson AB > + */ > + > +#include <stdio.h> > + > +#include <rte_common.h> > +#include <rte_cycles.h> > +#include <rte_ip.h> > +#include <rte_malloc.h> > +#include <rte_random.h> > + > +#include "test.h" > + > +#define NUM_BLOCKS (10) > +#define ITERATIONS (1000000) Parenthesis can be safely removed > + > +static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 }; > + > +static __rte_noinline uint16_t > +do_rte_raw_cksum(const void *buf, size_t len) > +{ > + return rte_raw_cksum(buf, len); > +} I don't understand the need to have this wrapper, especially marked __rte_noinline. What is the objective? Note that when I remove the __rte_noinline, the performance is better for size 20 and 21. > + > +static void > +init_block(void *buf, size_t len) Can buf be a (char *) instead? It would avoid a cast below. > +{ > + size_t i; > + > + for (i = 0; i < len; i++) > + ((char *)buf)[i] = (uint8_t)rte_rand(); > +} > + > +static int > +test_cksum_perf_size_alignment(size_t block_size, bool aligned) > +{ > + char *data[NUM_BLOCKS]; > + char *blocks[NUM_BLOCKS]; > + unsigned int i; > + uint64_t start; > + uint64_t end; > + /* Floating point to handle low (pseudo-)TSC frequencies */ > + double block_latency; > + double byte_latency; > + volatile __rte_unused uint64_t sum = 0; > + > + for (i = 0; i < NUM_BLOCKS; i++) { > + data[i] = rte_malloc(NULL, block_size + 1, 0); > + > + if (data[i] == NULL) { > + printf("Failed to allocate memory for block\n"); > + return TEST_FAILED; > + } > + > + init_block(data[i], block_size + 1); > + > + blocks[i] = aligned ? data[i] : data[i] + 1; > + } > + > + start = rte_rdtsc(); > + > + for (i = 0; i < ITERATIONS; i++) { > + unsigned int j; > + for (j = 0; j < NUM_BLOCKS; j++) > + sum += do_rte_raw_cksum(blocks[j], block_size); > + } > + > + end = rte_rdtsc(); > + > + block_latency = (end - start) / (double)(ITERATIONS * NUM_BLOCKS); > + byte_latency = block_latency / block_size; > + > + printf("%-9s %10zd %19.1f %16.2f\n", aligned ? "Aligned" : "Unaligned", > + block_size, block_latency, byte_latency); When I run the test on my dev machine, I get the following results, which are quite reproductible: Aligned 20 10.4 0.52 (range is 0.48 - 0.52) Unaligned 20 7.9 0.39 (range is 0.39 - 0.40) ... If I increase the number of iterations, the first results change significantly: Aligned 20 8.2 0.42 (range is 0.41 - 0.42) Unaligned 20 8.0 0.40 (always this value) To have more precise tests with small size, would it make sense to target a test time instead of an iteration count? Something like this: #define ITERATIONS 1000000 uint64_t iterations = 0; ... do { for (i = 0; i < ITERATIONS; i++) { unsigned int j; for (j = 0; j < NUM_BLOCKS; j++) sum += do_rte_raw_cksum(blocks[j], block_size); } iterations += ITERATIONS; end = rte_rdtsc(); } while ((end - start) < rte_get_tsc_hz()); block_latency = (end - start) / (double)(iterations * NUM_BLOCKS); After this change, the aligned and unaligned cases have the same performance on my machine. > + > + for (i = 0; i < NUM_BLOCKS; i++) > + rte_free(data[i]); > + > + return TEST_SUCCESS; > +} > + > +static int > +test_cksum_perf_size(size_t block_size) > +{ > + int rc; > + > + rc = test_cksum_perf_size_alignment(block_size, true); > + if (rc != TEST_SUCCESS) > + return rc; > + > + rc = test_cksum_perf_size_alignment(block_size, false); > + > + return rc; > +} > + > +static int > +test_cksum_perf(void) > +{ > + uint16_t i; > + > + printf("### rte_raw_cksum() performance ###\n"); > + printf("Alignment Block size TSC cycles/block TSC cycles/byte\n"); > + > + for (i = 0; i < RTE_DIM(data_sizes); i++) { > + int rc; > + > + rc = test_cksum_perf_size(data_sizes[i]); > + if (rc != TEST_SUCCESS) > + return rc; > + } > + > + return TEST_SUCCESS; > +} > + > + > +REGISTER_TEST_COMMAND(cksum_perf_autotest, test_cksum_perf); > + The last empty line can be removed. > -- > 2.25.1 >