On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance tests are supported for now.
>
> Signed-off-by: Cheng Jiang <cheng1.ji...@intel.com>
> Signed-off-by: Jiayu Hu <jiayu...@intel.com>
> Signed-off-by: Yuan Wang <yuanx.w...@intel.com>
> Acked-by: Morten Brørup <m...@smartsharesystems.com>
> ---
> v2: fixed some CI issues.
Some first review comments inline below. More will likely follow as I
review it further and try testing it out.

/Bruce

>
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 539 ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/benchmark.h |  12 +
>  app/test-dma-perf/config.ini  |  61 ++++
>  app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  53 ++++
>  app/test-dma-perf/meson.build |  22 ++
>  7 files changed, 1122 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c
>  create mode 100644 app/test-dma-perf/benchmark.h
>  create mode 100644 app/test-dma-perf/config.ini
>  create mode 100644 app/test-dma-perf/main.c
>  create mode 100644 app/test-dma-perf/main.h
>  create mode 100644 app/test-dma-perf/meson.build
>
> diff --git a/app/meson.build b/app/meson.build
> index e32ea4bd5c..a060ad2725 100644
> --- a/app/meson.build
> +++ b/app/meson.build
> @@ -28,6 +28,7 @@ apps = [
>         'test-regex',
>         'test-sad',
>         'test-security-perf',
> +        'test-dma-perf',
>  ]

Lists in DPDK are always alphabetical when no other order is required, so
this new app should be further up the list, after "test-crypto-perf".
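That is, assuming the neighbouring entries are unchanged, that part of the
list would read:

        'test-compress-perf',
        'test-crypto-perf',
        'test-dma-perf',
        'test-eventdev',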
>
>  default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> new file mode 100644
> index 0000000000..1cb5b0b291
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.c
> @@ -0,0 +1,539 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <inttypes.h>
> +
> +#include <rte_time.h>
> +#include <rte_mbuf.h>
> +#include <rte_dmadev.h>
> +#include <rte_malloc.h>
> +#include <rte_lcore.h>
> +
> +#include "main.h"
> +#include "benchmark.h"
> +
> +
> +#define MAX_DMA_CPL_NB 255
> +
> +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> +
> +struct lcore_params {
> +    uint16_t dev_id;
> +    uint32_t nr_buf;
> +    uint16_t kick_batch;
> +    uint32_t buf_size;
> +    uint32_t repeat_times;
> +    uint16_t mpool_iter_step;
> +    struct rte_mbuf **srcs;
> +    struct rte_mbuf **dsts;
> +    uint8_t scenario_id;
> +};
> +
> +struct buf_info {
> +    struct rte_mbuf **array;
> +    uint32_t nr_buf;
> +    uint32_t buf_size;
> +};
> +
> +static struct rte_mempool *src_pool;
> +static struct rte_mempool *dst_pool;
> +
> +uint16_t dmadev_ids[MAX_WORKER_NB];
> +uint32_t nb_dmadevs;
> +
> +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> +
> +static inline int
> +__rte_format_printf(3, 4)
> +print_err(const char *func, int lineno, const char *format, ...)
> +{
> +    va_list ap;
> +    int ret;
> +
> +    ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> +    va_start(ap, format);
> +    ret += vfprintf(stderr, format, ap);
> +    va_end(ap);
> +
> +    return ret;
> +}
> +
> +static inline void
> +calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
> +        uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
> +        float *bandwidth, uint64_t *ops)
> +{
> +    *memory = (p->buf_size * p->nr_buf * 2) / (1024 * 1024);
> +    *ave_cycle = cp_cycle_sum / (p->repeat_times * p->nr_buf);
> +    *bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
> +    *ops = (double)p->nr_buf * repeat_times / time_sec;
> +}
> +
> +static void
> +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
> +        uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> +        float bandwidth, uint64_t ops, bool is_dma)
> +{
> +    if (is_dma)
> +        printf("lcore %u, DMA %u:\n"
> +                "average cycles: %" PRIu64 ","
> +                " buffer size: %u, nr_buf: %u,"
> +                " memory: %uMB, frequency: %" PRIu64 ".\n",
> +                lcore_id,
> +                dev_id,
> +                ave_cycle,
> +                buf_size,
> +                nr_buf,
> +                memory,
> +                rte_get_timer_hz());

Longer lines are allowed for strings, so you can merge each line of output
to a single line, which will improve readability. Also, to shorten the
code, there is no reason each parameter needs to go on its own line.

> +    else
> +        printf("lcore %u\n"
> +                "average cycles: %" PRIu64 ","
> +                " buffer size: %u, nr_buf: %u,"
> +                " memory: %uMB, frequency: %" PRIu64 ".\n",
> +                lcore_id,
> +                ave_cycle,
> +                buf_size,
> +                nr_buf,
> +                memory,
> +                rte_get_timer_hz());

Suggestion: rather than duplicating the whole output, only the first line
needs to change based on SW vs HW copies. How about:

    if (is_dma)
        printf("lcore %u, DMA %u\n", lcore_id, dev_id);
    else
        printf("lcore %u\n", lcore_id);
    printf("average cycles: ...", ...);

> +
> +    printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
> +
> +    if (is_dma)
> +        snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> +            CSV_LINE_DMA_FMT,
> +            scenario_id, lcore_id, dev_id, buf_size,
> +            nr_buf, memory, ave_cycle, bandwidth, ops);
> +    else
> +        snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> +            CSV_LINE_CPU_FMT,
> +            scenario_id, lcore_id, buf_size,
> +            nr_buf, memory, ave_cycle, bandwidth, ops);
> +}
> +
> +static inline void
> +cache_flush_buf(void *arg)

For non-x86 builds, you probably need to mark "arg" as unused to avoid
compiler warnings.

Why is the parameter type given as a void pointer, when the type is
unconditionally cast below as "struct buf_info"? A void pointer type
should only be needed if you need to call this via a generic function
pointer.

> +{
> +#ifdef RTE_ARCH_X86_64
> +    char *data;
> +    char *addr;
> +    struct buf_info *info = arg;
> +    struct rte_mbuf **srcs = info->array;
> +    uint32_t i, k;
> +
> +    for (i = 0; i < info->nr_buf; i++) {
> +        data = rte_pktmbuf_mtod(srcs[i], char *);
> +        for (k = 0; k < info->buf_size / 64; k++) {
> +            addr = (k * 64 + data);
> +            __builtin_ia32_clflush(addr);
> +        }

The inner loop may be shorter by incrementing the loop variable by 64,
rather than dividing and then multiplying, since you can eliminate the
variable "addr". It can also be more readable with a variable rename:

    for (offset = 0; offset < info->buf_size; offset += 64)
        __builtin_ia32_clflush(data + offset);

> +    }
> +#endif
> +}
> +
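Putting those points together — taking the three values directly rather
than via struct buf_info (see also the related comment further below),
marking the parameters unused for non-x86 builds, and stepping a byte
offset through each buffer — an untested sketch of the whole function
could look like:

    static void
    cache_flush_buf(struct rte_mbuf **array __rte_unused,
            uint32_t buf_size __rte_unused,
            uint32_t nr_buf __rte_unused)
    {
    #ifdef RTE_ARCH_X86_64
        uint32_t i, offset;

        for (i = 0; i < nr_buf; i++) {
            /* flush each 64-byte cacheline of the buffer */
            char *data = rte_pktmbuf_mtod(array[i], char *);

            for (offset = 0; offset < buf_size; offset += 64)
                __builtin_ia32_clflush(data + offset);
        }
    #endif
    }

The call sites would then pass the mbuf array, buf_size and nr_buf
directly, and struct buf_info could be dropped entirely.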
> + */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
> +{
> +    uint16_t vchan = 0;
> +    struct rte_dma_info info;
> +    struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> +    struct rte_dma_vchan_conf qconf = {
> +        .direction = RTE_DMA_DIR_MEM_TO_MEM,
> +        .nb_desc = ring_size
> +    };
> +
> +    if (rte_dma_configure(dev_id, &dev_config) != 0)
> +        rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
> +
> +    if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0) {
> +        printf("Error with queue configuration\n");
> +        rte_panic();
> +    }
> +

Inconsistency here - and below too. Either use rte_exit on failure or use
rte_panic, but don't mix them. Panic seems a little severe, so I suggest
just using rte_exit() in all cases.

> +    rte_dma_info_get(dev_id, &info);
> +    if (info.nb_vchans != 1) {
> +        printf("Error, no configured queues reported on device id %u\n", dev_id);
> +        rte_panic();
> +    }
> +    if (rte_dma_start(dev_id) != 0)
> +        rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
> +}
> +
> +static int
> +config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
> +{
> +    int16_t dev_id = rte_dma_next_dev(0);
> +    uint32_t i;
> +
> +    nb_dmadevs = 0;
> +
> +    for (i = 0; i < nb_workers; i++) {
> +        if (dev_id == -1)
> +            goto end;
> +
> +        dmadev_ids[i] = dev_id;
> +        configure_dmadev_queue(dmadev_ids[i], ring_size);
> +        dev_id = rte_dma_next_dev(dev_id + 1);
> +        ++nb_dmadevs;

Very minor nit, but I'd suggest swapping these last two lines, incrementing
nb_dmadevs right after configuring the device, but before finding a new
one. It just makes more sense to me.

> +    }
> +
> +end:
> +    if (nb_dmadevs < nb_workers) {
> +        printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
> +        return -1;
> +    }
> +
> +    RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
> +
> +    return 0;
> +}
> +
> +static inline void
> +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
> +        uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
> +{
> +    int64_t async_cnt = 0;
> +    int nr_cpl = 0;
> +    uint32_t index;
> +    uint16_t offset;
> +    uint32_t i;
> +
> +    for (offset = 0; offset < mpool_iter_step; offset++) {
> +        for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {

Assignment in the condition part of a loop seems wrong. I suggest
reworking this to avoid it.

> +            if (unlikely(rte_dma_copy(dev_id,
> +                    0,
> +                    srcs[index]->buf_iova + srcs[index]->data_off,
> +                    dsts[index]->buf_iova + dsts[index]->data_off,

rte_pktmbuf_iova() macro can be used here.

> +                    buf_size,
> +                    0) < 0)) {
> +                rte_dma_submit(dev_id, 0);
> +                while (rte_dma_burst_capacity(dev_id, 0) == 0) {
> +                    nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +                    async_cnt -= nr_cpl;
> +                }
> +                if (rte_dma_copy(dev_id,
> +                        0,
> +                        srcs[index]->buf_iova + srcs[index]->data_off,
> +                        dsts[index]->buf_iova + dsts[index]->data_off,
> +                        buf_size,
> +                        0) < 0) {
> +                    printf("enqueue fail again at %u\n", index);
> +                    printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
> +                    rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
> +                }
> +            }
> +            async_cnt++;
> +
> +            /**
> +             * When '&' is used to wrap an index, mask must be a power of 2.
> +             * That is, kick_batch must be 2^n.

I assume that is checked on input processing when parsing the config file?

> +             */
> +            if (unlikely((async_cnt % kick_batch) == 0)) {

This is an expected condition that will occur with repeatable frequency.
Therefore, unlikely is not really appropriate.
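Putting the loop-condition and rte_pktmbuf_iova() points together, the
enqueue loop could be reshaped roughly as below. This is an untested
sketch reusing the existing locals ("i" is no longer needed), with the
ring-full retry path elided and unchanged from the original:

    for (offset = 0; offset < mpool_iter_step; offset++) {
        for (index = offset; index < nr_buf; index += mpool_iter_step) {
            if (rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[index]),
                    rte_pktmbuf_iova(dsts[index]), buf_size, 0) < 0) {
                /* ring full: submit, poll for completions, retry as before */
            }
            async_cnt++;

            if ((async_cnt % kick_batch) == 0) {
                rte_dma_submit(dev_id, 0);
                nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
                async_cnt -= nr_cpl;
            }
        }
    }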
> +                rte_dma_submit(dev_id, 0);
> +                /* add a poll to avoid ring full */
> +                nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +                async_cnt -= nr_cpl;
> +            }
> +        }
> +
> +        rte_dma_submit(dev_id, 0);
> +        while (async_cnt > 0) {
> +            nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +            async_cnt -= nr_cpl;
> +        }

Do we need a timeout here or in the loop above in case of errors that
cause us to not get all the elements back?

> +    }
> +}
> +
> +static int
> +dma_mem_copy(void *p)
> +{

I see the call to this function within "remote_launch" uses a cast on the
function. I don't think that typecast should be necessary, but if you keep
it, you can avoid using the void pointer here and just take a
"struct lcore_params *" parameter directly.

> +    uint64_t ops;
> +    uint32_t memory;
> +    float bandwidth;
> +    double time_sec;
> +    uint32_t lcore_id = rte_lcore_id();
> +    struct lcore_params *params = (struct lcore_params *)p;
> +    uint32_t repeat_times = params->repeat_times;
> +    uint32_t buf_size = params->buf_size;
> +    uint16_t kick_batch = params->kick_batch;
> +    uint32_t lcore_nr_buf = params->nr_buf;
> +    uint16_t dev_id = params->dev_id;
> +    uint16_t mpool_iter_step = params->mpool_iter_step;
> +    struct rte_mbuf **srcs = params->srcs;
> +    struct rte_mbuf **dsts = params->dsts;
> +    uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> +    uint32_t r;
> +
> +    begin = rte_rdtsc();
> +
> +    for (r = 0; r < repeat_times; r++)
> +        do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
> +            mpool_iter_step, srcs, dsts);
> +
> +    end = rte_rdtsc();
> +    total_cycles = end - begin;

You can do without "end" easily enough:

    total_cycles = rte_rdtsc() - begin;

> +    time_sec = (double)total_cycles / rte_get_timer_hz();
> +
> +    calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> +            &avg_cycles, &bandwidth, &ops);
> +    output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
> +            memory, bandwidth, ops, true);
> +
> +    rte_free(p);
> +
> +    return 0;
> +}
> +
> +static int
> +cpu_mem_copy(void *p)
> +{

Most of the comments from above also apply here.
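Also, going back to the timeout question above: a minimal sketch of a
bounded version of the final drain loop, assuming a new uint64_t
"deadline" local and with the one-second budget purely illustrative:

    rte_dma_submit(dev_id, 0);
    deadline = rte_rdtsc() + rte_get_timer_hz();    /* give up after ~1 second */
    while (async_cnt > 0) {
        nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
        async_cnt -= nr_cpl;
        if (rte_rdtsc() > deadline)
            rte_exit(EXIT_FAILURE, "Timed out with %" PRId64 " copies outstanding\n",
                    async_cnt);
    }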
> +    uint32_t idx;
> +    uint32_t lcore_id;
> +    uint32_t memory;
> +    uint64_t ops;
> +    float bandwidth;
> +    double time_sec;
> +    struct lcore_params *params = (struct lcore_params *)p;
> +    uint32_t repeat_times = params->repeat_times;
> +    uint32_t buf_size = params->buf_size;
> +    uint32_t lcore_nr_buf = params->nr_buf;
> +    uint16_t mpool_iter_step = params->mpool_iter_step;
> +    struct rte_mbuf **srcs = params->srcs;
> +    struct rte_mbuf **dsts = params->dsts;
> +    uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> +    uint32_t k, j, offset;
> +
> +    begin = rte_rdtsc();
> +
> +    for (k = 0; k < repeat_times; k++) {
> +        /* copy buffer form src to dst */
> +        for (offset = 0; offset < mpool_iter_step; offset++) {
> +            for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
> +                rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
> +                    (void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
> +                    (size_t)buf_size);
> +            }
> +        }
> +    }
> +
> +    end = rte_rdtsc();
> +    total_cycles = end - begin;
> +    time_sec = (double)total_cycles / rte_get_timer_hz();
> +
> +    lcore_id = rte_lcore_id();
> +
> +    calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> +            &avg_cycles, &bandwidth, &ops);
> +    output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
> +            memory, bandwidth, ops, false);
> +
> +    rte_free(p);
> +
> +    return 0;
> +}
> +
> +static int
> +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> +            struct rte_mbuf ***dsts)
> +{
> +    uint32_t i;
> +    unsigned int buf_size = cfg->buf_size.cur;
> +    unsigned int nr_sockets;
> +    uint32_t nr_buf = cfg->nr_buf;
> +
> +    nr_sockets = rte_socket_count();
> +    if (cfg->src_numa_node >= nr_sockets ||
> +        cfg->dst_numa_node >= nr_sockets) {
> +        printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
> +        return -1;
> +    }
> +
> +    src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +            nr_buf, /* n == num elements */
> +            64,  /* cache size */
> +            0,   /* priv size */
> +            buf_size + RTE_PKTMBUF_HEADROOM,
> +            cfg->src_numa_node);
> +    if (src_pool == NULL) {
> +        PRINT_ERR("Error with source mempool creation.\n");
> +        return -1;
> +    }
> +
> +    dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +            nr_buf, /* n == num elements */
> +            64,  /* cache size */
> +            0,   /* priv size */
> +            buf_size + RTE_PKTMBUF_HEADROOM,
> +            cfg->dst_numa_node);
> +    if (dst_pool == NULL) {
> +        PRINT_ERR("Error with destination mempool creation.\n");
> +        return -1;
> +    }
> +
> +    *srcs = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));

Typecasts from void * to other pointer types aren't actually necessary
in C.

I note some inconsistency in this file with regard to malloc. Here you use
regular malloc, while when building the parameters to pass to the memcpy
functions you use rte_malloc. I suggest standardizing on one or the other
rather than mixing.

> +    if (*srcs == NULL) {
> +        printf("Error: srcs malloc failed.\n");
> +        return -1;
> +    }
> +
> +    *dsts = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));
> +    if (*dsts == NULL) {
> +        printf("Error: dsts malloc failed.\n");
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nr_buf; i++) {
> +        (*srcs)[i] = rte_pktmbuf_alloc(src_pool);
> +        (*dsts)[i] = rte_pktmbuf_alloc(dst_pool);

Rather than individually allocating, you may well manage with
rte_mempool_get_bulk() to allocate all mbufs in one call.
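That is, something along these lines (untested) in place of the
per-buffer alloc loop — noting that mbufs taken straight from the mempool
are not reset the way rte_pktmbuf_alloc() resets them, which is likely
fine here given how little of the mbuf metadata the test actually uses:

    /* grab all src and dst mbufs in one call each */
    if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0 ||
            rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
        printf("Error: mbuf bulk allocation failed.\n");
        return -1;
    }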
> +        if ((!(*srcs)[i]) || (!(*dsts)[i])) {
> +            printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
> +            return -1;
> +        }
> +
> +        (*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
> +        (*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;

rte_pktmbuf_append() can be used here, rather than setting the lengths
manually. However, these values are not actually used anywhere else in
the code, I believe, so setting them is unnecessary. You are manually
tracking the copy lengths throughout the test, and nothing else is
working on the mbufs, so the length the mbuf reports is immaterial.

> +    }
> +
> +    return 0;
> +}
> +
> +void
> +dma_mem_copy_benchmark(struct test_configure *cfg)
> +{
> +    uint32_t i;
> +    uint32_t offset;
> +    unsigned int lcore_id = 0;
> +    struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +    unsigned int buf_size = cfg->buf_size.cur;
> +    uint16_t kick_batch = cfg->kick_batch.cur;
> +    uint16_t mpool_iter_step = cfg->mpool_iter_step;
> +    uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +    uint16_t nb_workers = cfg->nb_workers;
> +    uint32_t repeat_times = cfg->repeat_times;
> +
> +    if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +        goto out;
> +
> +    if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
> +        goto out;
> +
> +    if (cfg->cache_flush) {
> +        struct buf_info info;
> +
> +        info.array = srcs;
> +        info.buf_size = buf_size;
> +        info.nr_buf = nr_buf;
> +        cache_flush_buf(&info);
> +

From what I can see, struct buf_info is only used for passing parameters
to the cache_flush_buf function. The code would be a lot simpler to remove
the structure and just pass 3 parameters to the function directly.

> +        info.array = dsts;
> +        cache_flush_buf(&info);
> +        rte_mb();
> +    }
> +
> +    printf("Start testing....\n");
> +
> +    for (i = 0; i < nb_workers; i++) {
> +        lcore_id = rte_get_next_lcore(lcore_id, true, true);
> +        offset = nr_buf / nb_workers * i;
> +
> +        struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> +        if (!p) {
> +            printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> +            break;
> +        }
> +        *p = (struct lcore_params) {
> +            dmadev_ids[i],
> +            (uint32_t)(nr_buf/nb_workers),
> +            kick_batch,
> +            buf_size,
> +            repeat_times,
> +            mpool_iter_step,
> +            srcs + offset,
> +            dsts + offset,
> +            cfg->scenario_id
> +        };
> +
> +        rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
> +    }
> +
> +    rte_eal_mp_wait_lcore();
> +
> +out:
> +    /* free env */
> +    if (srcs) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(srcs[i]);
> +        free(srcs);
> +    }
> +    if (dsts) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(dsts[i]);
> +        free(dsts);
> +    }
> +
> +    if (src_pool)
> +        rte_mempool_free(src_pool);
> +    if (dst_pool)
> +        rte_mempool_free(dst_pool);
> +
> +    for (i = 0; i < nb_dmadevs; i++) {
> +        printf("Stopping dmadev %d\n", dmadev_ids[i]);
> +        rte_dma_stop(dmadev_ids[i]);
> +    }
> +}
> +
> +void
> +cpu_mem_copy_benchmark(struct test_configure *cfg)
> +{
> +    uint32_t i, offset;
> +    uint32_t repeat_times = cfg->repeat_times;
> +    uint32_t kick_batch = cfg->kick_batch.cur;
> +    uint32_t buf_size = cfg->buf_size.cur;
> +    uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +    uint16_t nb_workers = cfg->nb_workers;
> +    uint16_t mpool_iter_step = cfg->mpool_iter_step;
> +    struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +    unsigned int lcore_id = 0;
> +
> +    if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +        goto out;
> +
> +    for (i = 0; i < nb_workers; i++) {
> +        lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
> +        offset = nr_buf / nb_workers * i;
> +        struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> +        if (!p) {
> +            printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> +            break;
> +        }
> +        *p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
> +                        buf_size, repeat_times, mpool_iter_step,
> +                        srcs + offset, dsts + offset, cfg->scenario_id };

Formatting should be the same as in the function above.

> +        rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
> +    }
> +
> +    rte_eal_mp_wait_lcore();
> +
> +out:
> +    /* free env */
> +    if (srcs) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(srcs[i]);
> +        free(srcs);
> +    }
> +    if (dsts) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(dsts[i]);
> +        free(dsts);
> +    }
> +
> +    if (src_pool)
> +        rte_mempool_free(src_pool);
> +    if (dst_pool)
> +        rte_mempool_free(dst_pool);
> +}

There seems to be quite a bit of common code between
dma_mem_copy_benchmark and cpu_mem_copy_benchmark. It might be worth
investigating if they can be merged while still keeping readability.
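As a purely hypothetical outline of what such a merge might look like — a
single function taking an "is_dma" flag, with the setup, launch loop and
cleanup shared (the cache-flush step and the differing lcore-selection
logic of the two originals would still need reconciling):

    static void
    mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
    {
        uint32_t i, offset;
        unsigned int lcore_id = 0;
        struct rte_mbuf **srcs = NULL, **dsts = NULL;
        uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
        uint16_t nb_workers = cfg->nb_workers;

        if (setup_memory_env(cfg, &srcs, &dsts) < 0)
            goto out;
        if (is_dma && config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
            goto out;

        for (i = 0; i < nb_workers; i++) {
            lcore_id = rte_get_next_lcore(lcore_id, true, true);
            offset = nr_buf / nb_workers * i;

            struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
            if (p == NULL) {
                printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
                break;
            }
            *p = (struct lcore_params) {
                is_dma ? dmadev_ids[i] : 0,
                nr_buf / nb_workers,
                cfg->kick_batch.cur,
                cfg->buf_size.cur,
                cfg->repeat_times,
                cfg->mpool_iter_step,
                srcs + offset,
                dsts + offset,
                cfg->scenario_id
            };
            /* both workers match lcore_function_t, so no cast is needed */
            rte_eal_remote_launch(is_dma ? dma_mem_copy : cpu_mem_copy, p, lcore_id);
        }

        rte_eal_mp_wait_lcore();

    out:
        /* shared cleanup, as in the two existing functions */
        if (srcs != NULL) {
            for (i = 0; i < nr_buf; i++)
                rte_pktmbuf_free(srcs[i]);
            free(srcs);
        }
        if (dsts != NULL) {
            for (i = 0; i < nr_buf; i++)
                rte_pktmbuf_free(dsts[i]);
            free(dsts);
        }
        rte_mempool_free(src_pool);
        rte_mempool_free(dst_pool);

        if (is_dma) {
            for (i = 0; i < nb_dmadevs; i++) {
                printf("Stopping dmadev %d\n", dmadev_ids[i]);
                rte_dma_stop(dmadev_ids[i]);
            }
        }
    }

The two current entry points would then reduce to
mem_copy_benchmark(cfg, true) and mem_copy_benchmark(cfg, false).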
> diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
> new file mode 100644
> index 0000000000..f5ad8d6d99
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#ifndef _BENCHMARK_H_
> +#define _BENCHMARK_H_
> +
> +void dma_mem_copy_benchmark(struct test_configure *cfg);
> +
> +void cpu_mem_copy_benchmark(struct test_configure *cfg);
> +
> +#endif /* _BENCHMARK_H_ */

You don't really need two separate headers in this application. Both
main.h and benchmark.h can be merged into one header, since both are
always included in both c files.

> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> new file mode 100644
> index 0000000000..e24bb19414
> --- /dev/null
> +++ b/app/test-dma-perf/config.ini
> @@ -0,0 +1,61 @@
> +
> +; Supported test types:
> +; DMA_MEM_COPY|CPU_MEM_COPY
> +
> +; Parameters:
> +; "mem_size","buf_size","dma_ring_size","kick_batch".
> +; "mem_size" means the size of the memory footprint.
> +; "buf_size" means the memory size of a single operation.
> +; "dma_ring_size" means the dma ring buffer size.
> +; "kick_batch" means dma operation batch size.
> +
> +; Format: variable=first[,last,increment[,ADD|MUL]]
> +; ADD is the default mode.
> +
> +; src_numa_node is used to control the numa node where the source memory is allocated.
> +; dst_numa_node is used to control the numa node where the destination memory is allocated.
> +
> +; cache_flush is used to control if the cache should be flushed.
> +
> +; repeat_times is used to control the repeat times of the whole case.
> +
> +; worker_threads is used to control the threads number of the test app.
> +; It should be less than the core number.
> +
> +; mpool_iter_step is used to control the buffer continuity.
> +
> +; Bind DMA to lcore:
> +; Specify the "lcore_dma" parameter.
> +; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
> +; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
> +; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
> +; their range.
> +
> +[case1]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +repeat_times=10
> +worker_threads=1
> +mpool_iter_step=1
> +lcore_dma=lcore3@0000:00:04.0
> +eal_args=--legacy-mem --file-prefix=test
> +
> +[case2]
> +type=CPU_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=1
> +cache_flush=0
> +repeat_times=100
> +worker_threads=1
> +mpool_iter_step=1
> +eal_args=--no-pci
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> new file mode 100644
> index 0000000000..94ba369539
> --- /dev/null
> +++ b/app/test-dma-perf/main.c
> @@ -0,0 +1,434 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#if !defined(RTE_EXEC_ENV_LINUX)
> +
> +int
> +main(int argc, char *argv[])
> +{
> +    printf("OS not supported, skipping test\n");
> +    return 0;
> +}
> +

What is Linux-specific about this app? If we do need to limit the app to
Linux only, I suggest using meson to do so rather than putting #ifdefs in
the code.

> +#else
> +
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <signal.h>

<snip>
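On that meson point: assuming this app follows the usual DPDK pattern
where a per-app meson.build can set "build" and "reason" to skip a
component, the restriction could live in app/test-dma-perf/meson.build
instead, e.g.:

    if not is_linux
        build = false
        reason = 'only supported on Linux'
    endif

and the #if/#else block around main() in main.c could then be dropped.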