On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:
> There are many high-performance DMA devices supported in DPDK now, and
> these DMA devices can also be integrated into other modules of DPDK as
> accelerators, such as Vhost. Before integrating DMA into applications,
> developers need to know the performance of these DMA devices in various
> scenarios and the performance of CPUs in the same scenario, such as
> different buffer lengths. Only in this way can we know the target
> performance of the application accelerated by using them. This patch
> introduces a high-performance testing tool, which supports comparing the
> performance of CPU and DMA in different scenarios automatically with a
> pre-set config file. Memory Copy performance tests are supported for now.
>
> Signed-off-by: Cheng Jiang <cheng1.ji...@intel.com>
> Signed-off-by: Jiayu Hu <jiayu...@intel.com>
> Signed-off-by: Yuan Wang <yuanx.w...@intel.com>
> Acked-by: Morten Brørup <m...@smartsharesystems.com>
> ---
> v2: fixed some CI issues.
Some first review comments inline below. More will likely follow as I
review it further and try testing it out.

/Bruce

>
>  app/meson.build               |   1 +
>  app/test-dma-perf/benchmark.c | 539 ++++++++++++++++++++++++++++++++++
>  app/test-dma-perf/benchmark.h |  12 +
>  app/test-dma-perf/config.ini  |  61 ++++
>  app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
>  app/test-dma-perf/main.h      |  53 ++++
>  app/test-dma-perf/meson.build |  22 ++
>  7 files changed, 1122 insertions(+)
>  create mode 100644 app/test-dma-perf/benchmark.c
>  create mode 100644 app/test-dma-perf/benchmark.h
>  create mode 100644 app/test-dma-perf/config.ini
>  create mode 100644 app/test-dma-perf/main.c
>  create mode 100644 app/test-dma-perf/main.h
>  create mode 100644 app/test-dma-perf/meson.build
>
> diff --git a/app/meson.build b/app/meson.build
> index e32ea4bd5c..a060ad2725 100644
> --- a/app/meson.build
> +++ b/app/meson.build
> @@ -28,6 +28,7 @@ apps = [
>         'test-regex',
>         'test-sad',
>         'test-security-perf',
> +        'test-dma-perf',
>  ]

Lists in DPDK are always alphabetical when no other order is required, so
this new app should be further up the list, after "test-crypto-perf".
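That is, assuming the neighbouring entries are unchanged, that part of the
list would read:

        'test-compress-perf',
        'test-crypto-perf',
        'test-dma-perf',
        'test-eventdev',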
>
>  default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
> diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> new file mode 100644
> index 0000000000..1cb5b0b291
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.c
> @@ -0,0 +1,539 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <inttypes.h>
> +
> +#include <rte_time.h>
> +#include <rte_mbuf.h>
> +#include <rte_dmadev.h>
> +#include <rte_malloc.h>
> +#include <rte_lcore.h>
> +
> +#include "main.h"
> +#include "benchmark.h"
> +
> +
> +#define MAX_DMA_CPL_NB 255
> +
> +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> +
> +struct lcore_params {
> +    uint16_t dev_id;
> +    uint32_t nr_buf;
> +    uint16_t kick_batch;
> +    uint32_t buf_size;
> +    uint32_t repeat_times;
> +    uint16_t mpool_iter_step;
> +    struct rte_mbuf **srcs;
> +    struct rte_mbuf **dsts;
> +    uint8_t scenario_id;
> +};
> +
> +struct buf_info {
> +    struct rte_mbuf **array;
> +    uint32_t nr_buf;
> +    uint32_t buf_size;
> +};
> +
> +static struct rte_mempool *src_pool;
> +static struct rte_mempool *dst_pool;
> +
> +uint16_t dmadev_ids[MAX_WORKER_NB];
> +uint32_t nb_dmadevs;
> +
> +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> +
> +static inline int
> +__rte_format_printf(3, 4)
> +print_err(const char *func, int lineno, const char *format, ...)
> +{
> +    va_list ap;
> +    int ret;
> +
> +    ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> +    va_start(ap, format);
> +    ret += vfprintf(stderr, format, ap);
> +    va_end(ap);
> +
> +    return ret;
> +}
> +
> +static inline void
> +calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
> +        uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
> +        float *bandwidth, uint64_t *ops)
> +{
> +    *memory = (p->buf_size * p->nr_buf * 2) / (1024 * 1024);
> +    *ave_cycle = cp_cycle_sum / (p->repeat_times * p->nr_buf);
> +    *bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
> +    *ops = (double)p->nr_buf * repeat_times / time_sec;
> +}
> +
> +static void
> +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
> +        uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> +        float bandwidth, uint64_t ops, bool is_dma)
> +{
> +    if (is_dma)
> +        printf("lcore %u, DMA %u:\n"
> +                "average cycles: %" PRIu64 ","
> +                " buffer size: %u, nr_buf: %u,"
> +                " memory: %uMB, frequency: %" PRIu64 ".\n",
> +                lcore_id,
> +                dev_id,
> +                ave_cycle,
> +                buf_size,
> +                nr_buf,
> +                memory,
> +                rte_get_timer_hz());

Longer lines are allowed for strings, so you can merge each line of output
to a single line, which will improve readability. Also, to shorten the
code, there is no reason each parameter needs to go on its own line.

> +    else
> +        printf("lcore %u\n"
> +                "average cycles: %" PRIu64 ","
> +                " buffer size: %u, nr_buf: %u,"
> +                " memory: %uMB, frequency: %" PRIu64 ".\n",
> +                lcore_id,
> +                ave_cycle,
> +                buf_size,
> +                nr_buf,
> +                memory,
> +                rte_get_timer_hz());

Suggestion: rather than duplicating the whole output, only the first line
needs to change based on SW vs HW copies. How about:

    if (is_dma)
        printf("lcore %u, DMA %u\n", lcore_id, dev_id);
    else
        printf("lcore %u\n", lcore_id);
    printf("average cycles: ...", ...);

> +
> +    printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
> +
> +    if (is_dma)
> +        snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> +            CSV_LINE_DMA_FMT,
> +            scenario_id, lcore_id, dev_id, buf_size,
> +            nr_buf, memory, ave_cycle, bandwidth, ops);
> +    else
> +        snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> +            CSV_LINE_CPU_FMT,
> +            scenario_id, lcore_id, buf_size,
> +            nr_buf, memory, ave_cycle, bandwidth, ops);
> +}
> +
> +static inline void
> +cache_flush_buf(void *arg)

For non-x86 builds, you probably need to mark "arg" as unused to avoid
compiler warnings.

Why is the parameter type given as a void pointer, when the type is
unconditionally cast below as "struct buf_info"? A void pointer type
should only be needed if you need to call this via a generic function
pointer.

> +{
> +#ifdef RTE_ARCH_X86_64
> +    char *data;
> +    char *addr;
> +    struct buf_info *info = arg;
> +    struct rte_mbuf **srcs = info->array;
> +    uint32_t i, k;
> +
> +    for (i = 0; i < info->nr_buf; i++) {
> +        data = rte_pktmbuf_mtod(srcs[i], char *);
> +        for (k = 0; k < info->buf_size / 64; k++) {
> +            addr = (k * 64 + data);
> +            __builtin_ia32_clflush(addr);
> +        }

The inner loop may be shorter by incrementing the loop variable by 64,
rather than dividing and then multiplying, since you can eliminate the
variable "addr". It can also be more readable with a variable rename:

    for (offset = 0; offset < info->buf_size; offset += 64)
        __builtin_ia32_clflush(data + offset);

> +    }
> +#endif
> +}
> +
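Putting those points together — taking the three values directly rather
than via struct buf_info (see also the related comment further below),
marking the parameters unused for non-x86 builds, and stepping a byte
offset through each buffer — an untested sketch of the whole function
could look like:

    static void
    cache_flush_buf(struct rte_mbuf **array __rte_unused,
            uint32_t buf_size __rte_unused,
            uint32_t nr_buf __rte_unused)
    {
    #ifdef RTE_ARCH_X86_64
        uint32_t i, offset;

        for (i = 0; i < nr_buf; i++) {
            /* flush each 64-byte cacheline of the buffer */
            char *data = rte_pktmbuf_mtod(array[i], char *);

            for (offset = 0; offset < buf_size; offset += 64)
                __builtin_ia32_clflush(data + offset);
        }
    #endif
    }

The call sites would then pass the mbuf array, buf_size and nr_buf
directly, and struct buf_info could be dropped entirely.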
> + */
> +static void
> +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
> +{
> +    uint16_t vchan = 0;
> +    struct rte_dma_info info;
> +    struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> +    struct rte_dma_vchan_conf qconf = {
> +        .direction = RTE_DMA_DIR_MEM_TO_MEM,
> +        .nb_desc = ring_size
> +    };
> +
> +    if (rte_dma_configure(dev_id, &dev_config) != 0)
> +        rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
> +
> +    if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0) {
> +        printf("Error with queue configuration\n");
> +        rte_panic();
> +    }
> +

Inconsistency here - and below too. Either use rte_exit on failure or use
rte_panic, but don't mix them. Panic seems a little severe, so I suggest
just using rte_exit() in all cases.

> +    rte_dma_info_get(dev_id, &info);
> +    if (info.nb_vchans != 1) {
> +        printf("Error, no configured queues reported on device id %u\n", dev_id);
> +        rte_panic();
> +    }
> +    if (rte_dma_start(dev_id) != 0)
> +        rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
> +}
> +
> +static int
> +config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
> +{
> +    int16_t dev_id = rte_dma_next_dev(0);
> +    uint32_t i;
> +
> +    nb_dmadevs = 0;
> +
> +    for (i = 0; i < nb_workers; i++) {
> +        if (dev_id == -1)
> +            goto end;
> +
> +        dmadev_ids[i] = dev_id;
> +        configure_dmadev_queue(dmadev_ids[i], ring_size);
> +        dev_id = rte_dma_next_dev(dev_id + 1);
> +        ++nb_dmadevs;

Very minor nit, but I'd suggest swapping these last two lines, incrementing
nb_dmadevs right after configuring the device, but before finding a new
one. It just makes more sense to me.

> +    }
> +
> +end:
> +    if (nb_dmadevs < nb_workers) {
> +        printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
> +        return -1;
> +    }
> +
> +    RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
> +
> +    return 0;
> +}
> +
> +static inline void
> +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
> +        uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
> +{
> +    int64_t async_cnt = 0;
> +    int nr_cpl = 0;
> +    uint32_t index;
> +    uint16_t offset;
> +    uint32_t i;
> +
> +    for (offset = 0; offset < mpool_iter_step; offset++) {
> +        for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {

Assignment in the condition part of a loop seems wrong. I suggest
reworking this to avoid it.

> +            if (unlikely(rte_dma_copy(dev_id,
> +                    0,
> +                    srcs[index]->buf_iova + srcs[index]->data_off,
> +                    dsts[index]->buf_iova + dsts[index]->data_off,

rte_pktmbuf_iova() macro can be used here.

> +                    buf_size,
> +                    0) < 0)) {
> +                rte_dma_submit(dev_id, 0);
> +                while (rte_dma_burst_capacity(dev_id, 0) == 0) {
> +                    nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +                    async_cnt -= nr_cpl;
> +                }
> +                if (rte_dma_copy(dev_id,
> +                        0,
> +                        srcs[index]->buf_iova + srcs[index]->data_off,
> +                        dsts[index]->buf_iova + dsts[index]->data_off,
> +                        buf_size,
> +                        0) < 0) {
> +                    printf("enqueue fail again at %u\n", index);
> +                    printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
> +                    rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
> +                }
> +            }
> +            async_cnt++;
> +
> +            /**
> +             * When '&' is used to wrap an index, mask must be a power of 2.
> +             * That is, kick_batch must be 2^n.

I assume that is checked on input processing when parsing the config file?

> +             */
> +            if (unlikely((async_cnt % kick_batch) == 0)) {

This is an expected condition that will occur with repeatable frequency.
Therefore, unlikely is not really appropriate.
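Putting the loop-condition and rte_pktmbuf_iova() points together, the
enqueue loop could be reshaped roughly as below. This is an untested
sketch reusing the existing locals ("i" is no longer needed), with the
ring-full retry path elided and unchanged from the original:

    for (offset = 0; offset < mpool_iter_step; offset++) {
        for (index = offset; index < nr_buf; index += mpool_iter_step) {
            if (rte_dma_copy(dev_id, 0, rte_pktmbuf_iova(srcs[index]),
                    rte_pktmbuf_iova(dsts[index]), buf_size, 0) < 0) {
                /* ring full: submit, poll for completions, retry as before */
            }
            async_cnt++;

            if ((async_cnt % kick_batch) == 0) {
                rte_dma_submit(dev_id, 0);
                nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
                async_cnt -= nr_cpl;
            }
        }
    }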
> +                rte_dma_submit(dev_id, 0);
> +                /* add a poll to avoid ring full */
> +                nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +                async_cnt -= nr_cpl;
> +            }
> +        }
> +
> +        rte_dma_submit(dev_id, 0);
> +        while (async_cnt > 0) {
> +            nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> +            async_cnt -= nr_cpl;
> +        }

Do we need a timeout here or in the loop above in case of errors that
cause us to not get all the elements back?

> +    }
> +}
> +
> +static int
> +dma_mem_copy(void *p)
> +{

I see the call to this function within "remote_launch" uses a cast on the
function. I don't think that typecast should be necessary, but if you keep
it, you can avoid using the void pointer here and just take a
"struct lcore_params *" parameter directly.

> +    uint64_t ops;
> +    uint32_t memory;
> +    float bandwidth;
> +    double time_sec;
> +    uint32_t lcore_id = rte_lcore_id();
> +    struct lcore_params *params = (struct lcore_params *)p;
> +    uint32_t repeat_times = params->repeat_times;
> +    uint32_t buf_size = params->buf_size;
> +    uint16_t kick_batch = params->kick_batch;
> +    uint32_t lcore_nr_buf = params->nr_buf;
> +    uint16_t dev_id = params->dev_id;
> +    uint16_t mpool_iter_step = params->mpool_iter_step;
> +    struct rte_mbuf **srcs = params->srcs;
> +    struct rte_mbuf **dsts = params->dsts;
> +    uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> +    uint32_t r;
> +
> +    begin = rte_rdtsc();
> +
> +    for (r = 0; r < repeat_times; r++)
> +        do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
> +            mpool_iter_step, srcs, dsts);
> +
> +    end = rte_rdtsc();
> +    total_cycles = end - begin;

You can do without "end" easily enough:

    total_cycles = rte_rdtsc() - begin;

> +    time_sec = (double)total_cycles / rte_get_timer_hz();
> +
> +    calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> +            &avg_cycles, &bandwidth, &ops);
> +    output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
> +            memory, bandwidth, ops, true);
> +
> +    rte_free(p);
> +
> +    return 0;
> +}
> +
> +static int
> +cpu_mem_copy(void *p)
> +{

Most of the comments from above also apply here.
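Also, going back to the timeout question above: a minimal sketch of a
bounded version of the final drain loop, assuming a new uint64_t
"deadline" local and with the one-second budget purely illustrative:

    rte_dma_submit(dev_id, 0);
    deadline = rte_rdtsc() + rte_get_timer_hz();    /* give up after ~1 second */
    while (async_cnt > 0) {
        nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
        async_cnt -= nr_cpl;
        if (rte_rdtsc() > deadline)
            rte_exit(EXIT_FAILURE, "Timed out with %" PRId64 " copies outstanding\n",
                    async_cnt);
    }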
> +    uint32_t idx;
> +    uint32_t lcore_id;
> +    uint32_t memory;
> +    uint64_t ops;
> +    float bandwidth;
> +    double time_sec;
> +    struct lcore_params *params = (struct lcore_params *)p;
> +    uint32_t repeat_times = params->repeat_times;
> +    uint32_t buf_size = params->buf_size;
> +    uint32_t lcore_nr_buf = params->nr_buf;
> +    uint16_t mpool_iter_step = params->mpool_iter_step;
> +    struct rte_mbuf **srcs = params->srcs;
> +    struct rte_mbuf **dsts = params->dsts;
> +    uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> +    uint32_t k, j, offset;
> +
> +    begin = rte_rdtsc();
> +
> +    for (k = 0; k < repeat_times; k++) {
> +        /* copy buffer form src to dst */
> +        for (offset = 0; offset < mpool_iter_step; offset++) {
> +            for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
> +                rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
> +                    (void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
> +                    (size_t)buf_size);
> +            }
> +        }
> +    }
> +
> +    end = rte_rdtsc();
> +    total_cycles = end - begin;
> +    time_sec = (double)total_cycles / rte_get_timer_hz();
> +
> +    lcore_id = rte_lcore_id();
> +
> +    calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> +            &avg_cycles, &bandwidth, &ops);
> +    output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
> +            memory, bandwidth, ops, false);
> +
> +    rte_free(p);
> +
> +    return 0;
> +}
> +
> +static int
> +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> +            struct rte_mbuf ***dsts)
> +{
> +    uint32_t i;
> +    unsigned int buf_size = cfg->buf_size.cur;
> +    unsigned int nr_sockets;
> +    uint32_t nr_buf = cfg->nr_buf;
> +
> +    nr_sockets = rte_socket_count();
> +    if (cfg->src_numa_node >= nr_sockets ||
> +        cfg->dst_numa_node >= nr_sockets) {
> +        printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
> +        return -1;
> +    }
> +
> +    src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> +            nr_buf, /* n == num elements */
> +            64,  /* cache size */
> +            0,   /* priv size */
> +            buf_size + RTE_PKTMBUF_HEADROOM,
> +            cfg->src_numa_node);
> +    if (src_pool == NULL) {
> +        PRINT_ERR("Error with source mempool creation.\n");
> +        return -1;
> +    }
> +
> +    dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> +            nr_buf, /* n == num elements */
> +            64,  /* cache size */
> +            0,   /* priv size */
> +            buf_size + RTE_PKTMBUF_HEADROOM,
> +            cfg->dst_numa_node);
> +    if (dst_pool == NULL) {
> +        PRINT_ERR("Error with destination mempool creation.\n");
> +        return -1;
> +    }
> +
> +    *srcs = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));

Typecasts from void * to other pointer types aren't actually necessary
in C.

I note some inconsistency in this file with regard to malloc. Here you use
regular malloc, while when building the parameters to pass to the memcpy
functions you use rte_malloc. I suggest standardizing on one or the other
rather than mixing.

> +    if (*srcs == NULL) {
> +        printf("Error: srcs malloc failed.\n");
> +        return -1;
> +    }
> +
> +    *dsts = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));
> +    if (*dsts == NULL) {
> +        printf("Error: dsts malloc failed.\n");
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nr_buf; i++) {
> +        (*srcs)[i] = rte_pktmbuf_alloc(src_pool);
> +        (*dsts)[i] = rte_pktmbuf_alloc(dst_pool);

Rather than individually allocating, you may well manage with
rte_mempool_get_bulk() to allocate all mbufs in one call.
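That is, something along these lines (untested) in place of the
per-buffer alloc loop — noting that mbufs taken straight from the mempool
are not reset the way rte_pktmbuf_alloc() resets them, which is likely
fine here given how little of the mbuf metadata the test actually uses:

    /* grab all src and dst mbufs in one call each */
    if (rte_mempool_get_bulk(src_pool, (void **)*srcs, nr_buf) != 0 ||
            rte_mempool_get_bulk(dst_pool, (void **)*dsts, nr_buf) != 0) {
        printf("Error: mbuf bulk allocation failed.\n");
        return -1;
    }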
> +        if ((!(*srcs)[i]) || (!(*dsts)[i])) {
> +            printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
> +            return -1;
> +        }
> +
> +        (*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
> +        (*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;

rte_pktmbuf_append() can be used here, rather than setting the lengths
manually. However, these values are not actually used anywhere else in
the code, I believe, so setting them is unnecessary. You are manually
tracking the copy lengths throughout the test, and nothing else is
working on the mbufs, so the length the mbuf reports is immaterial.

> +    }
> +
> +    return 0;
> +}
> +
> +void
> +dma_mem_copy_benchmark(struct test_configure *cfg)
> +{
> +    uint32_t i;
> +    uint32_t offset;
> +    unsigned int lcore_id = 0;
> +    struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +    unsigned int buf_size = cfg->buf_size.cur;
> +    uint16_t kick_batch = cfg->kick_batch.cur;
> +    uint16_t mpool_iter_step = cfg->mpool_iter_step;
> +    uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +    uint16_t nb_workers = cfg->nb_workers;
> +    uint32_t repeat_times = cfg->repeat_times;
> +
> +    if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +        goto out;
> +
> +    if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
> +        goto out;
> +
> +    if (cfg->cache_flush) {
> +        struct buf_info info;
> +
> +        info.array = srcs;
> +        info.buf_size = buf_size;
> +        info.nr_buf = nr_buf;
> +        cache_flush_buf(&info);
> +

From what I can see, struct buf_info is only used for passing parameters
to the cache_flush_buf function. The code would be a lot simpler to remove
the structure and just pass 3 parameters to the function directly.

> +        info.array = dsts;
> +        cache_flush_buf(&info);
> +        rte_mb();
> +    }
> +
> +    printf("Start testing....\n");
> +
> +    for (i = 0; i < nb_workers; i++) {
> +        lcore_id = rte_get_next_lcore(lcore_id, true, true);
> +        offset = nr_buf / nb_workers * i;
> +
> +        struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> +        if (!p) {
> +            printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> +            break;
> +        }
> +        *p = (struct lcore_params) {
> +            dmadev_ids[i],
> +            (uint32_t)(nr_buf/nb_workers),
> +            kick_batch,
> +            buf_size,
> +            repeat_times,
> +            mpool_iter_step,
> +            srcs + offset,
> +            dsts + offset,
> +            cfg->scenario_id
> +        };
> +
> +        rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
> +    }
> +
> +    rte_eal_mp_wait_lcore();
> +
> +out:
> +    /* free env */
> +    if (srcs) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(srcs[i]);
> +        free(srcs);
> +    }
> +    if (dsts) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(dsts[i]);
> +        free(dsts);
> +    }
> +
> +    if (src_pool)
> +        rte_mempool_free(src_pool);
> +    if (dst_pool)
> +        rte_mempool_free(dst_pool);
> +
> +    for (i = 0; i < nb_dmadevs; i++) {
> +        printf("Stopping dmadev %d\n", dmadev_ids[i]);
> +        rte_dma_stop(dmadev_ids[i]);
> +    }
> +}
> +
> +void
> +cpu_mem_copy_benchmark(struct test_configure *cfg)
> +{
> +    uint32_t i, offset;
> +    uint32_t repeat_times = cfg->repeat_times;
> +    uint32_t kick_batch = cfg->kick_batch.cur;
> +    uint32_t buf_size = cfg->buf_size.cur;
> +    uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> +    uint16_t nb_workers = cfg->nb_workers;
> +    uint16_t mpool_iter_step = cfg->mpool_iter_step;
> +    struct rte_mbuf **srcs = NULL, **dsts = NULL;
> +    unsigned int lcore_id = 0;
> +
> +    if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> +        goto out;
> +
> +    for (i = 0; i < nb_workers; i++) {
> +        lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
> +        offset = nr_buf / nb_workers * i;
> +        struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> +        if (!p) {
> +            printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> +            break;
> +        }
> +        *p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
> +                        buf_size, repeat_times, mpool_iter_step,
> +                        srcs + offset, dsts + offset, cfg->scenario_id };

Formatting should be the same as in the function above.

> +        rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
> +    }
> +
> +    rte_eal_mp_wait_lcore();
> +
> +out:
> +    /* free env */
> +    if (srcs) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(srcs[i]);
> +        free(srcs);
> +    }
> +    if (dsts) {
> +        for (i = 0; i < nr_buf; i++)
> +            rte_pktmbuf_free(dsts[i]);
> +        free(dsts);
> +    }
> +
> +    if (src_pool)
> +        rte_mempool_free(src_pool);
> +    if (dst_pool)
> +        rte_mempool_free(dst_pool);
> +}

There seems to be quite a bit of common code between
dma_mem_copy_benchmark and cpu_mem_copy_benchmark. It might be worth
investigating if they can be merged while still keeping readability.
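As a purely hypothetical outline of what such a merge might look like — a
single function taking an "is_dma" flag, with the setup, launch loop and
cleanup shared (the cache-flush step and the differing lcore-selection
logic of the two originals would still need reconciling):

    static void
    mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
    {
        uint32_t i, offset;
        unsigned int lcore_id = 0;
        struct rte_mbuf **srcs = NULL, **dsts = NULL;
        uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
        uint16_t nb_workers = cfg->nb_workers;

        if (setup_memory_env(cfg, &srcs, &dsts) < 0)
            goto out;
        if (is_dma && config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
            goto out;

        for (i = 0; i < nb_workers; i++) {
            lcore_id = rte_get_next_lcore(lcore_id, true, true);
            offset = nr_buf / nb_workers * i;

            struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
            if (p == NULL) {
                printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
                break;
            }
            *p = (struct lcore_params) {
                is_dma ? dmadev_ids[i] : 0,
                nr_buf / nb_workers,
                cfg->kick_batch.cur,
                cfg->buf_size.cur,
                cfg->repeat_times,
                cfg->mpool_iter_step,
                srcs + offset,
                dsts + offset,
                cfg->scenario_id
            };
            /* both workers match lcore_function_t, so no cast is needed */
            rte_eal_remote_launch(is_dma ? dma_mem_copy : cpu_mem_copy, p, lcore_id);
        }

        rte_eal_mp_wait_lcore();

    out:
        /* shared cleanup, as in the two existing functions */
        if (srcs != NULL) {
            for (i = 0; i < nr_buf; i++)
                rte_pktmbuf_free(srcs[i]);
            free(srcs);
        }
        if (dsts != NULL) {
            for (i = 0; i < nr_buf; i++)
                rte_pktmbuf_free(dsts[i]);
            free(dsts);
        }
        rte_mempool_free(src_pool);
        rte_mempool_free(dst_pool);

        if (is_dma) {
            for (i = 0; i < nb_dmadevs; i++) {
                printf("Stopping dmadev %d\n", dmadev_ids[i]);
                rte_dma_stop(dmadev_ids[i]);
            }
        }
    }

The two current entry points would then reduce to
mem_copy_benchmark(cfg, true) and mem_copy_benchmark(cfg, false).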
> diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
> new file mode 100644
> index 0000000000..f5ad8d6d99
> --- /dev/null
> +++ b/app/test-dma-perf/benchmark.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#ifndef _BENCHMARK_H_
> +#define _BENCHMARK_H_
> +
> +void dma_mem_copy_benchmark(struct test_configure *cfg);
> +
> +void cpu_mem_copy_benchmark(struct test_configure *cfg);
> +
> +#endif /* _BENCHMARK_H_ */

You don't really need two separate headers in this application. Both
main.h and benchmark.h can be merged into one header, since both are
always included in both c files.

> diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> new file mode 100644
> index 0000000000..e24bb19414
> --- /dev/null
> +++ b/app/test-dma-perf/config.ini
> @@ -0,0 +1,61 @@
> +
> +; Supported test types:
> +; DMA_MEM_COPY|CPU_MEM_COPY
> +
> +; Parameters:
> +; "mem_size","buf_size","dma_ring_size","kick_batch".
> +; "mem_size" means the size of the memory footprint.
> +; "buf_size" means the memory size of a single operation.
> +; "dma_ring_size" means the dma ring buffer size.
> +; "kick_batch" means dma operation batch size.
> +
> +; Format: variable=first[,last,increment[,ADD|MUL]]
> +; ADD is the default mode.
> +
> +; src_numa_node is used to control the numa node where the source memory is allocated.
> +; dst_numa_node is used to control the numa node where the destination memory is allocated.
> +
> +; cache_flush is used to control if the cache should be flushed.
> +
> +; repeat_times is used to control the repeat times of the whole case.
> +
> +; worker_threads is used to control the threads number of the test app.
> +; It should be less than the core number.
> +
> +; mpool_iter_step is used to control the buffer continuity.
> +
> +; Bind DMA to lcore:
> +; Specify the "lcore_dma" parameter.
> +; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
> +; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
> +; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
> +; their range.
> +
> +[case1]
> +type=DMA_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=0
> +cache_flush=0
> +repeat_times=10
> +worker_threads=1
> +mpool_iter_step=1
> +lcore_dma=lcore3@0000:00:04.0
> +eal_args=--legacy-mem --file-prefix=test
> +
> +[case2]
> +type=CPU_MEM_COPY
> +mem_size=10
> +buf_size=64,8192,2,MUL
> +dma_ring_size=1024
> +kick_batch=32
> +src_numa_node=0
> +dst_numa_node=1
> +cache_flush=0
> +repeat_times=100
> +worker_threads=1
> +mpool_iter_step=1
> +eal_args=--no-pci
> diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> new file mode 100644
> index 0000000000..94ba369539
> --- /dev/null
> +++ b/app/test-dma-perf/main.c
> @@ -0,0 +1,434 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#if !defined(RTE_EXEC_ENV_LINUX)
> +
> +int
> +main(int argc, char *argv[])
> +{
> +    printf("OS not supported, skipping test\n");
> +    return 0;
> +}
> +

What is Linux-specific about this app? If we do need to limit the app to
Linux only, I suggest using meson to do so rather than putting #ifdefs in
the code.

> +#else
> +
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <signal.h>

<snip>
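On that meson point: assuming this app follows the usual DPDK pattern
where a per-app meson.build can set "build" and "reason" to skip a
component, the restriction could live in app/test-dma-perf/meson.build
instead, e.g.:

    if not is_linux
        build = false
        reason = 'only supported on Linux'
    endif

and the #if/#else block around main() in main.c could then be dropped.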