Hi Bruce,

Thanks for your comments. Replies are inline; I'll fix them in the next version.
Thanks,
Cheng

> -----Original Message-----
> From: Richardson, Bruce <bruce.richard...@intel.com>
> Sent: Tuesday, January 17, 2023 9:00 PM
> To: Jiang, Cheng1 <cheng1.ji...@intel.com>
> Cc: tho...@monjalon.net; m...@smartsharesystems.com; dev@dpdk.org;
> Hu, Jiayu <jiayu...@intel.com>; Ding, Xuan <xuan.d...@intel.com>;
> Ma, WenwuX <wenwux...@intel.com>; Wang, YuanX <yuanx.w...@intel.com>;
> He, Xingguang <xingguang...@intel.com>
> Subject: Re: [PATCH v2] app/dma-perf: introduce dma-perf application
>
> On Tue, Jan 17, 2023 at 01:56:23AM +0000, Cheng Jiang wrote:
> > There are many high-performance DMA devices supported in DPDK now,
> > and these DMA devices can also be integrated into other modules of
> > DPDK as accelerators, such as Vhost. Before integrating DMA into
> > applications, developers need to know the performance of these DMA
> > devices in various scenarios and the performance of CPUs in the same
> > scenario, such as different buffer lengths. Only in this way can we
> > know the target performance of the application accelerated by using
> > them. This patch introduces a high-performance testing tool, which
> > supports comparing the performance of CPU and DMA in different
> > scenarios automatically with a pre-set config file. Memory copy
> > performance tests are supported for now.
> >
> > Signed-off-by: Cheng Jiang <cheng1.ji...@intel.com>
> > Signed-off-by: Jiayu Hu <jiayu...@intel.com>
> > Signed-off-by: Yuan Wang <yuanx.w...@intel.com>
> > Acked-by: Morten Brørup <m...@smartsharesystems.com>
> > ---
> > v2: fixed some CI issues.
>
> Some first review comments inline below. More will likely follow as I
> review it further and try testing it out.
>
> /Bruce
>
> >
> >  app/meson.build               |   1 +
> >  app/test-dma-perf/benchmark.c | 539 ++++++++++++++++++++++++++++++++++
> >  app/test-dma-perf/benchmark.h |  12 +
> >  app/test-dma-perf/config.ini  |  61 ++++
> >  app/test-dma-perf/main.c      | 434 +++++++++++++++++++++++++++
> >  app/test-dma-perf/main.h      |  53 ++++
> >  app/test-dma-perf/meson.build |  22 ++
> >  7 files changed, 1122 insertions(+)
> >  create mode 100644 app/test-dma-perf/benchmark.c
> >  create mode 100644 app/test-dma-perf/benchmark.h
> >  create mode 100644 app/test-dma-perf/config.ini
> >  create mode 100644 app/test-dma-perf/main.c
> >  create mode 100644 app/test-dma-perf/main.h
> >  create mode 100644 app/test-dma-perf/meson.build
> >
> > diff --git a/app/meson.build b/app/meson.build
> > index e32ea4bd5c..a060ad2725 100644
> > --- a/app/meson.build
> > +++ b/app/meson.build
> > @@ -28,6 +28,7 @@ apps = [
> >          'test-regex',
> >          'test-sad',
> >          'test-security-perf',
> > +        'test-dma-perf',
> >  ]
>
> Lists in DPDK are always alphabetical when no other order is required,
> therefore this new app should be further up the list, after
> "test-crypto-perf".

Sure, I'll fix it in the next version.
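That is, something like this (assuming the surrounding entries in the list stay as they are today):

        'test-compress-perf',
        'test-crypto-perf',
        'test-dma-perf',
        'test-eventdev',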
> >
> >  default_cflags = machine_args + ['-DALLOW_EXPERIMENTAL_API']
> > diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
> > new file mode 100644
> > index 0000000000..1cb5b0b291
> > --- /dev/null
> > +++ b/app/test-dma-perf/benchmark.c
> > @@ -0,0 +1,539 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > +
> > +#include <inttypes.h>
> > +
> > +#include <rte_time.h>
> > +#include <rte_mbuf.h>
> > +#include <rte_dmadev.h>
> > +#include <rte_malloc.h>
> > +#include <rte_lcore.h>
> > +
> > +#include "main.h"
> > +#include "benchmark.h"
> > +
> > +
> > +#define MAX_DMA_CPL_NB 255
> > +
> > +#define CSV_LINE_DMA_FMT "Scenario %u,%u,%u,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> > +#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,%u,%u,%u,%" PRIu64 ",%.3lf,%" PRIu64 "\n"
> > +
> > +struct lcore_params {
> > +	uint16_t dev_id;
> > +	uint32_t nr_buf;
> > +	uint16_t kick_batch;
> > +	uint32_t buf_size;
> > +	uint32_t repeat_times;
> > +	uint16_t mpool_iter_step;
> > +	struct rte_mbuf **srcs;
> > +	struct rte_mbuf **dsts;
> > +	uint8_t scenario_id;
> > +};
> > +
> > +struct buf_info {
> > +	struct rte_mbuf **array;
> > +	uint32_t nr_buf;
> > +	uint32_t buf_size;
> > +};
> > +
> > +static struct rte_mempool *src_pool;
> > +static struct rte_mempool *dst_pool;
> > +
> > +uint16_t dmadev_ids[MAX_WORKER_NB];
> > +uint32_t nb_dmadevs;
> > +
> > +#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> > +
> > +static inline int
> > +__rte_format_printf(3, 4)
> > +print_err(const char *func, int lineno, const char *format, ...)
> > +{
> > +	va_list ap;
> > +	int ret;
> > +
> > +	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
> > +	va_start(ap, format);
> > +	ret += vfprintf(stderr, format, ap);
> > +	va_end(ap);
> > +
> > +	return ret;
> > +}
> > +
> > +static inline void
> > +calc_result(struct lcore_params *p, uint64_t cp_cycle_sum, double time_sec,
> > +			uint32_t repeat_times, uint32_t *memory, uint64_t *ave_cycle,
> > +			float *bandwidth, uint64_t *ops)
> > +{
> > +	*memory = (p->buf_size * p->nr_buf * 2) / (1024 * 1024);
> > +	*ave_cycle = cp_cycle_sum / (p->repeat_times * p->nr_buf);
> > +	*bandwidth = p->buf_size * 8 * rte_get_timer_hz() / (*ave_cycle * 1000 * 1000 * 1000.0);
> > +	*ops = (double)p->nr_buf * repeat_times / time_sec;
> > +}
> > +
> > +static void
> > +output_result(uint8_t scenario_id, uint32_t lcore_id, uint16_t dev_id, uint64_t ave_cycle,
> > +			uint32_t buf_size, uint32_t nr_buf, uint32_t memory,
> > +			float bandwidth, uint64_t ops, bool is_dma)
> > +{
> > +	if (is_dma)
> > +		printf("lcore %u, DMA %u:\n"
> > +				"average cycles: %" PRIu64 ","
> > +				" buffer size: %u, nr_buf: %u,"
> > +				" memory: %uMB, frequency: %" PRIu64 ".\n",
> > +				lcore_id,
> > +				dev_id,
> > +				ave_cycle,
> > +				buf_size,
> > +				nr_buf,
> > +				memory,
> > +				rte_get_timer_hz());
>
> Longer lines are allowed for strings, so you can merge each line of
> output to a single line, which will improve readability.
> Also, to shorten the code, there is no reason each parameter needs to
> go on its own line.

Yes, that totally makes sense. I'll fix it in the next version.
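Together with your next suggestion about only varying the first line, I'm thinking the whole block could collapse to something like this (untested sketch):

	if (is_dma)
		printf("lcore %u, DMA %u:\n", lcore_id, dev_id);
	else
		printf("lcore %u:\n", lcore_id);
	printf("average cycles: %" PRIu64 ", buffer size: %u, nr_buf: %u, memory: %uMB, frequency: %" PRIu64 ".\n",
			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz());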
>
> > +	else
> > +		printf("lcore %u\n"
> > +				"average cycles: %" PRIu64 ","
> > +				" buffer size: %u, nr_buf: %u,"
> > +				" memory: %uMB, frequency: %" PRIu64 ".\n",
> > +				lcore_id,
> > +				ave_cycle,
> > +				buf_size,
> > +				nr_buf,
> > +				memory,
> > +				rte_get_timer_hz());
>
> Suggestion, rather than duplicating the whole output, only the first
> line needs to change based on SW vs HW copies. How about:
>
> 	if (is_dma)
> 		printf("lcore %u, DMA %u\n", lcore_id, dev_id);
> 	else
> 		printf("lcore %u\n", lcore_id);
> 	printf("average cycles: ..." , ...);
>

Got it, good point. I'll fix it.

> > +
> > +	printf("Average bandwidth: %.3lfGbps, OPS: %" PRIu64 "\n", bandwidth, ops);
> > +
> > +	if (is_dma)
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > +			CSV_LINE_DMA_FMT,
> > +			scenario_id, lcore_id, dev_id, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, ops);
> > +	else
> > +		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN,
> > +			CSV_LINE_CPU_FMT,
> > +			scenario_id, lcore_id, buf_size,
> > +			nr_buf, memory, ave_cycle, bandwidth, ops);
> > +}
> > +
> > +static inline void
> > +cache_flush_buf(void *arg)
>
> For non-x86 builds, you probably need to mark "arg" as unused to avoid
> compiler warnings.

Sure, I was wondering how to avoid the compiler warnings, thanks for your advice.

> Why is the parameter type given as a void pointer, when the type is
> unconditionally cast below as "struct buf_info"? Void pointer type
> should only be needed if you need to call this via a generic function
> pointer.

You are right, I'll fix it.

> > +{
> > +#ifdef RTE_ARCH_X86_64
> > +	char *data;
> > +	char *addr;
> > +	struct buf_info *info = arg;
> > +	struct rte_mbuf **srcs = info->array;
> > +	uint32_t i, k;
> > +
> > +	for (i = 0; i < info->nr_buf; i++) {
> > +		data = rte_pktmbuf_mtod(srcs[i], char *);
> > +		for (k = 0; k < info->buf_size / 64; k++) {
> > +			addr = (k * 64 + data);
> > +			__builtin_ia32_clflush(addr);
> > +		}
>
> inner loop may be shorter by incrementing loop var by 64, rather than
> dividing and then multiplying, since you can eliminate variable "addr".
> Also can be more readable with a variable rename:
>
> 	for (offset = 0; offset < info->buf_size; offset += 64)
> 		__builtin_ia32_clflush(data + offset);
>

Sure, that totally makes sense to me, thanks.

> > +	}
> > +#endif
> > +}
> > +
> > +/* Configuration of device. */
> > +static void
> > +configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
> > +{
> > +	uint16_t vchan = 0;
> > +	struct rte_dma_info info;
> > +	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
> > +	struct rte_dma_vchan_conf qconf = {
> > +		.direction = RTE_DMA_DIR_MEM_TO_MEM,
> > +		.nb_desc = ring_size
> > +	};
> > +
> > +	if (rte_dma_configure(dev_id, &dev_config) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
> > +
> > +	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0) {
> > +		printf("Error with queue configuration\n");
> > +		rte_panic();
> > +	}
> > +
>
> Inconsistency here - and below too. Either use rte_exit on failure or
> use rte_panic, but don't mix them. Panic seems a little severe, so I
> suggest just using rte_exit() in all cases.

Sure.
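To be concrete, I'd make every failure path here use rte_exit(), e.g. (sketch):

	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
		rte_exit(EXIT_FAILURE, "Error with queue configuration\n");

	rte_dma_info_get(dev_id, &info);
	if (info.nb_vchans != 1)
		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id %u\n", dev_id);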
>
> > +	rte_dma_info_get(dev_id, &info);
> > +	if (info.nb_vchans != 1) {
> > +		printf("Error, no configured queues reported on device id %u\n", dev_id);
> > +		rte_panic();
> > +	}
> > +	if (rte_dma_start(dev_id) != 0)
> > +		rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
> > +}
> > +
> > +static int
> > +config_dmadevs(uint32_t nb_workers, uint32_t ring_size)
> > +{
> > +	int16_t dev_id = rte_dma_next_dev(0);
> > +	uint32_t i;
> > +
> > +	nb_dmadevs = 0;
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		if (dev_id == -1)
> > +			goto end;
> > +
> > +		dmadev_ids[i] = dev_id;
> > +		configure_dmadev_queue(dmadev_ids[i], ring_size);
> > +		dev_id = rte_dma_next_dev(dev_id + 1);
> > +		++nb_dmadevs;
>
> Very minor nit, but I'd suggest swapping these last two lines,
> incrementing nb_dmadevs right after configuring the device, but before
> finding a new one. It just makes more sense to me.

Sure.

> > +	}
> > +
> > +end:
> > +	if (nb_dmadevs < nb_workers) {
> > +		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
> > +		return -1;
> > +	}
> > +
> > +	RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadevs);
> > +
> > +	return 0;
> > +}
> > +
> > +static inline void
> > +do_dma_mem_copy(uint16_t dev_id, uint32_t nr_buf, uint16_t kick_batch, uint32_t buf_size,
> > +			uint16_t mpool_iter_step, struct rte_mbuf **srcs, struct rte_mbuf **dsts)
> > +{
> > +	int64_t async_cnt = 0;
> > +	int nr_cpl = 0;
> > +	uint32_t index;
> > +	uint16_t offset;
> > +	uint32_t i;
> > +
> > +	for (offset = 0; offset < mpool_iter_step; offset++) {
> > +		for (i = 0; index = i * mpool_iter_step + offset, index < nr_buf; i++) {
>
> Assignment in the condition part of a loop seems wrong. I suggest
> reworking this to avoid it.

Sure, I'll rework it.

> > +			if (unlikely(rte_dma_copy(dev_id,
> > +					0,
> > +					srcs[index]->buf_iova + srcs[index]->data_off,
> > +					dsts[index]->buf_iova + dsts[index]->data_off,
>
> rte_pktmbuf_iova() macro can be used here.

Sure, sorry I missed it.

> > +					buf_size,
> > +					0) < 0)) {
> > +				rte_dma_submit(dev_id, 0);
> > +				while (rte_dma_burst_capacity(dev_id, 0) == 0) {
> > +					nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB,
> > +								NULL, NULL);
> > +					async_cnt -= nr_cpl;
> > +				}
> > +				if (rte_dma_copy(dev_id,
> > +						0,
> > +						srcs[index]->buf_iova + srcs[index]->data_off,
> > +						dsts[index]->buf_iova + dsts[index]->data_off,
> > +						buf_size,
> > +						0) < 0) {
> > +					printf("enqueue fail again at %u\n", index);
> > +					printf("space:%d\n", rte_dma_burst_capacity(dev_id, 0));
> > +					rte_exit(EXIT_FAILURE, "DMA enqueue failed\n");
> > +				}
> > +			}
> > +			async_cnt++;
> > +
> > +			/**
> > +			 * When '&' is used to wrap an index, mask must be a power of 2.
> > +			 * That is, kick_batch must be 2^n.
>
> I assume that is checked on input processing when parsing the config
> file?

I'll check it in the next version.

> > +			 */
> > +			if (unlikely((async_cnt % kick_batch) == 0)) {
>
> This is an expected condition that will occur with repeatable
> frequency. Therefore, unlikely is not really appropriate.

Sure, I'll reconsider it.
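For the index rework above, I'm thinking of stepping the index directly, which also lets me use the macro (untested sketch; the kick/poll retry path stays as it is now):

	for (offset = 0; offset < mpool_iter_step; offset++) {
		for (index = offset; index < nr_buf; index += mpool_iter_step) {
			if (unlikely(rte_dma_copy(dev_id, 0,
					rte_pktmbuf_iova(srcs[index]),
					rte_pktmbuf_iova(dsts[index]),
					buf_size, 0) < 0)) {
				/* ring full: submit, poll completions and retry as before */
			}
			async_cnt++;
		}
	}

And for the kick_batch constraint, rte_is_power_of_2() in the config-file parsing should cover it.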
>
> > +				rte_dma_submit(dev_id, 0);
> > +				/* add a poll to avoid ring full */
> > +				nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> > +				async_cnt -= nr_cpl;
> > +			}
> > +		}
> > +
> > +		rte_dma_submit(dev_id, 0);
> > +		while (async_cnt > 0) {
> > +			nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
> > +			async_cnt -= nr_cpl;
> > +		}
>
> Do we need a timeout here or in the loop above in case of errors that
> cause us to not get all the elements back?

Makes sense, I'll consider it in the next version. Thanks.

> > +	}
> > +}
> > +
> > +static int
> > +dma_mem_copy(void *p)
> > +{
>
> I see the call to this function within "remote_launch" uses a cast on
> the function. I don't think that typecast should be necessary, but if
> you keep it, you can avoid using the void pointer here and just mark
> the input type as "struct lcore_params" directly.

OK, makes sense to me.

> > +	uint64_t ops;
> > +	uint32_t memory;
> > +	float bandwidth;
> > +	double time_sec;
> > +	uint32_t lcore_id = rte_lcore_id();
> > +	struct lcore_params *params = (struct lcore_params *)p;
> > +	uint32_t repeat_times = params->repeat_times;
> > +	uint32_t buf_size = params->buf_size;
> > +	uint16_t kick_batch = params->kick_batch;
> > +	uint32_t lcore_nr_buf = params->nr_buf;
> > +	uint16_t dev_id = params->dev_id;
> > +	uint16_t mpool_iter_step = params->mpool_iter_step;
> > +	struct rte_mbuf **srcs = params->srcs;
> > +	struct rte_mbuf **dsts = params->dsts;
> > +	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> > +	uint32_t r;
> > +
> > +	begin = rte_rdtsc();
> > +
> > +	for (r = 0; r < repeat_times; r++)
> > +		do_dma_mem_copy(dev_id, lcore_nr_buf, kick_batch, buf_size,
> > +			mpool_iter_step, srcs, dsts);
> > +
> > +	end = rte_rdtsc();
> > +	total_cycles = end - begin;
>
> You can do without "end" easily enough:
> 	total_cycles = rte_rdtsc() - begin;

Got it, thanks for your advice.

> > +	time_sec = (double)total_cycles / rte_get_timer_hz();
> > +
> > +	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> > +			&avg_cycles, &bandwidth, &ops);
> > +	output_result(params->scenario_id, lcore_id, dev_id, avg_cycles, buf_size, lcore_nr_buf,
> > +			memory, bandwidth, ops, true);
> > +
> > +	rte_free(p);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +cpu_mem_copy(void *p)
> > +{
>
> Most of the comments from above also apply here.

Sure, I'll fix them in the next version.
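For the drain-loop timeout above, I'm thinking of something simple like this (sketch; DRAIN_MAX_POLLS and poll_retries are placeholder names, and the bound would still need tuning):

	uint32_t poll_retries = 0;

	rte_dma_submit(dev_id, 0);
	while (async_cnt > 0) {
		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
		async_cnt -= nr_cpl;
		/* bail out if the device stops returning completions */
		if (nr_cpl == 0 && ++poll_retries > DRAIN_MAX_POLLS) {
			PRINT_ERR("DMA drain timed out, %" PRId64 " copies not completed\n",
					async_cnt);
			break;
		}
	}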
>
> > +	uint32_t idx;
> > +	uint32_t lcore_id;
> > +	uint32_t memory;
> > +	uint64_t ops;
> > +	float bandwidth;
> > +	double time_sec;
> > +	struct lcore_params *params = (struct lcore_params *)p;
> > +	uint32_t repeat_times = params->repeat_times;
> > +	uint32_t buf_size = params->buf_size;
> > +	uint32_t lcore_nr_buf = params->nr_buf;
> > +	uint16_t mpool_iter_step = params->mpool_iter_step;
> > +	struct rte_mbuf **srcs = params->srcs;
> > +	struct rte_mbuf **dsts = params->dsts;
> > +	uint64_t begin, end, total_cycles = 0, avg_cycles = 0;
> > +	uint32_t k, j, offset;
> > +
> > +	begin = rte_rdtsc();
> > +
> > +	for (k = 0; k < repeat_times; k++) {
> > +		/* copy buffer from src to dst */
> > +		for (offset = 0; offset < mpool_iter_step; offset++) {
> > +			for (j = 0; idx = j * mpool_iter_step + offset, idx < lcore_nr_buf; j++) {
> > +				rte_memcpy((void *)(uintptr_t)rte_mbuf_data_iova(dsts[idx]),
> > +					(void *)(uintptr_t)rte_mbuf_data_iova(srcs[idx]),
> > +					(size_t)buf_size);
> > +			}
> > +		}
> > +	}
> > +
> > +	end = rte_rdtsc();
> > +	total_cycles = end - begin;
> > +	time_sec = (double)total_cycles / rte_get_timer_hz();
> > +
> > +	lcore_id = rte_lcore_id();
> > +
> > +	calc_result(params, total_cycles, time_sec, repeat_times, &memory,
> > +			&avg_cycles, &bandwidth, &ops);
> > +	output_result(params->scenario_id, lcore_id, 0, avg_cycles, buf_size, lcore_nr_buf,
> > +			memory, bandwidth, ops, false);
> > +
> > +	rte_free(p);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
> > +			struct rte_mbuf ***dsts)
> > +{
> > +	uint32_t i;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	unsigned int nr_sockets;
> > +	uint32_t nr_buf = cfg->nr_buf;
> > +
> > +	nr_sockets = rte_socket_count();
> > +	if (cfg->src_numa_node >= nr_sockets ||
> > +		cfg->dst_numa_node >= nr_sockets) {
> > +		printf("Error: source or destination NUMA node exceeds the actual NUMA nodes.\n");
> > +		return -1;
> > +	}
> > +
> > +	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
> > +			nr_buf, /* n == num elements */
> > +			64,  /* cache size */
> > +			0,   /* priv size */
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->src_numa_node);
> > +	if (src_pool == NULL) {
> > +		PRINT_ERR("Error with source mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
> > +			nr_buf, /* n == num elements */
> > +			64,  /* cache size */
> > +			0,   /* priv size */
> > +			buf_size + RTE_PKTMBUF_HEADROOM,
> > +			cfg->dst_numa_node);
> > +	if (dst_pool == NULL) {
> > +		PRINT_ERR("Error with destination mempool creation.\n");
> > +		return -1;
> > +	}
> > +
> > +	*srcs = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));
>
> Typecasts from void * to other types aren't actually necessary in C.
> I note some inconsistency in this file with regards to malloc. Here you
> use regular malloc, while when building the parameters to pass to the
> memcpy functions you use rte_malloc. I suggest standardizing on one or
> the other rather than mixing.

Good point, thanks.
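I'll standardize on rte_malloc(), roughly (sketch):

	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
	if (*srcs == NULL) {
		printf("Error: srcs malloc failed.\n");
		return -1;
	}

And the matching free() calls in the benchmark cleanup paths would then become rte_free().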
>
> > +	if (*srcs == NULL) {
> > +		printf("Error: srcs malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	*dsts = (struct rte_mbuf **)(malloc(nr_buf * sizeof(struct rte_mbuf *)));
> > +	if (*dsts == NULL) {
> > +		printf("Error: dsts malloc failed.\n");
> > +		return -1;
> > +	}
> > +
> > +	for (i = 0; i < nr_buf; i++) {
> > +		(*srcs)[i] = rte_pktmbuf_alloc(src_pool);
> > +		(*dsts)[i] = rte_pktmbuf_alloc(dst_pool);
>
> Rather than individually allocating you may well manage with
> rte_mempool_get_bulk() to allocate all mbufs in one call.

Sure, thanks.

> > +		if ((!(*srcs)[i]) || (!(*dsts)[i])) {
> > +			printf("src: %p, dst: %p\n", (*srcs)[i], (*dsts)[i]);
> > +			return -1;
> > +		}
> > +
> > +		(*srcs)[i]->data_len = (*srcs)[i]->pkt_len = buf_size;
> > +		(*dsts)[i]->data_len = (*dsts)[i]->pkt_len = buf_size;
>
> rte_pktmbuf_append() macro can be used here, rather than setting the
> lengths manually. However, these values are not actually used anywhere
> else in the code, I believe, so setting them is unnecessary. You are
> manually tracking the copy lengths throughout the test, and nothing
> else is working on the mbufs, so the length the mbuf reports is
> immaterial.

Sure, it will be fixed.

> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +void
> > +dma_mem_copy_benchmark(struct test_configure *cfg)
> > +{
> > +	uint32_t i;
> > +	uint32_t offset;
> > +	unsigned int lcore_id = 0;
> > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > +	unsigned int buf_size = cfg->buf_size.cur;
> > +	uint16_t kick_batch = cfg->kick_batch.cur;
> > +	uint16_t mpool_iter_step = cfg->mpool_iter_step;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = cfg->nb_workers;
> > +	uint32_t repeat_times = cfg->repeat_times;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	if (config_dmadevs(nb_workers, cfg->ring_size.cur) < 0)
> > +		goto out;
> > +
> > +	if (cfg->cache_flush) {
> > +		struct buf_info info;
> > +
> > +		info.array = srcs;
> > +		info.buf_size = buf_size;
> > +		info.nr_buf = nr_buf;
> > +		cache_flush_buf(&info);
> > +
>
> From what I can see, struct buf_info is only used for passing
> parameters to the cache_flush_buf function. The code would be a lot
> simpler to remove the structure and just pass 3 parameters to the
> function directly.

Good point, thanks.
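Folding this together with your earlier comments on the void-pointer argument, the unused parameters on non-x86, and the clflush loop, the helper could end up like this (untested sketch):

static void
cache_flush_buf(struct rte_mbuf **array, uint32_t buf_size, uint32_t nr_buf)
{
#ifdef RTE_ARCH_X86_64
	char *data;
	uint32_t i, offset;

	for (i = 0; i < nr_buf; i++) {
		data = rte_pktmbuf_mtod(array[i], char *);
		/* flush one cache line at a time */
		for (offset = 0; offset < buf_size; offset += 64)
			__builtin_ia32_clflush(data + offset);
	}
#else
	/* silence unused-parameter warnings on non-x86 */
	RTE_SET_USED(array);
	RTE_SET_USED(buf_size);
	RTE_SET_USED(nr_buf);
#endif
}

with the call site then simply being cache_flush_buf(srcs, buf_size, nr_buf);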
>
> > +		info.array = dsts;
> > +		cache_flush_buf(&info);
> > +		rte_mb();
> > +	}
> > +
> > +	printf("Start testing....\n");
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = rte_get_next_lcore(lcore_id, true, true);
> > +		offset = nr_buf / nb_workers * i;
> > +
> > +		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> > +		if (!p) {
> > +			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> > +			break;
> > +		}
> > +		*p = (struct lcore_params) {
> > +			dmadev_ids[i],
> > +			(uint32_t)(nr_buf/nb_workers),
> > +			kick_batch,
> > +			buf_size,
> > +			repeat_times,
> > +			mpool_iter_step,
> > +			srcs + offset,
> > +			dsts + offset,
> > +			cfg->scenario_id
> > +		};
> > +
> > +		rte_eal_remote_launch((lcore_function_t *)dma_mem_copy, p, lcore_id);
> > +	}
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(srcs[i]);
> > +		free(srcs);
> > +	}
> > +	if (dsts) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(dsts[i]);
> > +		free(dsts);
> > +	}
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +
> > +	for (i = 0; i < nb_dmadevs; i++) {
> > +		printf("Stopping dmadev %d\n", dmadev_ids[i]);
> > +		rte_dma_stop(dmadev_ids[i]);
> > +	}
> > +}
> > +
> > +void
> > +cpu_mem_copy_benchmark(struct test_configure *cfg)
> > +{
> > +	uint32_t i, offset;
> > +	uint32_t repeat_times = cfg->repeat_times;
> > +	uint32_t kick_batch = cfg->kick_batch.cur;
> > +	uint32_t buf_size = cfg->buf_size.cur;
> > +	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
> > +	uint16_t nb_workers = cfg->nb_workers;
> > +	uint16_t mpool_iter_step = cfg->mpool_iter_step;
> > +	struct rte_mbuf **srcs = NULL, **dsts = NULL;
> > +	unsigned int lcore_id = 0;
> > +
> > +	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
> > +		goto out;
> > +
> > +	for (i = 0; i < nb_workers; i++) {
> > +		lcore_id = rte_get_next_lcore(lcore_id, rte_lcore_count() > 1 ? 1 : 0, 1);
> > +		offset = nr_buf / nb_workers * i;
> > +		struct lcore_params *p = rte_malloc(NULL, sizeof(*p), 0);
> > +		if (!p) {
> > +			printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
> > +			break;
> > +		}
> > +		*p = (struct lcore_params) { 0, nr_buf/nb_workers, kick_batch,
> > +						buf_size, repeat_times, mpool_iter_step,
> > +						srcs + offset, dsts + offset, cfg->scenario_id };
>
> Formatting should be the same as the function above.

Sure.

> > +		rte_eal_remote_launch((lcore_function_t *)cpu_mem_copy, p, lcore_id);
> > +	}
> > +
> > +	rte_eal_mp_wait_lcore();
> > +
> > +out:
> > +	/* free env */
> > +	if (srcs) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(srcs[i]);
> > +		free(srcs);
> > +	}
> > +	if (dsts) {
> > +		for (i = 0; i < nr_buf; i++)
> > +			rte_pktmbuf_free(dsts[i]);
> > +		free(dsts);
> > +	}
> > +
> > +	if (src_pool)
> > +		rte_mempool_free(src_pool);
> > +	if (dst_pool)
> > +		rte_mempool_free(dst_pool);
> > +}
>
> There seems to be quite a bit of common code between
> dma_mem_copy_benchmark and cpu_mem_copy_benchmark. Might be worth
> investigating if they can be merged while still keeping readability.

Yes, you're right. I'll consider it.
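As a first step I could at least share the cleanup path, e.g. (sketch; the helper name is provisional, and rte_mempool_free() already handles a NULL pool):

static void
free_benchmark_env(struct rte_mbuf **srcs, struct rte_mbuf **dsts, uint32_t nr_buf)
{
	uint32_t i;

	if (srcs != NULL) {
		for (i = 0; i < nr_buf; i++)
			rte_pktmbuf_free(srcs[i]);
		free(srcs);
	}
	if (dsts != NULL) {
		for (i = 0; i < nr_buf; i++)
			rte_pktmbuf_free(dsts[i]);
		free(dsts);
	}
	rte_mempool_free(src_pool);
	rte_mempool_free(dst_pool);
}

Then both benchmark functions just call free_benchmark_env(srcs, dsts, nr_buf) at "out:", and merging the launch loops can follow from there.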
>
> > diff --git a/app/test-dma-perf/benchmark.h b/app/test-dma-perf/benchmark.h
> > new file mode 100644
> > index 0000000000..f5ad8d6d99
> > --- /dev/null
> > +++ b/app/test-dma-perf/benchmark.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > +
> > +#ifndef _BENCHMARK_H_
> > +#define _BENCHMARK_H_
> > +
> > +void dma_mem_copy_benchmark(struct test_configure *cfg);
> > +
> > +void cpu_mem_copy_benchmark(struct test_configure *cfg);
> > +
> > +#endif /* _BENCHMARK_H_ */
>
> You don't really need two separate headers in this application. Both
> main.h and benchmark.h can be merged into one header, since both are
> always included in both c files.

Sure, makes sense.

> > diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
> > new file mode 100644
> > index 0000000000..e24bb19414
> > --- /dev/null
> > +++ b/app/test-dma-perf/config.ini
> > @@ -0,0 +1,61 @@
> > +
> > +; Supported test types:
> > +; DMA_MEM_COPY|CPU_MEM_COPY
> > +
> > +; Parameters:
> > +; "mem_size","buf_size","dma_ring_size","kick_batch".
> > +; "mem_size" means the size of the memory footprint.
> > +; "buf_size" means the memory size of a single operation.
> > +; "dma_ring_size" means the dma ring buffer size.
> > +; "kick_batch" means dma operation batch size.
> > +
> > +; Format: variable=first[,last,increment[,ADD|MUL]]
> > +; ADD is the default mode.
> > +
> > +; src_numa_node is used to control the numa node where the source memory is allocated.
> > +; dst_numa_node is used to control the numa node where the destination memory is allocated.
> > +
> > +; cache_flush is used to control if the cache should be flushed.
> > +
> > +; repeat_times is used to control the repeat times of the whole case.
> > +
> > +; worker_threads is used to control the threads number of the test app.
> > +; It should be less than the core number.
> > +
> > +; mpool_iter_step is used to control the buffer continuity.
> > +
> > +; Bind DMA to lcore:
> > +; Specify the "lcore_dma" parameter.
> > +; The number of "lcore_dma" should be greater than or equal to the number of "worker_threads".
> > +; Otherwise the remaining DMA devices will be automatically allocated to threads that are not
> > +; specified. If EAL parameters "-l" and "-a" are specified, the "lcore_dma" should be within
> > +; their range.
> > +
> > +[case1]
> > +type=DMA_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=0
> > +cache_flush=0
> > +repeat_times=10
> > +worker_threads=1
> > +mpool_iter_step=1
> > +lcore_dma=lcore3@0000:00:04.0
> > +eal_args=--legacy-mem --file-prefix=test
> > +
> > +[case2]
> > +type=CPU_MEM_COPY
> > +mem_size=10
> > +buf_size=64,8192,2,MUL
> > +dma_ring_size=1024
> > +kick_batch=32
> > +src_numa_node=0
> > +dst_numa_node=1
> > +cache_flush=0
> > +repeat_times=100
> > +worker_threads=1
> > +mpool_iter_step=1
> > +eal_args=--no-pci
> > diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
> > new file mode 100644
> > index 0000000000..94ba369539
> > --- /dev/null
> > +++ b/app/test-dma-perf/main.c
> > @@ -0,0 +1,434 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > +
> > +#include <stdio.h>
> > +#if !defined(RTE_EXEC_ENV_LINUX)
> > +
> > +int
> > +main(int argc, char *argv[])
> > +{
> > +	printf("OS not supported, skipping test\n");
> > +	return 0;
> > +}
> > +
>
> What is Linux-specific about this app?
>
> If we do need to limit the app to Linux-only I suggest using meson to
> do so rather than putting #ifdefs in the code.

Got it. Thanks!

> > +#else
> > +
> > +#include <stdlib.h>
> > +#include <getopt.h>
> > +#include <signal.h>

<snip>
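On the Linux-only question: if we do keep the restriction, I'll drop the #ifdef and handle it in app/test-dma-perf/meson.build instead, along the lines of the pattern other apps use (sketch):

    if not is_linux
        build = false
        reason = 'only supported on Linux'
    endif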