This patch adds a configurable pair of values that control the frequency and length of a nanosleep() call performed in the main loop of testpmd's iofwd forwarding engine.
The problem is the following: it is necessary to execute code on isolated CPUs which is not part of the packet forwarding load. For example, "echo val > /sys/kernel/debug/tracing/buffer_size_kb" hangs the process, because the DPDK thread has higher priority than the workqueue thread which executes the flush from the CPU-local trace buffer to the CPU-global trace buffer [the work item in question]. There are more serious issues than the trace-cmd bug, such as XFS work items failing to execute, causing filesystem corruption.

To work around this problem until a proper kernel solution is developed, allow DPDK to nanosleep (hopefully with a small enough frequency and interval so that the performance is within acceptable levels).

The new parameters are:
* --delay-hz: sets nanosleep frequency in Hz.
* --delay-length: sets nanosleep length in ns.

Results for delay-hz=100,delay-length=10000 (which allows the buffer_size_kb change to complete):

Baseline run-1: [Histogram port 0 to port 1 at rate 2.3 Mpps] Samples: 49505, Average: 19008.7 ns, StdDev: 2501.0 ns, Quartiles: 17293.0/18330.0/19901.0 ns
Baseline run-2: [Histogram port 0 to port 1 at rate 2.3 Mpps] Samples: 49606, Average: 19036.4 ns, StdDev: 2485.2 ns, Quartiles: 17318.0/18349.0/19936.0 ns
Baseline run-3: [Histogram port 0 to port 1 at rate 2.3 Mpps] Samples: 49627, Average: 19019.2 ns, StdDev: 2503.7 ns, Quartiles: 17323.0/18355.0/19940.0 ns

============================
(10.000us, 100HZ)

Run-1: [Histogram port 0 to port 1 at rate 2.3 Mpps] Samples: 7284, Average: 20830.6 ns, StdDev: 12023.0 ns, Quartiles: 17309.0/18394.0/20233.0 ns
Run-2: [Histogram port 0 to port 1 at rate 2.3 Mpps] Samples: 6272, Average: 20897.1 ns, StdDev: 12057.2 ns, Quartiles: 17389.0/18457.0/20266.0 ns
Run-3: [Histogram port 0 to port 1 at rate 2.3 Mpps] Samples: 4843, Average: 20535.2 ns, StdDev: 9827.3 ns, Quartiles: 17389.0/18441.0/20269.0 ns

Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

diff -Nur dpdk-17.08.orig/app/test-pmd/iofwd.c 
dpdk-17.08/app/test-pmd/iofwd.c
--- dpdk-17.08.orig/app/test-pmd/iofwd.c	2017-10-30 22:45:37.829492673 -0200
+++ dpdk-17.08/app/test-pmd/iofwd.c	2017-10-30 22:45:48.321522581 -0200
@@ -64,9 +64,37 @@
 #include <rte_ethdev.h>
 #include <rte_string_fns.h>
 #include <rte_flow.h>
+#include <time.h>
 
 #include "testpmd.h"
 
+/* Timer cycles between two consecutive sleeps; 0 until first computed. */
+static uint64_t nanosleep_interval;
+
+/*
+ * Convert the requested sleep frequency (in Hz) into the number of timer
+ * cycles between two sleeps. The caller must pass hz > 0.
+ */
+static void calc_nanosleep_interval(uint32_t hz)
+{
+	nanosleep_interval = rte_get_timer_hz() / hz;
+}
+
+/*
+ * Sleep for nanosleep_length nanoseconds, giving other threads a chance
+ * to run on this CPU.
+ */
+static void do_nanosleep(void)
+{
+	struct timespec req;
+
+	/* tv_nsec must stay below 1e9, else nanosleep() fails with EINVAL. */
+	req.tv_sec = nanosleep_length / 1000000000;
+	req.tv_nsec = nanosleep_length % 1000000000;
+
+	nanosleep(&req, NULL);
+}
+
 /*
  * Forwarding of packets in I/O mode.
  * Forward packets "as-is".
@@ -91,6 +119,18 @@
 	start_tsc = rte_rdtsc();
 #endif
 
+	if (nanosleep_frequency > 0) {
+		/* Derive the cycle interval lazily, after declarations. */
+		if (nanosleep_interval == 0)
+			calc_nanosleep_interval(nanosleep_frequency);
+
+		if (rte_get_timer_cycles() > fs->next_nanosleep) {
+			do_nanosleep();
+			fs->next_nanosleep = rte_get_timer_cycles() +
+				nanosleep_interval;
+		}
+	}
+
 	/*
 	 * Receive a burst of packets and forward them.
*/ diff -Nur dpdk-17.08.orig/app/test-pmd/parameters.c dpdk-17.08/app/test-pmd/parameters.c --- dpdk-17.08.orig/app/test-pmd/parameters.c 2017-10-30 22:45:37.830492676 -0200 +++ dpdk-17.08/app/test-pmd/parameters.c 2017-10-30 22:46:33.708651912 -0200 @@ -216,6 +216,8 @@ "disable print of designated event or all of them.\n"); printf(" --flow-isolate-all: " "requests flow API isolated mode on all ports at initialization time.\n"); + printf(" --delay-hz: sets nanosleep frequency in Hz.\n"); + printf(" --delay-length: sets nanosleep length in ns.\n"); } #ifdef RTE_LIBRTE_CMDLINE @@ -638,7 +640,9 @@ { "no-rmv-interrupt", 0, 0, 0 }, { "print-event", 1, 0, 0 }, { "mask-event", 1, 0, 0 }, - { 0, 0, 0, 0 }, + { "delay-hz", 1, 0, 0 }, + { "delay-length", 1, 0, 0 }, + { 0, 0, 0, 0 }, }; argvopt = argv; @@ -1099,6 +1103,27 @@ else rte_exit(EXIT_FAILURE, "bad txpkts\n"); } + + if (!strcmp(lgopts[opt_idx].name, "delay-hz")) { + int n; + + n = atoi(optarg); + + if (n < 0) + rte_exit(EXIT_FAILURE, "bad delay-hz\n"); + nanosleep_frequency = n; + } + + if (!strcmp(lgopts[opt_idx].name, "delay-length")) { + int n; + + n = atoi(optarg); + + if (n < 0) + rte_exit(EXIT_FAILURE, "bad delay-length\n"); + nanosleep_length = n; + } + if (!strcmp(lgopts[opt_idx].name, "no-flush-rx")) no_flush_rx = 1; if (!strcmp(lgopts[opt_idx].name, "disable-link-check")) diff -Nur dpdk-17.08.orig/app/test-pmd/testpmd.c dpdk-17.08/app/test-pmd/testpmd.c --- dpdk-17.08.orig/app/test-pmd/testpmd.c 2017-10-30 22:45:37.829492673 -0200 +++ dpdk-17.08/app/test-pmd/testpmd.c 2017-10-30 22:45:48.323522591 -0200 @@ -327,6 +327,13 @@ #endif + +/* How long to sleep in packet processing */ +uint32_t nanosleep_length; + +/* How often to sleep in packet processing */ +uint32_t nanosleep_frequency; + /* * Ethernet device configuration. 
*/ diff -Nur dpdk-17.08.orig/app/test-pmd/testpmd.h dpdk-17.08/app/test-pmd/testpmd.h --- dpdk-17.08.orig/app/test-pmd/testpmd.h 2017-10-30 22:45:37.829492673 -0200 +++ dpdk-17.08/app/test-pmd/testpmd.h 2017-10-30 22:45:48.323522591 -0200 @@ -127,6 +127,7 @@ struct pkt_burst_stats rx_burst_stats; struct pkt_burst_stats tx_burst_stats; #endif + uint64_t next_nanosleep; }; /** Offload IP checksum in csum forward engine */ @@ -390,6 +391,9 @@ extern lcoreid_t latencystats_lcore_id; #endif +extern uint32_t nanosleep_length; +extern uint32_t nanosleep_frequency; + #ifdef RTE_LIBRTE_BITRATE extern lcoreid_t bitrate_lcore_id; extern uint8_t bitrate_enabled;