* Ingo Molnar <mi...@kernel.org> wrote:

> [...]
>
> I kept the process, threading and memory allocation bits of numa.c, just in
> case we need them to measure more complex syscalls. Maybe we could keep the
> threading bits and remove the memory allocation parameters, to simplify the
> benchmark?
So the patch below removes the NUMA details: convergence measurement and
memory access pattern handling. This reduces the line count by about 30%.
It should be combined with the previous patch, I suspect.

Thanks,

	Ingo

==================>
>From a992aecebe12a195ffa74e09fcbe6b48db4430e3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mi...@kernel.org>
Date: Mon, 1 Feb 2016 08:46:39 +0100
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'

Remove NUMA legacies.

Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 tools/perf/bench/syscall.c | 316 +--------------------------------------------
 1 file changed, 5 insertions(+), 311 deletions(-)

diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c
index 5a4ef02176d1..fabac462bde5 100644
--- a/tools/perf/bench/syscall.c
+++ b/tools/perf/bench/syscall.c
@@ -81,11 +81,6 @@ struct params {
 	double mb_thread;
 
 	/* Access patterns to the working set: */
-	bool data_reads;
-	bool data_writes;
-	bool data_backwards;
-	bool data_zero_memset;
-	bool data_rand_walk;
 	u32 nr_loops;
 	u32 nr_secs;
 	u32 sleep_usecs;
@@ -108,10 +103,6 @@ struct params {
 	int nr_tasks;
 	bool show_quiet;
-	bool show_convergence;
-	bool measure_convergence;
-
-	int perturb_secs;
 	int nr_cpus;
 	int nr_nodes;
@@ -139,8 +130,6 @@ struct global_info {
 
 	struct thread_data *threads;
 
-	/* Convergence latency measurement: */
-	bool all_converged;
 	bool stop_work;
 	int print_once;
@@ -168,23 +157,13 @@ static const struct option options[] = {
 	OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"),
 	OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
 
-	OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"),
-	OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
-	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
-	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
-	OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
-
-	OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
 	OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
 	OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
-	OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
 	OPT_INCR   ('d', "show_details" , &p0.show_details, "Show details"),
 	OPT_INCR   ('a', "all" , &p0.run_all, "Run all tests in the suite"),
 	OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
-	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
-	OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
 	OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
 	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
@@ -208,32 +187,6 @@ static const char * const syscall_usage[] = {
 	NULL
 };
 
-static cpu_set_t bind_to_cpu(int target_cpu)
-{
-	cpu_set_t orig_mask, mask;
-	int ret;
-
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
-
-	CPU_ZERO(&mask);
-
-	if (target_cpu == -1) {
-		int cpu;
-
-		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
-	} else {
-		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
-		CPU_SET(target_cpu, &mask);
-	}
-
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
-
-	return orig_mask;
-}
-
 static cpu_set_t bind_to_node(int target_node)
 {
 	int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
@@ -699,222 +652,11 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
 
 	prctl(0, bytes_worked);
 }
 
-#define MAX_NR_NODES 64
-
-/*
- * Count the number of nodes a process's threads
- * are spread out on.
- *
- * A count of 1 means that the process is compressed
- * to a single node. A count of g->p.nr_nodes means it's
- * spread out on the whole system.
- */
-static int count_process_nodes(int process_nr)
-{
-	char node_present[MAX_NR_NODES] = { 0, };
-	int nodes;
-	int n, t;
-
-	for (t = 0; t < g->p.nr_threads; t++) {
-		struct thread_data *td;
-		int task_nr;
-		int node;
-
-		task_nr = process_nr*g->p.nr_threads + t;
-		td = g->threads + task_nr;
-
-		node = numa_node_of_cpu(td->curr_cpu);
-		if (node < 0) /* curr_cpu was likely still -1 */
-			return 0;
-
-		node_present[node] = 1;
-	}
-
-	nodes = 0;
-
-	for (n = 0; n < MAX_NR_NODES; n++)
-		nodes += node_present[n];
-
-	return nodes;
-}
-
-/*
- * Count the number of distinct process-threads a node contains.
- *
- * A count of 1 means that the node contains only a single
- * process. If all nodes on the system contain at most one
- * process then we are well-converged.
- */
-static int count_node_processes(int node)
-{
-	int processes = 0;
-	int t, p;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		for (t = 0; t < g->p.nr_threads; t++) {
-			struct thread_data *td;
-			int task_nr;
-			int n;
-
-			task_nr = p*g->p.nr_threads + t;
-			td = g->threads + task_nr;
-
-			n = numa_node_of_cpu(td->curr_cpu);
-			if (n == node) {
-				processes++;
-				break;
-			}
-		}
-	}
-
-	return processes;
-}
-
-static void calc_convergence_compression(int *strong)
-{
-	unsigned int nodes_min, nodes_max;
-	int p;
-
-	nodes_min = -1;
-	nodes_max = 0;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		unsigned int nodes = count_process_nodes(p);
-
-		if (!nodes) {
-			*strong = 0;
-			return;
-		}
-
-		nodes_min = min(nodes, nodes_min);
-		nodes_max = max(nodes, nodes_max);
-	}
-
-	/* Strong convergence: all threads compress on a single node: */
-	if (nodes_min == 1 && nodes_max == 1) {
-		*strong = 1;
-	} else {
-		*strong = 0;
-		tprintf(" {%d-%d}", nodes_min, nodes_max);
-	}
-}
-
-static void calc_convergence(double runtime_ns_max, double *convergence)
-{
-	unsigned int loops_done_min, loops_done_max;
-	int process_groups;
-	int nodes[MAX_NR_NODES];
-	int distance;
-	int nr_min;
-	int nr_max;
-	int strong;
-	int sum;
-	int nr;
-	int node;
-	int cpu;
-	int t;
-
-	if (!g->p.show_convergence && !g->p.measure_convergence)
-		return;
-
-	for (node = 0; node < g->p.nr_nodes; node++)
-		nodes[node] = 0;
-
-	loops_done_min = -1;
-	loops_done_max = 0;
-
-	for (t = 0; t < g->p.nr_tasks; t++) {
-		struct thread_data *td = g->threads + t;
-		unsigned int loops_done;
-
-		cpu = td->curr_cpu;
-
-		/* Not all threads have written it yet: */
-		if (cpu < 0)
-			continue;
-
-		node = numa_node_of_cpu(cpu);
-
-		nodes[node]++;
-
-		loops_done = td->loops_done;
-		loops_done_min = min(loops_done, loops_done_min);
-		loops_done_max = max(loops_done, loops_done_max);
-	}
-
-	nr_max = 0;
-	nr_min = g->p.nr_tasks;
-	sum = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		nr = nodes[node];
-		nr_min = min(nr, nr_min);
-		nr_max = max(nr, nr_max);
-		sum += nr;
-	}
-	BUG_ON(nr_min > nr_max);
-
-	BUG_ON(sum > g->p.nr_tasks);
-
-	if (0 && (sum < g->p.nr_tasks))
-		return;
-
-	/*
-	 * Count the number of distinct process groups present
-	 * on nodes - when we are converged this will decrease
-	 * to g->p.nr_proc:
-	 */
-	process_groups = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		int processes = count_node_processes(node);
-
-		nr = nodes[node];
-		tprintf(" %2d/%-2d", nr, processes);
-
-		process_groups += processes;
-	}
-
-	distance = nr_max - nr_min;
-
-	tprintf(" [%2d/%-2d]", distance, process_groups);
-
-	tprintf(" l:%3d-%-3d (%3d)",
-		loops_done_min, loops_done_max, loops_done_max-loops_done_min);
-
-	if (loops_done_min && loops_done_max) {
-		double skew = 1.0 - (double)loops_done_min/loops_done_max;
-
-		tprintf(" [%4.1f%%]", skew * 100.0);
-	}
-
-	calc_convergence_compression(&strong);
-
-	if (strong && process_groups == g->p.nr_proc) {
-		if (!*convergence) {
-			*convergence = runtime_ns_max;
-			tprintf(" (%6.1fs converged)\n", *convergence/1e9);
-			if (g->p.measure_convergence) {
-				g->all_converged = true;
-				g->stop_work = true;
-			}
-		}
-	} else {
-		if (*convergence) {
-			tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
-			*convergence = 0;
-		}
-		tprintf("\n");
-	}
-}
-
-static void show_summary(double runtime_ns_max, int l, double *convergence)
+static void show_summary(double runtime_ns_max, int l)
 {
 	tprintf("\r # %5.1f%% [%.1f mins]",
 		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
 
-	calc_convergence(runtime_ns_max, convergence);
-
 	if (g->p.show_details >= 0)
 		fflush(stdout);
 }
@@ -925,11 +667,9 @@ static void *worker_thread(void *__tdata)
 	struct timeval start0, start, stop, diff;
 	int process_nr = td->process_nr;
 	int thread_nr = td->thread_nr;
-	unsigned long last_perturbance;
 	int task_nr = td->task_nr;
 	int details = g->p.show_details;
-	int first_task, last_task;
-	double convergence = 0;
+	int last_task;
 	u64 val = td->val;
 	double runtime_ns_max;
 	u8 *global_data;
@@ -955,10 +695,6 @@ static void *worker_thread(void *__tdata)
 	if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
 		last_task = 1;
 
-	first_task = 0;
-	if (process_nr == 0 && thread_nr == 0)
-		first_task = 1;
-
 	if (details >= 2) {
 		printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
 			process_nr, thread_nr, global_data, process_data, thread_data);
@@ -983,7 +719,6 @@ static void *worker_thread(void *__tdata)
 
 	gettimeofday(&start0, NULL);
 	start = stop = start0;
-	last_perturbance = start.tv_sec;
 
 	for (l = 0; l < g->p.nr_loops; l++) {
 		start = stop;
@@ -1015,7 +750,7 @@ static void *worker_thread(void *__tdata)
 		update_curr_cpu(task_nr, work_done);
 		bytes_done += work_done;
 
-		if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+		if (details < 0 && !g->p.nr_secs)
 			continue;
 
 		td->loops_done = l;
@@ -1035,37 +770,6 @@ static void *worker_thread(void *__tdata)
 		if (start.tv_sec == stop.tv_sec)
 			continue;
 
-		/*
-		 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
-		 * by migrating to CPU#0:
-		 */
-		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
-			cpu_set_t orig_mask;
-			int target_cpu;
-			int this_cpu;
-
-			last_perturbance = stop.tv_sec;
-
-			/*
-			 * Depending on where we are running, move into
-			 * the other half of the system, to create some
-			 * real disturbance:
-			 */
-			this_cpu = g->threads[task_nr].curr_cpu;
-			if (this_cpu < g->p.nr_cpus/2)
-				target_cpu = g->p.nr_cpus-1;
-			else
-				target_cpu = 0;
-
-			orig_mask = bind_to_cpu(target_cpu);
-
-			/* Here we are running on the target CPU already */
-			if (details >= 1)
-				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
-
-			bind_to_cpumask(orig_mask);
-		}
-
 		if (details >= 3) {
 			timersub(&stop, &start, &diff);
 			runtime_ns_max = diff.tv_sec * 1000000000;
@@ -1084,7 +788,7 @@ static void *worker_thread(void *__tdata)
 		runtime_ns_max = diff.tv_sec * 1000000000ULL;
 		runtime_ns_max += diff.tv_usec * 1000ULL;
 
-		show_summary(runtime_ns_max, l, &convergence);
+		show_summary(runtime_ns_max, l);
 	}
 
 	gettimeofday(&stop, NULL);
@@ -1226,8 +930,7 @@ static int init(void)
 
 	g->p.nr_nodes = numa_max_node() + 1;
 
-	/* char array in count_process_nodes(): */
-	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+	BUG_ON(g->p.nr_nodes < 0);
 
 	if (g->p.show_quiet && !g->p.show_details)
 		g->p.show_details = -1;
@@ -1427,11 +1130,6 @@ static int __bench_syscall(const char *name)
 	bytes = g->bytes_done;
 	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
 
-	if (g->p.measure_convergence) {
-		print_res(name, runtime_sec_max,
-			"secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
-	}
-
 	print_res(name, runtime_sec_max,
 		"secs,", "runtime-max/thread", "secs slowest (max) thread-runtime");
@@ -1517,10 +1215,6 @@ static void init_params(struct params *p, const char *name, int argc, const char
 	/* Initialize nonzero defaults: */
 
 	p->serialize_startup = 1;
-	p->data_reads = true;
-	p->data_writes = true;
-	p->data_backwards = true;
-	p->data_rand_walk = true;
 	p->nr_loops = 10000000;
 	p->init_random = true;
 	p->mb_global_str = "1";