* Ingo Molnar <mi...@kernel.org> wrote:

> [...]
>
> I kept the process, threading and memory allocation bits of numa.c, just in
> case we need them to measure more complex syscalls. Maybe we could keep the
> threading bits and remove the memory allocation parameters, to simplify the
> benchmark?
So the patch below removes the NUMA details: convergence measurement and
memory access pattern handling. This reduces the line count by about 30%.
It should be combined with the previous patch, I suspect.

Thanks,

	Ingo

==================>
>From a992aecebe12a195ffa74e09fcbe6b48db4430e3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mi...@kernel.org>
Date: Mon, 1 Feb 2016 08:46:39 +0100
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'

Remove NUMA legacies.

Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 tools/perf/bench/syscall.c | 316 +--------------------------------------------
 1 file changed, 5 insertions(+), 311 deletions(-)

diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c
index 5a4ef02176d1..fabac462bde5 100644
--- a/tools/perf/bench/syscall.c
+++ b/tools/perf/bench/syscall.c
@@ -81,11 +81,6 @@ struct params {
 	double mb_thread;
 
 	/* Access patterns to the working set: */
-	bool data_reads;
-	bool data_writes;
-	bool data_backwards;
-	bool data_zero_memset;
-	bool data_rand_walk;
 	u32 nr_loops;
 	u32 nr_secs;
 	u32 sleep_usecs;
@@ -108,10 +103,6 @@ struct params {
 	int nr_tasks;
 	bool show_quiet;
-	bool show_convergence;
-	bool measure_convergence;
-
-	int perturb_secs;
 	int nr_cpus;
 	int nr_nodes;
@@ -139,8 +130,6 @@ struct global_info {
 
 	struct thread_data *threads;
 
-	/* Convergence latency measurement: */
-	bool all_converged;
 	bool stop_work;
 	int print_once;
@@ -168,23 +157,13 @@ static const struct option options[] = {
 	OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"),
 	OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
 
-	OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"),
-	OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
-	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
-	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
-	OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
-
-	OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
 	OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
 	OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
-	OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
 	OPT_INCR   ('d', "show_details" , &p0.show_details, "Show details"),
 	OPT_INCR   ('a', "all" , &p0.run_all, "Run all tests in the suite"),
 	OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
-	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
-	OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
 	OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
 	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
@@ -208,32 +187,6 @@ static const char * const syscall_usage[] = {
 	NULL
 };
 
-static cpu_set_t bind_to_cpu(int target_cpu)
-{
-	cpu_set_t orig_mask, mask;
-	int ret;
-
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
-
-	CPU_ZERO(&mask);
-
-	if (target_cpu == -1) {
-		int cpu;
-
-		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
-	} else {
-		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
-		CPU_SET(target_cpu, &mask);
-	}
-
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
-
-	return orig_mask;
-}
-
 static cpu_set_t bind_to_node(int target_node)
 {
 	int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
@@ -699,222 +652,11 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
 
 	prctl(0, bytes_worked);
 }
 
-#define MAX_NR_NODES 64
-
-/*
- * Count the number of nodes a process's threads
- * are spread out on.
- *
- * A count of 1 means that the process is compressed
- * to a single node. A count of g->p.nr_nodes means it's
- * spread out on the whole system.
- */
-static int count_process_nodes(int process_nr)
-{
-	char node_present[MAX_NR_NODES] = { 0, };
-	int nodes;
-	int n, t;
-
-	for (t = 0; t < g->p.nr_threads; t++) {
-		struct thread_data *td;
-		int task_nr;
-		int node;
-
-		task_nr = process_nr*g->p.nr_threads + t;
-		td = g->threads + task_nr;
-
-		node = numa_node_of_cpu(td->curr_cpu);
-		if (node < 0) /* curr_cpu was likely still -1 */
-			return 0;
-
-		node_present[node] = 1;
-	}
-
-	nodes = 0;
-
-	for (n = 0; n < MAX_NR_NODES; n++)
-		nodes += node_present[n];
-
-	return nodes;
-}
-
-/*
- * Count the number of distinct process-threads a node contains.
- *
- * A count of 1 means that the node contains only a single
- * process. If all nodes on the system contain at most one
- * process then we are well-converged.
- */
-static int count_node_processes(int node)
-{
-	int processes = 0;
-	int t, p;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		for (t = 0; t < g->p.nr_threads; t++) {
-			struct thread_data *td;
-			int task_nr;
-			int n;
-
-			task_nr = p*g->p.nr_threads + t;
-			td = g->threads + task_nr;
-
-			n = numa_node_of_cpu(td->curr_cpu);
-			if (n == node) {
-				processes++;
-				break;
-			}
-		}
-	}
-
-	return processes;
-}
-
-static void calc_convergence_compression(int *strong)
-{
-	unsigned int nodes_min, nodes_max;
-	int p;
-
-	nodes_min = -1;
-	nodes_max = 0;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		unsigned int nodes = count_process_nodes(p);
-
-		if (!nodes) {
-			*strong = 0;
-			return;
-		}
-
-		nodes_min = min(nodes, nodes_min);
-		nodes_max = max(nodes, nodes_max);
-	}
-
-	/* Strong convergence: all threads compress on a single node: */
-	if (nodes_min == 1 && nodes_max == 1) {
-		*strong = 1;
-	} else {
-		*strong = 0;
-		tprintf(" {%d-%d}", nodes_min, nodes_max);
-	}
-}
-
-static void calc_convergence(double runtime_ns_max, double *convergence)
-{
-	unsigned int loops_done_min, loops_done_max;
-	int process_groups;
-	int nodes[MAX_NR_NODES];
-	int distance;
-	int nr_min;
-	int nr_max;
-	int strong;
-	int sum;
-	int nr;
-	int node;
-	int cpu;
-	int t;
-
-	if (!g->p.show_convergence && !g->p.measure_convergence)
-		return;
-
-	for (node = 0; node < g->p.nr_nodes; node++)
-		nodes[node] = 0;
-
-	loops_done_min = -1;
-	loops_done_max = 0;
-
-	for (t = 0; t < g->p.nr_tasks; t++) {
-		struct thread_data *td = g->threads + t;
-		unsigned int loops_done;
-
-		cpu = td->curr_cpu;
-
-		/* Not all threads have written it yet: */
-		if (cpu < 0)
-			continue;
-
-		node = numa_node_of_cpu(cpu);
-
-		nodes[node]++;
-
-		loops_done = td->loops_done;
-		loops_done_min = min(loops_done, loops_done_min);
-		loops_done_max = max(loops_done, loops_done_max);
-	}
-
-	nr_max = 0;
-	nr_min = g->p.nr_tasks;
-	sum = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		nr = nodes[node];
-		nr_min = min(nr, nr_min);
-		nr_max = max(nr, nr_max);
-		sum += nr;
-	}
-	BUG_ON(nr_min > nr_max);
-
-	BUG_ON(sum > g->p.nr_tasks);
-
-	if (0 && (sum < g->p.nr_tasks))
-		return;
-
-	/*
-	 * Count the number of distinct process groups present
-	 * on nodes - when we are converged this will decrease
-	 * to g->p.nr_proc:
-	 */
-	process_groups = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		int processes = count_node_processes(node);
-
-		nr = nodes[node];
-		tprintf(" %2d/%-2d", nr, processes);
-
-		process_groups += processes;
-	}
-
-	distance = nr_max - nr_min;
-
-	tprintf(" [%2d/%-2d]", distance, process_groups);
-
-	tprintf(" l:%3d-%-3d (%3d)",
-		loops_done_min, loops_done_max, loops_done_max-loops_done_min);
-
-	if (loops_done_min && loops_done_max) {
-		double skew = 1.0 - (double)loops_done_min/loops_done_max;
-
-		tprintf(" [%4.1f%%]", skew * 100.0);
-	}
-
-	calc_convergence_compression(&strong);
-
-	if (strong && process_groups == g->p.nr_proc) {
-		if (!*convergence) {
-			*convergence = runtime_ns_max;
-			tprintf(" (%6.1fs converged)\n", *convergence/1e9);
-			if (g->p.measure_convergence) {
-				g->all_converged = true;
-				g->stop_work = true;
-			}
-		}
-	} else {
-		if (*convergence) {
-			tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
-			*convergence = 0;
-		}
-		tprintf("\n");
-	}
-}
-
-static void show_summary(double runtime_ns_max, int l, double *convergence)
+static void show_summary(double runtime_ns_max, int l)
 {
 	tprintf("\r # %5.1f%% [%.1f mins]",
 		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
 
-	calc_convergence(runtime_ns_max, convergence);
-
 	if (g->p.show_details >= 0)
 		fflush(stdout);
 }
@@ -925,11 +667,9 @@ static void *worker_thread(void *__tdata)
 	struct timeval start0, start, stop, diff;
 	int process_nr = td->process_nr;
 	int thread_nr = td->thread_nr;
-	unsigned long last_perturbance;
 	int task_nr = td->task_nr;
 	int details = g->p.show_details;
-	int first_task, last_task;
-	double convergence = 0;
+	int last_task;
 	u64 val = td->val;
 	double runtime_ns_max;
 	u8 *global_data;
@@ -955,10 +695,6 @@ static void *worker_thread(void *__tdata)
 	if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
 		last_task = 1;
 
-	first_task = 0;
-	if (process_nr == 0 && thread_nr == 0)
-		first_task = 1;
-
 	if (details >= 2) {
 		printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
 			process_nr, thread_nr, global_data, process_data, thread_data);
@@ -983,7 +719,6 @@ static void *worker_thread(void *__tdata)
 
 	gettimeofday(&start0, NULL);
 	start = stop = start0;
-	last_perturbance = start.tv_sec;
 
 	for (l = 0; l < g->p.nr_loops; l++) {
 		start = stop;
@@ -1015,7 +750,7 @@ static void *worker_thread(void *__tdata)
 		update_curr_cpu(task_nr, work_done);
 		bytes_done += work_done;
 
-		if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+		if (details < 0 && !g->p.nr_secs)
 			continue;
 
 		td->loops_done = l;
@@ -1035,37 +770,6 @@ static void *worker_thread(void *__tdata)
 		if (start.tv_sec == stop.tv_sec)
 			continue;
 
-		/*
-		 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
-		 * by migrating to CPU#0:
-		 */
-		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
-			cpu_set_t orig_mask;
-			int target_cpu;
-			int this_cpu;
-
-			last_perturbance = stop.tv_sec;
-
-			/*
-			 * Depending on where we are running, move into
-			 * the other half of the system, to create some
-			 * real disturbance:
-			 */
-			this_cpu = g->threads[task_nr].curr_cpu;
-			if (this_cpu < g->p.nr_cpus/2)
-				target_cpu = g->p.nr_cpus-1;
-			else
-				target_cpu = 0;
-
-			orig_mask = bind_to_cpu(target_cpu);
-
-			/* Here we are running on the target CPU already */
-			if (details >= 1)
-				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
-
-			bind_to_cpumask(orig_mask);
-		}
-
 		if (details >= 3) {
 			timersub(&stop, &start, &diff);
 			runtime_ns_max = diff.tv_sec * 1000000000;
@@ -1084,7 +788,7 @@ static void *worker_thread(void *__tdata)
 		runtime_ns_max = diff.tv_sec * 1000000000ULL;
 		runtime_ns_max += diff.tv_usec * 1000ULL;
 
-		show_summary(runtime_ns_max, l, &convergence);
+		show_summary(runtime_ns_max, l);
 	}
 
 	gettimeofday(&stop, NULL);
@@ -1226,8 +930,7 @@ static int init(void)
 
 	g->p.nr_nodes = numa_max_node() + 1;
 
-	/* char array in count_process_nodes(): */
-	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+	BUG_ON(g->p.nr_nodes < 0);
 
 	if (g->p.show_quiet && !g->p.show_details)
 		g->p.show_details = -1;
@@ -1427,11 +1130,6 @@ static int __bench_syscall(const char *name)
 	bytes = g->bytes_done;
 	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
 
-	if (g->p.measure_convergence) {
-		print_res(name, runtime_sec_max,
-			"secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
-	}
-
 	print_res(name, runtime_sec_max,
 		"secs,", "runtime-max/thread", "secs slowest (max) thread-runtime");
@@ -1517,10 +1215,6 @@ static void init_params(struct params *p, const char *name, int argc, const char
 	/* Initialize nonzero defaults: */
 
 	p->serialize_startup = 1;
-	p->data_reads = true;
-	p->data_writes = true;
-	p->data_backwards = true;
-	p->data_rand_walk = true;
 	p->nr_loops = 10000000;
 	p->init_random = true;
 	p->mb_global_str = "1";