This adds a new option to tell perf on which NUMA node the futex hash benchmark
should run. If set, then:

 - The test is bound to the node
 - Memory is allocated on the local NUMA node
 - The threads are bound to the CPUs on the node

The NUMA node can be specified with the -n (--numa) argument.

Signed-off-by: Sebastian Andrzej Siewior <bige...@linutronix.de>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
---
 tools/perf/bench/Build        |  4 ++
 tools/perf/bench/futex-hash.c | 89 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 60bf119..9e6e518 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -1,3 +1,7 @@
+ifdef CONFIG_NUMA
+CFLAGS_futex-hash.o   += -DCONFIG_NUMA=1
+endif
+
 perf-y += sched-messaging.o
 perf-y += sched-pipe.o
 perf-y += mem-functions.o
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 0999ac5..a1c6ee9 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -20,6 +20,9 @@
 #include <stdlib.h>
 #include <sys/time.h>
 #include <pthread.h>
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif
 
 static unsigned int nthreads = 0;
 static unsigned int nsecs    = 10;
@@ -27,6 +30,7 @@ static unsigned int nsecs    = 10;
 static unsigned int nfutexes = 1024;
 static bool fshared = false, done = false, silent = false;
 static int futex_flag = 0;
+static int numa_node = -1;
 
 struct timeval start, end, runtime;
 static pthread_mutex_t thread_lock;
@@ -39,7 +43,7 @@ struct worker {
        u_int32_t *futex;
        pthread_t thread;
        unsigned long ops;
-};
+} __attribute__((aligned(128)));
 
 static const struct option options[] = {
        OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
@@ -47,9 +51,28 @@ static const struct option options[] = {
        OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per 
threads"),
        OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display 
data/details"),
        OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of 
private ones"),
+#ifdef CONFIG_NUMA
+       OPT_INTEGER( 'n', "numa",   &numa_node,  "Specify the NUMA node"),
+#endif
        OPT_END()
 };
 
+#ifndef CONFIG_NUMA
+static int numa_run_on_node(int node __maybe_unused) { return 0; }
+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
+static void *numa_alloc_local(size_t size) { return malloc(size); }
+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
+#endif
+
+static bool cpu_is_local(int cpu)
+{
+       if (numa_node < 0)
+               return true;
+       if (numa_node_of_cpu(cpu) == numa_node)
+               return true;
+       return false;
+}
+
 static const char * const bench_futex_hash_usage[] = {
        "perf bench futex hash <options>",
        NULL
@@ -115,6 +138,8 @@ int bench_futex_hash(int argc, const char **argv,
        unsigned int i, ncpus;
        pthread_attr_t thread_attr;
        struct worker *worker = NULL;
+       char *node_str = NULL;
+       unsigned int cpunum;
 
        argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
        if (argc) {
@@ -128,18 +153,50 @@ int bench_futex_hash(int argc, const char **argv,
        act.sa_sigaction = toggle_done;
        sigaction(SIGINT, &act, NULL);
 
-       if (!nthreads) /* default to the number of CPUs */
-               nthreads = ncpus;
+       if (!nthreads) {
+               /* default to the number of CPUs per NUMA node */
+               if (numa_node < 0) {
+                       nthreads = ncpus;
+               } else {
+                       for (i = 0; i < ncpus; i++) {
+                               if (cpu_is_local(i))
+                                       nthreads++;
+                       }
+                       if (!nthreads)
+                               err(EXIT_FAILURE, "No online CPUs for this 
node");
+               }
+       } else {
+               int cpu_available = 0;
+
+               for (i = 0; i < ncpus && !cpu_available; i++) {
+                       if (cpu_is_local(i))
+                               cpu_available = 1;
+               }
+               if (!cpu_available)
+                       err(EXIT_FAILURE, "No online CPUs for this node");
+       }
+
+       if (numa_node >= 0) {
+               ret = numa_run_on_node(numa_node);
+               if (ret < 0)
+                       err(EXIT_FAILURE, "numa_run_on_node");
+               ret = asprintf(&node_str, " on node %d", numa_node);
+               if (ret < 0)
+                       err(EXIT_FAILURE, "numa_node, asprintf");
+       }
 
-       worker = calloc(nthreads, sizeof(*worker));
+       worker = numa_alloc_local(nthreads * sizeof(*worker));
        if (!worker)
                goto errmem;
 
        if (!fshared)
                futex_flag = FUTEX_PRIVATE_FLAG;
 
-       printf("Run summary [PID %d]: %d threads, each operating on %d [%s] 
futexes for %d secs.\n\n",
-              getpid(), nthreads, nfutexes, fshared ? "shared":"private", 
nsecs);
+       printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] 
futexes for %d secs.\n\n",
+              getpid(), nthreads,
+              node_str ? : "",
+              nfutexes, fshared ? "shared":"private",
+              nsecs);
 
        init_stats(&throughput_stats);
        pthread_mutex_init(&thread_lock, NULL);
@@ -149,14 +206,24 @@ int bench_futex_hash(int argc, const char **argv,
        threads_starting = nthreads;
        pthread_attr_init(&thread_attr);
        gettimeofday(&start, NULL);
-       for (i = 0; i < nthreads; i++) {
+       for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
+
+               do {
+                       if (cpu_is_local(cpunum))
+                               break;
+                       cpunum++;
+                       if (cpunum > ncpus)
+                               cpunum = 0;
+               } while (1);
+
                worker[i].tid = i;
-               worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
+               worker[i].futex = numa_alloc_local(nfutexes *
+                                                  sizeof(*worker[i].futex));
                if (!worker[i].futex)
                        goto errmem;
 
                CPU_ZERO(&cpu);
-               CPU_SET(i % ncpus, &cpu);
+               CPU_SET(cpunum % ncpus, &cpu);
 
                ret = pthread_attr_setaffinity_np(&thread_attr, 
sizeof(cpu_set_t), &cpu);
                if (ret)
@@ -203,12 +270,12 @@ int bench_futex_hash(int argc, const char **argv,
                                       &worker[i].futex[nfutexes-1], t);
                }
 
-               free(worker[i].futex);
+               numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
        }
 
        print_summary();
 
-       free(worker);
+       numa_free(worker, nthreads * sizeof(*worker));
        return ret;
 errmem:
        err(EXIT_FAILURE, "calloc");
-- 
2.1.4



Reply via email to