This patch adds optional pagefault tracing support to 'perf trace'.
Using the -F/--pf option, the user can specify whether minor, major or
all pagefault events should be traced. This patch adds only live mode;
record and replay will come in a separate patch.

Example output:
1756272.905 ( 0.000 ms): curl/5937 majfault [0x7fa7261978b6] => /usr/lib/x86_64-linux-gnu/libkrb5.so.26.0.0+0x85288 (d.)
1862866.036 ( 0.000 ms): wget/8460 majfault [__clear_user+0x3f] => 0x659cb4 (?k)

Signed-off-by: Stanislav Fomichev <stfomic...@yandex-team.ru>
---
 tools/perf/Documentation/perf-trace.txt |  12 ++++
 tools/perf/builtin-trace.c              | 116 +++++++++++++++++++++++++++++++-
 2 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index fae38d9a44a4..8e5f710aa45d 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -107,6 +107,18 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
        Show tool stats such as number of times fd->pathname was discovered thru
        hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.
 
+-F=[all|min|maj]::
+--pf=[all|min|maj]::
+       Trace pagefaults. Optionally, you can specify whether you want minor,
+       major or all pagefaults. Default value is maj.
+
+EXAMPLES
+--------
+
+Trace syscalls, major and minor pagefaults:
+
+ $ perf trace -F all
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script[1]
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index a9b542918da0..a80aae2bba40 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1177,6 +1177,9 @@ fail:
        return NULL;
 }
 
+/* Bit flags kept in struct trace's trace_pgfaults member. */
+#define TRACE_PFMAJ            (1 << 0)       /* trace major pagefaults */
+#define TRACE_PFMIN            (1 << 1)       /* trace minor pagefaults */
+
 struct trace {
        struct perf_tool        tool;
        struct {
@@ -1211,6 +1214,7 @@ struct trace {
        bool                    summary_only;
        bool                    show_comm;
        bool                    show_tool_stats;
+       int                     trace_pgfaults;
 };
 
 static int trace__set_fd_pathname(struct thread *thread, int fd, const char 
*pathname)
@@ -1773,6 +1777,59 @@ out_dump:
        return 0;
 }
 
+/*
+ * Sample handler for the pagefault events: prints one line per fault to
+ * trace->output.
+ *
+ * The line consists of the usual entry head, "majfault" or "minfault"
+ * (chosen from evsel->attr.config), the faulting instruction printed as
+ * [symbol+offset] or as [raw ip] when no symbol was resolved, " => ", and
+ * the accessed address printed as dso+offset or as the raw address when no
+ * map was found, followed by "(<map type><level>)".
+ *
+ * Map type is 'd' when sample->addr resolves through a MAP__VARIABLE lookup,
+ * 'x' when it only resolves through a MAP__FUNCTION lookup and '?' when
+ * neither lookup yields a map.
+ *
+ * Returns 0 once the line is printed, -1 when no thread is available for
+ * the sample.
+ */
+static int trace__pgfault(struct trace *trace,
+                         struct perf_evsel *evsel,
+                         union perf_event *event,
+                         struct perf_sample *sample)
+{
+       struct thread *thread;
+       u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+       struct addr_location al;
+       char map_type = 'd';
+
+       thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+       /*
+        * NOTE(review): machine__findnew_thread() is assumed to return NULL
+        * on failure; every call below is handed the thread, so stop here
+        * rather than pass a NULL pointer on.
+        */
+       if (thread == NULL)
+               return -1;
+
+       thread__find_addr_location(thread, trace->host, cpumode, MAP__FUNCTION,
+                             sample->ip, &al);
+
+       trace__fprintf_entry_head(trace, thread, 0, sample->time,
+                                 trace->output);
+
+       fprintf(trace->output, "%sfault ",
+               evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
+               "maj" : "min");
+
+       /*
+        * Addresses are cast to unsigned long long and printed with %llx so
+        * that format and argument agree whatever the width of long is
+        * (NOTE(review): ip/addr/start are presumably u64 -- confirm).
+        */
+       if (al.sym)
+               fprintf(trace->output, "[%s+0x%llx]", al.sym->name,
+                       (unsigned long long)(al.addr - al.sym->start));
+       else
+               fprintf(trace->output, "[0x%llx]",
+                       (unsigned long long)sample->ip);
+
+       fprintf(trace->output, " => ");
+
+       /* Look the accessed address up as data first ... */
+       thread__find_addr_map(thread, trace->host, cpumode, MAP__VARIABLE,
+                             sample->addr, &al);
+
+       /* ... and fall back to the executable maps when that finds nothing. */
+       if (!al.map) {
+               thread__find_addr_map(thread, trace->host, cpumode,
+                                     MAP__FUNCTION, sample->addr, &al);
+
+               if (al.map)
+                       map_type = 'x';
+       }
+
+       if (al.map) {
+               fprintf(trace->output, "%s+0x%llx",
+                       al.map->dso->long_name, (unsigned long long)al.addr);
+       } else {
+               map_type = '?';
+               fprintf(trace->output, "0x%llx",
+                       (unsigned long long)sample->addr);
+       }
+
+       /* al.level comes from the last address lookup performed above. */
+       fprintf(trace->output, " (%c%c)\n", map_type, al.level);
+
+       return 0;
+}
+
 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
 {
        if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
@@ -1887,6 +1944,30 @@ static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
        perf_evlist__add(evlist, evsel);
 }
 
+/*
+ * Create a PERF_TYPE_SOFTWARE event for the given config, with mmap_data
+ * set and a sample period of 1, install trace__pgfault as its handler and
+ * add it to evlist.
+ *
+ * Returns 0 on success or -ENOMEM when the evsel cannot be created.
+ */
+static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
+                                   u64 config)
+{
+       struct perf_event_attr attr = {
+               .type = PERF_TYPE_SOFTWARE,
+               .config = config,
+               .mmap_data = 1,
+               .sample_period = 1,
+       };
+       struct perf_evsel *pgfault_evsel;
+
+       event_attr_init(&attr);
+
+       pgfault_evsel = perf_evsel__new(&attr);
+       if (pgfault_evsel == NULL)
+               return -ENOMEM;
+
+       pgfault_evsel->handler = trace__pgfault;
+       perf_evlist__add(evlist, pgfault_evsel);
+
+       return 0;
+}
+
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
        struct perf_evlist *evlist = perf_evlist__new();
@@ -1907,6 +1988,14 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 
        perf_evlist__add_vfs_getname(evlist);
 
+       if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
+           perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ))
+               goto out_error_tp;
+
+       if ((trace->trace_pgfaults & TRACE_PFMIN) &&
+           perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
+               goto out_error_tp;
+
        if (trace->sched &&
                perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
                                trace__sched_stat_runtime))
@@ -1987,7 +2076,8 @@ again:
                                goto next_event;
                        }
 
-                       if (sample.raw_data == NULL) {
+                       if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
+                           sample.raw_data == NULL) {
                                fprintf(trace->output, "%s sample with no 
payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
                                       perf_evsel__name(evsel), sample.tid,
                                       sample.cpu, sample.raw_size);
@@ -2269,6 +2359,23 @@ static int trace__open_output(struct trace *trace, const char *filename)
        return trace->output == NULL ? -errno : 0;
 }
 
+/*
+ * Option callback for -F/--pf: translates the "all", "maj" or "min" argument
+ * into TRACE_PFMAJ/TRACE_PFMIN bits and ORs them into the int that
+ * opt->value points to.
+ *
+ * A negated option (unset != 0) clears the bits collected so far.
+ *
+ * Returns 0 on success, -1 when the argument is missing or not one of the
+ * three accepted words.
+ */
+static int parse_pagefaults(const struct option *opt, const char *str,
+                           int unset)
+{
+       int *trace_pgfaults = opt->value;
+
+       /*
+        * NOTE(review): the option parser is assumed to invoke the callback
+        * with str == NULL for the negated form -- handle it before any
+        * string comparison.
+        */
+       if (unset) {
+               *trace_pgfaults = 0;
+               return 0;
+       }
+
+       /* strcmp() must never be handed a NULL pointer. */
+       if (str == NULL)
+               return -1;
+
+       if (strcmp(str, "all") == 0)
+               *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
+       else if (strcmp(str, "maj") == 0)
+               *trace_pgfaults |= TRACE_PFMAJ;
+       else if (strcmp(str, "min") == 0)
+               *trace_pgfaults |= TRACE_PFMIN;
+       else
+               return -1;
+
+       return 0;
+}
+
 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 {
        const char * const trace_usage[] = {
@@ -2335,6 +2442,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                    "Show only syscall summary with statistics"),
        OPT_BOOLEAN('S', "with-summary", &trace.summary,
                    "Show all syscalls and summary with statistics"),
+       OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
+                    "Trace pagefaults", parse_pagefaults, "maj"),
        OPT_END()
        };
        int err;
@@ -2349,6 +2458,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
        if (trace.summary_only)
                trace.summary = trace.summary_only;
 
+       if (trace.trace_pgfaults) {
+               trace.opts.sample_address = true;
+               trace.opts.sample_time = true;
+       }
+
        if (output_name != NULL) {
                err = trace__open_output(&trace, output_name);
                if (err < 0) {
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to