When callgraph data was included in the perf data file, it may take a
long time to scan all those data and merge them together especially
if the stored callchains are long and the perf data file itself is
large, like a Gbyte or so.

The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
This is a large value. Usually the callgraph data that developers are
most interested in are the first few levels, the rests are usually
not looked at.

This patch adds a new --max-stack option to perf-report to limit the
depth of callchain stack data to look at to reduce the time it takes
for perf-report to finish its processing. It trades the presence of
trailing stack information with faster speed.

The following table shows the elapsed time of doing perf-report on a
perf.data file of size 985,531,828 bytes.

--max_stack     Elapsed Time    Output data size
-----------     ------------    ----------------
not set            88.0s        124,422,651
64                 87.5s        116,303,213
32                 87.2s        112,023,804
16                 86.6s         94,326,380
8                  59.9s         33,697,248
4                  40.7s         10,116,637
-g none            27.1s          2,555,810

Signed-off-by: Waiman Long <waiman.l...@hp.com>
---
 tools/perf/Documentation/perf-report.txt |    8 ++++++++
 tools/perf/builtin-report.c              |   22 +++++++++++++++++-----
 tools/perf/builtin-top.c                 |    3 ++-
 tools/perf/util/machine.c                |   14 +++++++++-----
 tools/perf/util/machine.h                |    3 ++-
 tools/perf/util/session.c                |    3 ++-
 6 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt 
b/tools/perf/Documentation/perf-report.txt
index 2b8097e..be3f196 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -135,6 +135,14 @@ OPTIONS
 
        Default: fractal,0.5,callee,function.
 
+--max-stack::
+       Set the stack depth limit when parsing the callchain, anything
+       beyond the specified depth will be ignored. This is a trade-off
+       between information loss and faster processing especially for
+       workloads that can have a very long callchain stack.
+
+       Default: 127
+
 -G::
 --inverted::
         alias for inverted caller based call graph.
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 72eae74..d0c9504 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -47,6 +47,7 @@ struct perf_report {
        bool                    show_threads;
        bool                    inverted_callchain;
        bool                    mem_mode;
+       int                     max_stack;
        struct perf_read_values show_threads_values;
        const char              *pretty_printing_style;
        const char              *cpu_list;
@@ -88,7 +89,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool 
*tool,
        if ((sort__has_parent || symbol_conf.use_callchain) &&
            sample->callchain) {
                err = machine__resolve_callchain(machine, evsel, al->thread,
-                                                sample, &parent, al);
+                                                sample, &parent, al,
+                                                rep->max_stack);
                if (err)
                        return err;
        }
@@ -179,7 +181,8 @@ static int perf_report__add_branch_hist_entry(struct 
perf_tool *tool,
        if ((sort__has_parent || symbol_conf.use_callchain)
            && sample->callchain) {
                err = machine__resolve_callchain(machine, evsel, al->thread,
-                                                sample, &parent, al);
+                                                sample, &parent, al,
+                                                rep->max_stack);
                if (err)
                        return err;
        }
@@ -242,18 +245,21 @@ out:
        return err;
 }
 
-static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
+static int perf_evsel__add_hist_entry(struct perf_tool *tool,
+                                     struct perf_evsel *evsel,
                                      struct addr_location *al,
                                      struct perf_sample *sample,
                                      struct machine *machine)
 {
+       struct perf_report *rep = container_of(tool, struct perf_report, tool);
        struct symbol *parent = NULL;
        int err = 0;
        struct hist_entry *he;
 
        if ((sort__has_parent || symbol_conf.use_callchain) && 
sample->callchain) {
                err = machine__resolve_callchain(machine, evsel, al->thread,
-                                                sample, &parent, al);
+                                                sample, &parent, al,
+                                                rep->max_stack);
                if (err)
                        return err;
        }
@@ -330,7 +336,8 @@ static int process_sample_event(struct perf_tool *tool,
                if (al.map != NULL)
                        al.map->dso->hit = 1;
 
-               ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
+               ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
+                                                machine);
                if (ret < 0)
                        pr_debug("problem incrementing symbol period, skipping 
event\n");
        }
@@ -757,6 +764,7 @@ int cmd_report(int argc, const char **argv, const char 
*prefix __maybe_unused)
                        .ordered_samples = true,
                        .ordering_requires_timestamps = true,
                },
+               .max_stack               = PERF_MAX_STACK_DEPTH,
                .pretty_printing_style   = "normal",
        };
        const struct option options[] = {
@@ -797,6 +805,10 @@ int cmd_report(int argc, const char **argv, const char 
*prefix __maybe_unused)
        OPT_CALLBACK_DEFAULT('g', "call-graph", &report, 
"output_type,min_percent[,print_limit],call_order",
                     "Display callchains using output_type (graph, flat, 
fractal, or none) , min percent threshold, optional print limit, callchain 
order, key (function or address). "
                     "Default: fractal,0.5,callee,function", 
&parse_callchain_opt, callchain_default_opt),
+       OPT_INTEGER(0, "max-stack", &report.max_stack,
+                   "Set the maximum stack depth when parsing the callchain, "
+                   "anything beyond the specified depth will be ignored. "
+                   "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
        OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
                    "alias for inverted call graph"),
        OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2122141..2725aca 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -771,7 +771,8 @@ static void perf_event__process_sample(struct perf_tool 
*tool,
                    sample->callchain) {
                        err = machine__resolve_callchain(machine, evsel,
                                                         al.thread, sample,
-                                                        &parent, &al);
+                                                        &parent, &al,
+                                                        PERF_MAX_STACK_DEPTH);
                        if (err)
                                return;
                }
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 6188d28..9617c4a 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1267,10 +1267,12 @@ static int machine__resolve_callchain_sample(struct 
machine *machine,
                                             struct thread *thread,
                                             struct ip_callchain *chain,
                                             struct symbol **parent,
-                                            struct addr_location *root_al)
+                                            struct addr_location *root_al,
+                                            int max_stack)
 {
        u8 cpumode = PERF_RECORD_MISC_USER;
-       unsigned int i;
+       int chain_nr = min(max_stack, (int)chain->nr);
+       int i;
        int err;
 
        callchain_cursor_reset(&callchain_cursor);
@@ -1280,7 +1282,7 @@ static int machine__resolve_callchain_sample(struct 
machine *machine,
                return 0;
        }
 
-       for (i = 0; i < chain->nr; i++) {
+       for (i = 0; i < chain_nr; i++) {
                u64 ip;
                struct addr_location al;
 
@@ -1352,12 +1354,14 @@ int machine__resolve_callchain(struct machine *machine,
                               struct thread *thread,
                               struct perf_sample *sample,
                               struct symbol **parent,
-                              struct addr_location *root_al)
+                              struct addr_location *root_al,
+                              int max_stack)
 {
        int ret;
 
        ret = machine__resolve_callchain_sample(machine, thread,
-                                               sample->callchain, parent, 
root_al);
+                                               sample->callchain, parent,
+                                               root_al, max_stack);
        if (ret)
                return ret;
 
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 58a6be1..d09cce0 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -91,7 +91,8 @@ int machine__resolve_callchain(struct machine *machine,
                               struct thread *thread,
                               struct perf_sample *sample,
                               struct symbol **parent,
-                              struct addr_location *root_al);
+                              struct addr_location *root_al,
+                              int max_stack);
 
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 568b750..96e5449 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1525,7 +1525,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union 
perf_event *event,
        if (symbol_conf.use_callchain && sample->callchain) {
 
                if (machine__resolve_callchain(machine, evsel, al.thread,
-                                              sample, NULL, NULL) != 0) {
+                                              sample, NULL, NULL,
+                                              PERF_MAX_STACK_DEPTH) != 0) {
                        if (verbose)
                                error("Failed to resolve callchain. 
Skipping\n");
                        return;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to