Em Fri, Oct 18, 2013 at 10:38:48AM -0400, Waiman Long escreveu:
> When callgraph data is included in the perf data file, it may take a
> long time to scan all of that data and merge it together, especially
> if the stored callchains are long and the perf data file itself is
> large, e.g. a gigabyte or so.
> 
> The callchain stack is currently limited to PERF_MAX_STACK_DEPTH (127).
> This is a large value. Usually the callgraph data that developers are
> most interested in is the first few levels; the rest is usually
> not looked at.
> 
> This patch adds a new --max-stack option to perf-report to limit the
> depth of callchain stack data to look at to reduce the time it takes
> for perf-report to finish its processing. It trades the presence of
> trailing stack information for faster speed.
> 
> The following table shows the elapsed time of doing perf-report on a
> perf.data file of size 985,531,828 bytes.
> 
> --max-stack   Elapsed Time    Output data size
> -----------   ------------    ----------------

Please prefix lines like this (------) with a space, otherwise 'git am'
will chop off everything from that line onwards. Fixing it up now.

- Arnaldo

> not set                  88.0s        124,422,651
> 64               87.5s        116,303,213
> 32               87.2s        112,023,804
> 16               86.6s         94,326,380
> 8                59.9s         33,697,248
> 4                40.7s         10,116,637
> -g none                  27.1s          2,555,810
> 
> Signed-off-by: Waiman Long <waiman.l...@hp.com>
> ---
>  tools/perf/Documentation/perf-report.txt |    8 ++++++++
>  tools/perf/builtin-report.c              |   22 +++++++++++++++++-----
>  tools/perf/builtin-top.c                 |    3 ++-
>  tools/perf/util/machine.c                |   14 +++++++++-----
>  tools/perf/util/machine.h                |    3 ++-
>  tools/perf/util/session.c                |    3 ++-
>  6 files changed, 40 insertions(+), 13 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-report.txt 
> b/tools/perf/Documentation/perf-report.txt
> index 2b8097e..be3f196 100644
> --- a/tools/perf/Documentation/perf-report.txt
> +++ b/tools/perf/Documentation/perf-report.txt
> @@ -135,6 +135,14 @@ OPTIONS
>  
>       Default: fractal,0.5,callee,function.
>  
> +--max-stack::
> +     Set the stack depth limit when parsing the callchain, anything
> +     beyond the specified depth will be ignored. This is a trade-off
> +     between information loss and faster processing especially for
> +     workloads that can have a very long callchain stack.
> +
> +     Default: 127
> +
>  -G::
>  --inverted::
>          alias for inverted caller based call graph.
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index 72eae74..d0c9504 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -47,6 +47,7 @@ struct perf_report {
>       bool                    show_threads;
>       bool                    inverted_callchain;
>       bool                    mem_mode;
> +     int                     max_stack;
>       struct perf_read_values show_threads_values;
>       const char              *pretty_printing_style;
>       const char              *cpu_list;
> @@ -88,7 +89,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool 
> *tool,
>       if ((sort__has_parent || symbol_conf.use_callchain) &&
>           sample->callchain) {
>               err = machine__resolve_callchain(machine, evsel, al->thread,
> -                                              sample, &parent, al);
> +                                              sample, &parent, al,
> +                                              rep->max_stack);
>               if (err)
>                       return err;
>       }
> @@ -179,7 +181,8 @@ static int perf_report__add_branch_hist_entry(struct 
> perf_tool *tool,
>       if ((sort__has_parent || symbol_conf.use_callchain)
>           && sample->callchain) {
>               err = machine__resolve_callchain(machine, evsel, al->thread,
> -                                              sample, &parent, al);
> +                                              sample, &parent, al,
> +                                              rep->max_stack);
>               if (err)
>                       return err;
>       }
> @@ -242,18 +245,21 @@ out:
>       return err;
>  }
>  
> -static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
> +static int perf_evsel__add_hist_entry(struct perf_tool *tool,
> +                                   struct perf_evsel *evsel,
>                                     struct addr_location *al,
>                                     struct perf_sample *sample,
>                                     struct machine *machine)
>  {
> +     struct perf_report *rep = container_of(tool, struct perf_report, tool);
>       struct symbol *parent = NULL;
>       int err = 0;
>       struct hist_entry *he;
>  
>       if ((sort__has_parent || symbol_conf.use_callchain) && 
> sample->callchain) {
>               err = machine__resolve_callchain(machine, evsel, al->thread,
> -                                              sample, &parent, al);
> +                                              sample, &parent, al,
> +                                              rep->max_stack);
>               if (err)
>                       return err;
>       }
> @@ -330,7 +336,8 @@ static int process_sample_event(struct perf_tool *tool,
>               if (al.map != NULL)
>                       al.map->dso->hit = 1;
>  
> -             ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
> +             ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
> +                                              machine);
>               if (ret < 0)
>                       pr_debug("problem incrementing symbol period, skipping 
> event\n");
>       }
> @@ -757,6 +764,7 @@ int cmd_report(int argc, const char **argv, const char 
> *prefix __maybe_unused)
>                       .ordered_samples = true,
>                       .ordering_requires_timestamps = true,
>               },
> +             .max_stack               = PERF_MAX_STACK_DEPTH,
>               .pretty_printing_style   = "normal",
>       };
>       const struct option options[] = {
> @@ -797,6 +805,10 @@ int cmd_report(int argc, const char **argv, const char 
> *prefix __maybe_unused)
>       OPT_CALLBACK_DEFAULT('g', "call-graph", &report, 
> "output_type,min_percent[,print_limit],call_order",
>                    "Display callchains using output_type (graph, flat, 
> fractal, or none) , min percent threshold, optional print limit, callchain 
> order, key (function or address). "
>                    "Default: fractal,0.5,callee,function", 
> &parse_callchain_opt, callchain_default_opt),
> +     OPT_INTEGER(0, "max-stack", &report.max_stack,
> +                 "Set the maximum stack depth when parsing the callchain, "
> +                 "anything beyond the specified depth will be ignored. "
> +                 "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
>       OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
>                   "alias for inverted call graph"),
>       OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
> diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
> index 2122141..2725aca 100644
> --- a/tools/perf/builtin-top.c
> +++ b/tools/perf/builtin-top.c
> @@ -771,7 +771,8 @@ static void perf_event__process_sample(struct perf_tool 
> *tool,
>                   sample->callchain) {
>                       err = machine__resolve_callchain(machine, evsel,
>                                                        al.thread, sample,
> -                                                      &parent, &al);
> +                                                      &parent, &al,
> +                                                      PERF_MAX_STACK_DEPTH);
>                       if (err)
>                               return;
>               }
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 6188d28..9617c4a 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -1267,10 +1267,12 @@ static int machine__resolve_callchain_sample(struct 
> machine *machine,
>                                            struct thread *thread,
>                                            struct ip_callchain *chain,
>                                            struct symbol **parent,
> -                                          struct addr_location *root_al)
> +                                          struct addr_location *root_al,
> +                                          int max_stack)
>  {
>       u8 cpumode = PERF_RECORD_MISC_USER;
> -     unsigned int i;
> +     int chain_nr = min(max_stack, (int)chain->nr);
> +     int i;
>       int err;
>  
>       callchain_cursor_reset(&callchain_cursor);
> @@ -1280,7 +1282,7 @@ static int machine__resolve_callchain_sample(struct 
> machine *machine,
>               return 0;
>       }
>  
> -     for (i = 0; i < chain->nr; i++) {
> +     for (i = 0; i < chain_nr; i++) {
>               u64 ip;
>               struct addr_location al;
>  
> @@ -1352,12 +1354,14 @@ int machine__resolve_callchain(struct machine 
> *machine,
>                              struct thread *thread,
>                              struct perf_sample *sample,
>                              struct symbol **parent,
> -                            struct addr_location *root_al)
> +                            struct addr_location *root_al,
> +                            int max_stack)
>  {
>       int ret;
>  
>       ret = machine__resolve_callchain_sample(machine, thread,
> -                                             sample->callchain, parent, 
> root_al);
> +                                             sample->callchain, parent,
> +                                             root_al, max_stack);
>       if (ret)
>               return ret;
>  
> diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
> index 58a6be1..d09cce0 100644
> --- a/tools/perf/util/machine.h
> +++ b/tools/perf/util/machine.h
> @@ -91,7 +91,8 @@ int machine__resolve_callchain(struct machine *machine,
>                              struct thread *thread,
>                              struct perf_sample *sample,
>                              struct symbol **parent,
> -                            struct addr_location *root_al);
> +                            struct addr_location *root_al,
> +                            int max_stack);
>  
>  /*
>   * Default guest kernel is defined by parameter --guestkallsyms
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 568b750..96e5449 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1525,7 +1525,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, 
> union perf_event *event,
>       if (symbol_conf.use_callchain && sample->callchain) {
>  
>               if (machine__resolve_callchain(machine, evsel, al.thread,
> -                                            sample, NULL, NULL) != 0) {
> +                                            sample, NULL, NULL,
> +                                            PERF_MAX_STACK_DEPTH) != 0) {
>                       if (verbose)
>                               error("Failed to resolve callchain. 
> Skipping\n");
>                       return;
> -- 
> 1.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to