On 02/18/2015 02:58 PM, Martin Liška wrote:
On 02/17/2015 10:03 PM, Jan Hubicka wrote:
Hi,
this patch should chase away the expensive thunks and aliases walks from most
of analysis code. I think the only real use left is the local_p predicate, which
needs to stay because i386 expects the local flag to match between caller and
callee when expanding an assembler thunk. I at least optimized it by first moving the walk to
be conditional for nonlocal functions only and then reorganizing
call_for_symbol_thunks_and_aliases to first inspect aliases (that is cheap) and
only then work on thunks. Most likely this will find the non-local thunk/alias
faster. The other cases were leftovers from the conversion of thunks from aliases
to functions.
I also noticed a bug in ipa-profile that does not disable all the
transforms with !ipa_profile_flag used on OPTIMIZATION_NODE and fixed it.
Bootstrapped/regtested x86_64-linux, committed. I would be interested to
know if the call_for_symbol_thunks_and_aliases is now off your oprofiles
(sorry, easier to type than perf-profiles)
Honza
* ipa-visibility.c (function_and_variable_visibility): Only
check locality if node is not already local.
* ipa-inline.c (want_inline_function_to_all_callers_p): Use
call_for_symbol_and_aliases instead of
call_for_symbol_thunks_and_aliases.
(ipa_inline): Likewise.
* cgraph.c (cgraph_node::call_for_symbol_thunks_and_aliases):
first walk aliases.
* ipa.c (symbol_table::remove_unreachable_nodes): Use
call_for_symbol_and_aliases.
* ipa-profile.c (ipa_propagate_frequency_data): Add function_symbol.
(ipa_propagate_frequency_1): Use it; use opt_for_fn.
(ipa_propagate_frequency): Update.
(ipa_profile): Add opt_for_fn guards.
Index: ipa-visibility.c
===================================================================
--- ipa-visibility.c (revision 220741)
+++ ipa-visibility.c (working copy)
@@ -595,7 +595,8 @@ function_and_variable_visibility (bool w
}
FOR_EACH_DEFINED_FUNCTION (node)
{
- node->local.local |= node->local_p ();
+ if (!node->local.local)
+ node->local.local |= node->local_p ();
/* If we know that function can not be overwritten by a different
semantics
and moreover its section can not be discarded, replace all direct calls
Index: ipa-inline.c
===================================================================
--- ipa-inline.c (revision 220741)
+++ ipa-inline.c (working copy)
@@ -975,14 +975,14 @@ want_inline_function_to_all_callers_p (s
if (node->global.inlined_to)
return false;
/* Does it have callers? */
- if (!node->call_for_symbol_thunks_and_aliases (has_caller_p, NULL, true))
+ if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true))
return false;
/* Inlining into all callers would increase size? */
if (estimate_growth (node) > 0)
return false;
/* All inlines must be possible. */
- if (node->call_for_symbol_thunks_and_aliases (check_callers, &has_hot_call,
- true))
+ if (node->call_for_symbol_and_aliases (check_callers, &has_hot_call,
+ true))
return false;
if (!cold && !has_hot_call)
return false;
@@ -2359,9 +2359,9 @@ ipa_inline (void)
if (want_inline_function_to_all_callers_p (node, cold))
{
int num_calls = 0;
- node->call_for_symbol_thunks_and_aliases (sum_callers, &num_calls,
- true);
- while (node->call_for_symbol_thunks_and_aliases
+ node->call_for_symbol_and_aliases (sum_callers, &num_calls,
+ true);
+ while (node->call_for_symbol_and_aliases
(inline_to_all_callers, &num_calls, true))
;
remove_functions = true;
Index: cgraph.c
===================================================================
--- cgraph.c (revision 220741)
+++ cgraph.c (working copy)
@@ -2191,6 +2191,16 @@ cgraph_node::call_for_symbol_thunks_and_
if (callback (this, data))
return true;
+ FOR_EACH_ALIAS (this, ref)
+ {
+ cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
+ if (include_overwritable
+ || alias->get_availability () > AVAIL_INTERPOSABLE)
+ if (alias->call_for_symbol_thunks_and_aliases (callback, data,
+ include_overwritable,
+ exclude_virtual_thunks))
+ return true;
+ }
for (e = callers; e; e = e->next_caller)
if (e->caller->thunk.thunk_p
&& (include_overwritable
@@ -2202,16 +2212,6 @@ cgraph_node::call_for_symbol_thunks_and_
exclude_virtual_thunks))
return true;
- FOR_EACH_ALIAS (this, ref)
- {
- cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
- if (include_overwritable
- || alias->get_availability () > AVAIL_INTERPOSABLE)
- if (alias->call_for_symbol_thunks_and_aliases (callback, data,
- include_overwritable,
- exclude_virtual_thunks))
- return true;
- }
return false;
}
Index: ipa.c
===================================================================
--- ipa.c (revision 220741)
+++ ipa.c (working copy)
@@ -661,7 +661,7 @@ symbol_table::remove_unreachable_nodes (
if (node->address_taken
&& !node->used_from_other_partition)
{
- if (!node->call_for_symbol_thunks_and_aliases
+ if (!node->call_for_symbol_and_aliases
(has_addr_references_p, NULL, true)
&& (!node->instrumentation_clone
|| !node->instrumented_version
Index: ipa-profile.c
===================================================================
--- ipa-profile.c (revision 220741)
+++ ipa-profile.c (working copy)
@@ -322,6 +322,7 @@ ipa_profile_read_summary (void)
struct ipa_propagate_frequency_data
{
+ cgraph_node *function_symbol;
bool maybe_unlikely_executed;
bool maybe_executed_once;
bool only_called_at_startup;
@@ -342,7 +343,7 @@ ipa_propagate_frequency_1 (struct cgraph
|| d->only_called_at_startup || d->only_called_at_exit);
edge = edge->next_caller)
{
- if (edge->caller != node)
+ if (edge->caller != d->function_symbol)
{
d->only_called_at_startup &= edge->caller->only_called_at_startup;
/* It makes sense to put main() together with the static constructors.
@@ -358,7 +359,11 @@ ipa_propagate_frequency_1 (struct cgraph
errors can make us to push function into unlikely section even when
it is executed by the train run. Transfer the function only if all
callers are unlikely executed. */
- if (profile_info && flag_branch_probabilities
+ if (profile_info
+ && opt_for_fn (d->function_symbol->decl, flag_branch_probabilities)
+ /* Thunks are not profiled. This is more or less implementation
+ bug. */
+ && !d->function_symbol->thunk.thunk_p
&& (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED
|| (edge->caller->global.inlined_to
&& edge->caller->global.inlined_to->frequency
@@ -418,7 +423,7 @@ contains_hot_call_p (struct cgraph_node
bool
ipa_propagate_frequency (struct cgraph_node *node)
{
- struct ipa_propagate_frequency_data d = {true, true, true, true};
+ struct ipa_propagate_frequency_data d = {node, true, true, true, true};
bool changed = false;
/* We can not propagate anything useful about externally visible functions
@@ -432,8 +437,8 @@ ipa_propagate_frequency (struct cgraph_n
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Processing frequency %s\n", node->name ());
- node->call_for_symbol_thunks_and_aliases (ipa_propagate_frequency_1, &d,
- true);
+ node->call_for_symbol_and_aliases (ipa_propagate_frequency_1, &d,
+ true);
if ((d.only_called_at_startup && !d.only_called_at_exit)
&& !node->only_called_at_startup)
@@ -597,6 +602,9 @@ ipa_profile (void)
{
bool update = false;
+ if (!opt_for_fn (n->decl, flag_ipa_profile))
+ continue;
+
for (e = n->indirect_calls; e; e = e->next_callee)
{
if (n->count)
@@ -697,7 +705,9 @@ ipa_profile (void)
order_pos = ipa_reverse_postorder (order);
for (i = order_pos - 1; i >= 0; i--)
{
- if (order[i]->local.local && ipa_propagate_frequency (order[i]))
+ if (order[i]->local.local
+ && opt_for_fn (order[i]->decl, flag_ipa_profile)
+ && ipa_propagate_frequency (order[i]))
{
for (e = order[i]->callees; e; e = e->next_callee)
if (e->callee->local.local && !e->callee->aux)
@@ -714,7 +724,9 @@ ipa_profile (void)
something_changed = false;
for (i = order_pos - 1; i >= 0; i--)
{
- if (order[i]->aux && ipa_propagate_frequency (order[i]))
+ if (order[i]->aux
+ && opt_for_fn (order[i]->decl, flag_ipa_profile)
+ && ipa_propagate_frequency (order[i]))
{
for (e = order[i]->callees; e; e = e->next_callee)
if (e->callee->local.local && !e->callee->aux)
Hi.
Here are the perf report and the -ftime-report output for the WPA phase.
Martin
Hm, using the same compiler, the Firefox LTO time statistics and the perf report
are very different.
I'm wondering how that can be possible?
Martin
Execution times (seconds)
phase setup : 0.00 ( 0%) usr 0.00 ( 0%) sys 0.01 ( 0%) wall
1988 kB ( 0%) ggc
phase opt and generate : 42.32 (70%) usr 0.85 (56%) sys 43.16 (69%) wall
1387464 kB (28%) ggc
phase stream in : 18.50 (30%) usr 0.68 (44%) sys 19.17 (31%) wall
3528077 kB (72%) ggc
garbage collection : 2.24 ( 4%) usr 0.00 ( 0%) sys 2.24 ( 4%) wall
0 kB ( 0%) ggc
callgraph optimization : 0.37 ( 1%) usr 0.00 ( 0%) sys 0.37 ( 1%) wall
38 kB ( 0%) ggc
ipa dead code removal : 3.06 ( 5%) usr 0.01 ( 1%) sys 2.88 ( 5%) wall
0 kB ( 0%) ggc
ipa virtual call target : 5.72 ( 9%) usr 0.06 ( 4%) sys 5.87 ( 9%) wall
0 kB ( 0%) ggc
ipa devirtualization : 0.18 ( 0%) usr 0.00 ( 0%) sys 0.23 ( 0%) wall
22382 kB ( 0%) ggc
ipa cp : 2.88 ( 5%) usr 0.09 ( 6%) sys 2.97 ( 5%) wall
515623 kB (10%) ggc
ipa inlining heuristics : 13.96 (23%) usr 0.13 ( 8%) sys 14.12 (23%) wall
471848 kB (10%) ggc
ipa comdats : 0.12 ( 0%) usr 0.00 ( 0%) sys 0.12 ( 0%) wall
0 kB ( 0%) ggc
ipa lto gimple in : 2.54 ( 4%) usr 0.48 (31%) sys 3.23 ( 5%) wall
645652 kB (13%) ggc
ipa lto decl in : 12.64 (21%) usr 0.37 (24%) sys 13.01 (21%) wall
2592737 kB (53%) ggc
ipa lto constructors in : 0.17 ( 0%) usr 0.01 ( 1%) sys 0.20 ( 0%) wall
16493 kB ( 0%) ggc
ipa lto cgraph I/O : 0.58 ( 1%) usr 0.09 ( 6%) sys 0.67 ( 1%) wall
437504 kB ( 9%) ggc
ipa lto decl merge : 1.90 ( 3%) usr 0.00 ( 0%) sys 1.90 ( 3%) wall
8191 kB ( 0%) ggc
ipa lto cgraph merge : 1.30 ( 2%) usr 0.00 ( 0%) sys 1.29 ( 2%) wall
14989 kB ( 0%) ggc
whopr wpa : 0.91 ( 1%) usr 0.00 ( 0%) sys 0.88 ( 1%) wall
2 kB ( 0%) ggc
whopr partitioning : 2.66 ( 4%) usr 0.00 ( 0%) sys 2.67 ( 4%) wall
6081 kB ( 0%) ggc
ipa reference : 1.38 ( 2%) usr 0.01 ( 1%) sys 1.40 ( 2%) wall
0 kB ( 0%) ggc
ipa profile : 0.21 ( 0%) usr 0.01 ( 1%) sys 0.21 ( 0%) wall
0 kB ( 0%) ggc
ipa pure const : 1.61 ( 3%) usr 0.01 ( 1%) sys 1.61 ( 3%) wall
0 kB ( 0%) ggc
ipa icf : 4.99 ( 8%) usr 0.06 ( 4%) sys 5.00 ( 8%) wall
1120 kB ( 0%) ggc
tree SSA rewrite : 0.12 ( 0%) usr 0.02 ( 1%) sys 0.12 ( 0%) wall
23170 kB ( 0%) ggc
tree SSA incremental : 0.23 ( 0%) usr 0.05 ( 3%) sys 0.21 ( 0%) wall
14434 kB ( 0%) ggc
tree operand scan : 0.14 ( 0%) usr 0.03 ( 2%) sys 0.22 ( 0%) wall
145252 kB ( 3%) ggc
dominance frontiers : 0.04 ( 0%) usr 0.00 ( 0%) sys 0.01 ( 0%) wall
0 kB ( 0%) ggc
dominance computation : 0.14 ( 0%) usr 0.05 ( 3%) sys 0.11 ( 0%) wall
0 kB ( 0%) ggc
varconst : 0.01 ( 0%) usr 0.02 ( 1%) sys 0.03 ( 0%) wall
0 kB ( 0%) ggc
loop fini : 0.07 ( 0%) usr 0.00 ( 0%) sys 0.03 ( 0%) wall
0 kB ( 0%) ggc
unaccounted todo : 0.62 ( 1%) usr 0.00 ( 0%) sys 0.65 ( 1%) wall
0 kB ( 0%) ggc
TOTAL : 60.82 1.53 62.34
4917531 kB
[ perf record: Woken up 59 times to write data ]
[ perf record: Captured and wrote 14.722 MB perf.data (~643202 samples) ]
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library>
perf report
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library>
gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/home/marxin/Programming/bin/gcc2/lib/gcc/x86_64-unknown-linux-gnu/5.0.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-libsanitizer
--prefix=/home/marxin/Programming/bin/gcc2 --disable-bootstrap
--enable-checking=release
Thread model: posix
gcc version 5.0.0 20150218 (experimental) (GCC)
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library>
perf report
marxin@marxinbox:~/Programming/gecko-dev/obj-x86_64-unknown-linux-gnu/toolkit/library>
perf report --stdio | sed 's/\ *$//' | head -n50
# To display the perf.data header info, please use --header/--header-only
options.
#
# Samples: 245K of event 'cycles'
# Event count (approx.): 216467422123
#
# Overhead Command Shared Object
# ........ ........ .................
..................................................................................................................................................................................................................................................................................................
#
4.97% lto1-wpa lto1 [.] inflate_fast
2.78% lto1-wpa lto1 [.]
symbol_table::remove_unreachable_nodes(_IO_FILE*)
2.37% lto1-wpa libc-2.19.so [.] _int_malloc
1.77% lto1-wpa lto1 [.]
record_target_from_binfo(vec<cgraph_node*, va_heap, vl_ptr>&, vec<tree_node*,
va_heap, vl_ptr>*, tree_node*, tree_node*, vec<tree_node*, va_heap, vl_ptr>&,
long, tree_node*, long, hash_set<tree_node*, default_hashset_traits>*,
hash_set<tree_node*, default_hashset_traits>*, bool, bool*)
1.57% lto1-wpa lto1 [.] ht_lookup_with_hash(ht*, unsigned
char const*, unsigned long, unsigned int, ht_lookup_option)
1.56% lto1-wpa lto1 [.]
streamer_read_uhwi(lto_input_block*)
1.48% lto1-wpa lto1 [.]
estimate_calls_size_and_time(cgraph_node*, int*, int*, int*, int*, unsigned
int, vec<tree_node*, va_heap, vl_ptr>, vec<ipa_polymorphic_call_context,
va_heap, vl_ptr>, vec<ipa_agg_jump_function*, va_heap, vl_ptr>) [clone
.isra.129]
1.48% lto1-wpa lto1 [.] unify_scc(streamer_tree_cache_d*,
unsigned int, unsigned int, unsigned int, unsigned int)
1.40% lto1-wpa lto1 [.]
lto_cgraph_replace_node(cgraph_node*, cgraph_node*)
1.38% lto1-wpa lto1 [.] ggc_set_mark(void const*)
1.30% lto1-wpa libc-2.19.so [.] malloc_consolidate
1.28% lto1-wpa lto1 [.] htab_hash_string
1.25% lto1-wpa lto1 [.] compare_tree_sccs_1(tree_node*,
tree_node*, tree_node***)
1.23% lto1-wpa lto1 [.] fibonacci_heap<sreal,
cgraph_edge>::consolidate()
1.19% lto1-wpa lto1 [.] splay_tree_splay
1.15% lto1-wpa lto1 [.] can_inline_edge_p(cgraph_edge*,
bool, bool)
1.14% lto1-wpa lto1 [.] cgraph_node::get_availability()
1.14% lto1-wpa lto1 [.]
evaluate_properties_for_edge(cgraph_edge*, bool, unsigned int*, vec<tree_node*,
va_heap, vl_ptr>*, vec<ipa_polymorphic_call_context, va_heap, vl_ptr>*,
vec<ipa_agg_jump_function*, va_heap, vl_ptr>*) [clone .constprop.131]
1.13% lto1-wpa lto1 [.]
gimple_get_virt_method_for_vtable(long, tree_node*, unsigned long, bool*)
1.10% lto1-wpa lto1 [.] types_same_for_odr(tree_node
const*, tree_node const*)
1.08% lto1-wpa lto1 [.] gt_ggc_mx_lang_tree_node(void*)
1.05% lto1-wpa lto1 [.]
streamer_read_tree_bitfields(lto_input_block*, data_in*, tree_node*)
0.99% lto1-wpa lto1 [.]
type_in_anonymous_namespace_p(tree_node const*)
0.99% lto1-wpa lto1 [.] gimple_has_body_p(tree_node*)
0.95% lto1-wpa lto1 [.] decl_assembler_name(tree_node*)
0.93% lto1-wpa lto1 [.] do_per_function(void
(*)(function*, void*), void*)
0.82% lto1-wpa libc-2.19.so [.] _int_free
0.81% lto1-wpa lto1 [.]
possible_polymorphic_call_targets_1(vec<cgraph_node*, va_heap, vl_ptr>&,
hash_set<tree_node*, default_hashset_traits>*, hash_set<tree_node*,
default_hashset_traits>*, tree_node*, odr_type_d*, long, tree_node*, long,
bool*, vec<tree_node*, va_heap, vl_ptr>&, bool)
0.81% lto1-wpa lto1 [.] searchc(searchc_env*,
cgraph_node*, bool (*)(cgraph_edge*))
0.80% lto1-wpa lto1 [.]
streamer_get_pickled_tree(lto_input_block*, data_in*)
0.78% lto1-wpa lto1 [.] edge_badness(cgraph_edge*, bool)
0.77% lto1-wpa lto1 [.] hash_table<asmname_hasher,
xcallocator, true>::find_slot_with_hash(tree_node const* const&, unsigned int,
insert_option)
0.77% lto1-wpa lto1 [.]
update_callee_keys(fibonacci_heap<sreal, cgraph_edge>*, cgraph_node*,
bitmap_head*)
0.76% lto1-wpa lto1 [.] ggc_internal_alloc(unsigned long,
void (*)(void*), unsigned long, unsigned long)
0.75% lto1-wpa lto1 [.] fibonacci_heap<sreal,
cgraph_edge>::extract_minimum_node()
0.75% lto1-wpa lto1 [.] execute_one_pass(opt_pass*)
0.74% lto1-wpa lto1 [.] inflate
0.71% lto1-wpa lto1 [.]
contains_polymorphic_type_p(tree_node const*)
0.67% lto1-wpa lto1 [.] get_binfo_at_offset(tree_node*,
long, tree_node*)
0.64% lto1-wpa lto1 [.]
symbol_table::decl_assembler_name_equal(tree_node*, tree_node const*)
0.61% lto1-wpa lto1 [.] lto_balanced_map(int)
0.61% lto1-wpa lto1 [.]
ipa_icf::sem_item_optimizer::do_congruence_step_for_index(ipa_icf::congruence_class*,
unsigned int)