On December 15, 2018 10:17:43 PM GMT+01:00, Jan Hubicka <hubi...@ucw.cz> wrote:
>Hi,
>this patch is motivated by finding that Firefox ipa summaries stream-in
>is fully dominated by streaming edge summaries that are not necessary
>because they lead to external calls or are from comdat functions that
>are unused.
>
>Reading them, populating hashes and later removing them once
>streaming is finished increases memory use by about 1GB.  This patch
>simply adds logic to notice such unnecessary summaries early and not
>put them into memory.  This includes ADDR_EXPRs, which seem to be the
>most common constant values and are easy to rebuild.
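
For readers following along: the write side of the patch packs an "is
this an ADDR_EXPR constant" flag into the low bit of the streamed
jump-function type tag, so the common case costs no extra stream word.
A minimal self-contained sketch of just that encoding (plain C++ with
made-up names, not the GCC streamer API):

#include <cassert>
#include <cstdint>

enum jf_type { JF_UNKNOWN, JF_CONST, JF_PASS_THROUGH };

/* Pack the flag into the low bit; the tag moves to the remaining bits.  */
static uint32_t
encode_tag (jf_type type, bool addr_expr_p)
{
  return (uint32_t) type * 2 + (addr_expr_p ? 1 : 0);
}

/* Inverse: peel the low bit off first, then recover the tag.  */
static jf_type
decode_tag (uint32_t val, bool *addr_expr_p)
{
  *addr_expr_p = val & 1;
  return (jf_type) (val / 2);
}

int
main ()
{
  bool flag;
  assert (decode_tag (encode_tag (JF_CONST, true), &flag) == JF_CONST
          && flag);
  assert (decode_tag (encode_tag (JF_PASS_THROUGH, false), &flag)
          == JF_PASS_THROUGH && !flag);
}
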
Don't you want to go the full way decomposing it to sym + offset?  (A
rough sketch of what I mean follows the quoted patch below.)

>Our streams are not organized in a way that would make it easy to skip
>something, so I opted to simply "fast forward" reading the dead data but
>not producing data structures.  We may think of a more systematic solution
>next stage1, but there are only two places where this matters - ipa-prop
>and ipa-fnsummary, which I will treat next.
>
>Bootstrapped/regtested x86_64-linux, will commit it shortly.
>
>Honza
>
>	* cgraph.h (cgraph_node): Add predicate prevailing_p.
>	(cgraph_edge): Add predicate possibly_call_in_translation_unit_p.
>	* ipa-prop.c (ipa_write_jump_function): Optimize streaming of
>	ADDR_EXPR.
>	(ipa_read_jump_function): Add prevails parameter; optimize streaming.
>	(ipa_read_edge_info): Break out from ...
>	(ipa_read_node_info): ... here; optimize streaming.
>	* cgraph.c (cgraph_edge::possibly_call_in_translation_unit_p): New
>	predicate.
>
>Index: cgraph.h
>===================================================================
>--- cgraph.h	(revision 267122)
>+++ cgraph.h	(working copy)
>@@ -308,6 +308,10 @@ public:
>   /* Return availability of NODE when referenced from REF.  */
>   enum availability get_availability (symtab_node *ref = NULL);
> 
>+  /* During LTO stream-in this predicate can be used to check whether node
>+     in question prevails in the linking to save some memory usage.  */
>+  bool prevailing_p (void);
>+
>   /* Return true if NODE binds to current definition in final executable
>      when referenced from REF.  If REF is NULL return conservative value
>      for any reference.  */
>@@ -1730,6 +1734,10 @@ struct GTY((chain_next ("%h.next_caller"
>      after passes that don't update the cgraph.  */
>   static void rebuild_references (void);
> 
>+  /* During LTO stream in this can be used to check whether call can possibly
>+     be internal to the current translation unit.  */
>+  bool possibly_call_in_translation_unit_p (void);
>+
>   /* Expected number of executions: calculated in profile.c.  */
>   profile_count count;
>   cgraph_node *caller;
>@@ -3357,6 +3365,15 @@ xstrdup_for_dump (const char *transient_
>   return ggc_strdup (transient_str);
> }
> 
>+/* During LTO stream-in this predicate can be used to check whether node
>+   in question prevails in the linking to save some memory usage.  */
>+inline bool
>+symtab_node::prevailing_p (void)
>+{
>+  return definition && ((!TREE_PUBLIC (decl) && !DECL_EXTERNAL (decl))
>+			|| previous_sharing_asm_name == NULL);
>+}
>+
> extern GTY(()) symbol_table *saved_symtab;
> 
> #if CHECKING_P
>Index: ipa-prop.c
>===================================================================
>--- ipa-prop.c	(revision 267122)
>+++ ipa-prop.c	(working copy)
>@@ -4053,8 +4053,15 @@ ipa_write_jump_function (struct output_b
>   struct ipa_agg_jf_item *item;
>   struct bitpack_d bp;
>   int i, count;
>+  int flag = 0;
> 
>-  streamer_write_uhwi (ob, jump_func->type);
>+  /* ADDR_EXPRs are very common IP invariants; save some streamer data
>+     as well as WPA memory by handling them specially.  */
>+  if (jump_func->type == IPA_JF_CONST
>+      && TREE_CODE (jump_func->value.constant.value) == ADDR_EXPR)
>+    flag = 1;
>+
>+  streamer_write_uhwi (ob, jump_func->type * 2 + flag);
>   switch (jump_func->type)
>     {
>     case IPA_JF_UNKNOWN:
>@@ -4062,7 +4069,10 @@ ipa_write_jump_function (struct output_b
>     case IPA_JF_CONST:
>       gcc_assert (
>	  EXPR_LOCATION (jump_func->value.constant.value) == UNKNOWN_LOCATION);
>-      stream_write_tree (ob, jump_func->value.constant.value, true);
>+      stream_write_tree (ob,
>+			 flag
>+			 ? TREE_OPERAND (jump_func->value.constant.value, 0)
>+			 : jump_func->value.constant.value, true);
>       break;
>     case IPA_JF_PASS_THROUGH:
>       streamer_write_uhwi (ob, jump_func->value.pass_through.operation);
>@@ -4131,20 +4141,28 @@ static void
> ipa_read_jump_function (struct lto_input_block *ib,
>			 struct ipa_jump_func *jump_func,
>			 struct cgraph_edge *cs,
>-			 struct data_in *data_in)
>+			 struct data_in *data_in,
>+			 bool prevails)
> {
>   enum jump_func_type jftype;
>   enum tree_code operation;
>   int i, count;
>+  int val = streamer_read_uhwi (ib);
>+  bool flag = val & 1;
> 
>-  jftype = (enum jump_func_type) streamer_read_uhwi (ib);
>+  jftype = (enum jump_func_type) (val / 2);
>   switch (jftype)
>     {
>     case IPA_JF_UNKNOWN:
>       ipa_set_jf_unknown (jump_func);
>       break;
>     case IPA_JF_CONST:
>-      ipa_set_jf_constant (jump_func, stream_read_tree (ib, data_in), cs);
>+      {
>+	tree t = stream_read_tree (ib, data_in);
>+	if (flag && prevails)
>+	  t = build_fold_addr_expr (t);
>+	ipa_set_jf_constant (jump_func, t, cs);
>+      }
>       break;
>     case IPA_JF_PASS_THROUGH:
>       operation = (enum tree_code) streamer_read_uhwi (ib);
>@@ -4177,10 +4195,13 @@ ipa_read_jump_function (struct lto_input
>	ipa_set_ancestor_jf (jump_func, offset, formal_id, agg_preserved);
>	break;
>       }
>+    default:
>+      fatal_error (UNKNOWN_LOCATION, "invalid jump function in LTO stream");
>     }
> 
>   count = streamer_read_uhwi (ib);
>-  vec_alloc (jump_func->agg.items, count);
>+  if (prevails)
>+    vec_alloc (jump_func->agg.items, count);
>   if (count)
>     {
>       struct bitpack_d bp = streamer_read_bitpack (ib);
>@@ -4191,7 +4212,8 @@ ipa_read_jump_function (struct lto_input
>       struct ipa_agg_jf_item item;
>       item.offset = streamer_read_uhwi (ib);
>       item.value = stream_read_tree (ib, data_in);
>-      jump_func->agg.items->quick_push (item);
>+      if (prevails)
>+	jump_func->agg.items->quick_push (item);
>     }
> 
>   struct bitpack_d bp = streamer_read_bitpack (ib);
>@@ -4200,7 +4222,8 @@ ipa_read_jump_function (struct lto_input
>     {
>       widest_int value = streamer_read_widest_int (ib);
>       widest_int mask = streamer_read_widest_int (ib);
>-      ipa_set_jfunc_bits (jump_func, value, mask);
>+      if (prevails)
>+	ipa_set_jfunc_bits (jump_func, value, mask);
>     }
>   else
>     jump_func->bits = NULL;
>@@ -4213,7 +4236,8 @@ ipa_read_jump_function (struct lto_input
>				       VR_LAST);
>       tree min = stream_read_tree (ib, data_in);
>       tree max = stream_read_tree (ib, data_in);
>-      ipa_set_jfunc_vr (jump_func, type, min, max);
>+      if (prevails)
>+	ipa_set_jfunc_vr (jump_func, type, min, max);
>     }
>   else
>     jump_func->m_vr = NULL;
>@@ -4345,24 +4369,48 @@ ipa_write_node_info (struct output_block
>     }
> }
> 
>-/* If jump functions points to node we possibly can propagate into.
>-   At this moment symbol table is still not merged, but the prevailing
>-   symbol is always first in the list.  */
>+/* Stream in edge E from IB.  */
> 
>-static bool
>-jump_function_useful_p (symtab_node *node)
>+static void
>+ipa_read_edge_info (struct lto_input_block *ib,
>+		    struct data_in *data_in,
>+		    struct cgraph_edge *e, bool prevails)
> {
>-  /* While incremental linking we may end up getting function body later.  */
>-  if (flag_incremental_link == INCREMENTAL_LINK_LTO)
>-    return true;
>-  if (!TREE_PUBLIC (node->decl) && !DECL_EXTERNAL (node->decl))
>-    return true;
>-  for (int n = 10; node->previous_sharing_asm_name && n; n--)
>-    node = node->previous_sharing_asm_name;
>-  if (node->previous_sharing_asm_name)
>-    node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (node->decl));
>-  gcc_assert (TREE_PUBLIC (node->decl));
>-  return node->definition;
>+  int count = streamer_read_uhwi (ib);
>+  bool contexts_computed = count & 1;
>+
>+  count /= 2;
>+  if (!count)
>+    return;
>+  if (prevails && e->possibly_call_in_translation_unit_p ())
>+    {
>+      struct ipa_edge_args *args = IPA_EDGE_REF (e);
>+      vec_safe_grow_cleared (args->jump_functions, count);
>+      if (contexts_computed)
>+	vec_safe_grow_cleared (args->polymorphic_call_contexts, count);
>+      for (int k = 0; k < count; k++)
>+	{
>+	  ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e,
>+				  data_in, prevails);
>+	  if (contexts_computed)
>+	    ipa_get_ith_polymorhic_call_context (args, k)->stream_in
>+	      (ib, data_in);
>+	}
>+    }
>+  else
>+    {
>+      for (int k = 0; k < count; k++)
>+	{
>+	  struct ipa_jump_func dummy;
>+	  ipa_read_jump_function (ib, &dummy, e,
>+				  data_in, prevails);
>+	  if (contexts_computed)
>+	    {
>+	      struct ipa_polymorphic_call_context ctx;
>+	      ctx.stream_in (ib, data_in);
>+	    }
>+	}
>+    }
> }
> 
> /* Stream in NODE info from IB.  */
>@@ -4371,82 +4419,50 @@ static void
> ipa_read_node_info (struct lto_input_block *ib, struct cgraph_node *node,
>		     struct data_in *data_in)
> {
>-  struct ipa_node_params *info = IPA_NODE_REF (node);
>   int k;
>   struct cgraph_edge *e;
>   struct bitpack_d bp;
>+  bool prevails = node->prevailing_p ();
>+  struct ipa_node_params *info = prevails ? IPA_NODE_REF (node) : NULL;
> 
>-  ipa_alloc_node_params (node, streamer_read_uhwi (ib));
>-
>-  for (k = 0; k < ipa_get_param_count (info); k++)
>-    (*info->descriptors)[k].move_cost = streamer_read_uhwi (ib);
>+  int param_count = streamer_read_uhwi (ib);
>+  if (prevails)
>+    {
>+      ipa_alloc_node_params (node, param_count);
>+      for (k = 0; k < param_count; k++)
>+	(*info->descriptors)[k].move_cost = streamer_read_uhwi (ib);
>+      if (ipa_get_param_count (info) != 0)
>+	info->analysis_done = true;
>+      info->node_enqueued = false;
>+    }
>+  else
>+    for (k = 0; k < param_count; k++)
>+      streamer_read_uhwi (ib);
> 
>   bp = streamer_read_bitpack (ib);
>-  if (ipa_get_param_count (info) != 0)
>-    info->analysis_done = true;
>-  info->node_enqueued = false;
>-  for (k = 0; k < ipa_get_param_count (info); k++)
>-    ipa_set_param_used (info, k, bp_unpack_value (&bp, 1));
>-  for (k = 0; k < ipa_get_param_count (info); k++)
>+  for (k = 0; k < param_count; k++)
>     {
>-      ipa_set_controlled_uses (info, k, streamer_read_hwi (ib));
>-      (*info->descriptors)[k].decl_or_type = stream_read_tree (ib, data_in);
>+      bool used = bp_unpack_value (&bp, 1);
>+
>+      if (prevails)
>+	ipa_set_param_used (info, k, used);
>     }
>-  for (e = node->callees; e; e = e->next_callee)
>+  for (k = 0; k < param_count; k++)
>     {
>-      struct ipa_edge_args *args = IPA_EDGE_REF (e);
>-      int count = streamer_read_uhwi (ib);
>-      bool contexts_computed = count & 1;
>-      count /= 2;
>-
>-      if (!count)
>-	continue;
>-      if (!jump_function_useful_p (e->callee))
>-	{
>-	  for (k = 0; k < count; k++)
>-	    {
>-	      struct ipa_jump_func dummy;
>-	      ipa_read_jump_function (ib, &dummy, e, data_in);
>-	      if (contexts_computed)
>-		{
>-		  struct ipa_polymorphic_call_context ctx;
>-		  ctx.stream_in (ib, data_in);
>-		}
>-	    }
>-	  continue;
>-	}
>-      vec_safe_grow_cleared (args->jump_functions, count);
>-      if (contexts_computed)
>-	vec_safe_grow_cleared (args->polymorphic_call_contexts, count);
>+      int nuses = streamer_read_hwi (ib);
>+      tree type = stream_read_tree (ib, data_in);
> 
>-      for (k = 0; k < ipa_get_cs_argument_count (args); k++)
>+      if (prevails)
>	{
>-	  ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e,
>-				  data_in);
>-	  if (contexts_computed)
>-	    ipa_get_ith_polymorhic_call_context (args, k)->stream_in (ib, data_in);
>+	  ipa_set_controlled_uses (info, k, nuses);
>+	  (*info->descriptors)[k].decl_or_type = type;
>	}
>     }
>+  for (e = node->callees; e; e = e->next_callee)
>+    ipa_read_edge_info (ib, data_in, e, prevails);
>   for (e = node->indirect_calls; e; e = e->next_callee)
>     {
>-      struct ipa_edge_args *args = IPA_EDGE_REF (e);
>-      int count = streamer_read_uhwi (ib);
>-      bool contexts_computed = count & 1;
>-      count /= 2;
>-
>-      if (count)
>-	{
>-	  vec_safe_grow_cleared (args->jump_functions, count);
>-	  if (contexts_computed)
>-	    vec_safe_grow_cleared (args->polymorphic_call_contexts, count);
>-	  for (k = 0; k < ipa_get_cs_argument_count (args); k++)
>-	    {
>-	      ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e,
>-				      data_in);
>-	      if (contexts_computed)
>-		ipa_get_ith_polymorhic_call_context (args, k)->stream_in (ib, data_in);
>-	    }
>-	}
>+      ipa_read_edge_info (ib, data_in, e, prevails);
>       ipa_read_indirect_edge_info (ib, data_in, e);
>     }
> }
>Index: cgraph.c
>===================================================================
>--- cgraph.c	(revision 267122)
>+++ cgraph.c	(working copy)
>@@ -3766,6 +3780,41 @@ cgraph_edge::sreal_frequency ()
>	   : caller->count);
> }
> 
>+
>+/* During LTO stream in this can be used to check whether call can possibly
>+   be internal to the current translation unit.  */
>+
>+bool
>+cgraph_edge::possibly_call_in_translation_unit_p (void)
>+{
>+  gcc_checking_assert (in_lto_p && caller->prevailing_p ());
>+
>+  /* While incremental linking we may end up getting function body later.  */
>+  if (flag_incremental_link == INCREMENTAL_LINK_LTO)
>+    return true;
>+
>+  /* We may be smarter here and avoid streaming in indirect calls we can't
>+     track, but that would require arranging to stream the indirect call
>+     summary first.  */
>+  if (!callee)
>+    return true;
>+
>+  /* If callee is local to the original translation unit, it will be
>+     defined.  */
>+  if (!TREE_PUBLIC (callee->decl) && !DECL_EXTERNAL (callee->decl))
>+    return true;
>+
>+  /* Otherwise we need to look up the prevailing symbol (symbol table is not
>+     merged yet) and see if it is a definition.  In fact we may also resolve
>+     aliases, but that is probably not too important.  */
>+  symtab_node *node = callee;
>+  for (int n = 10; node->previous_sharing_asm_name && n; n--)
>+    node = node->previous_sharing_asm_name;
>+  if (node->previous_sharing_asm_name)
>+    node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (callee->decl));
>+  gcc_assert (TREE_PUBLIC (node->decl));
>+  return node->get_availability () >= AVAIL_AVAILABLE;
>+}
>+
> /* A stashed copy of "symtab" for use by selftest::symbol_table_test.
>    This needs to be a global so that it can be a GC root, and thus
>    prevent the stashed copy from being garbage-collected if the GC runs
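
To flesh out my sym + offset question above: instead of streaming the
ADDR_EXPR operand as a full tree, the writer could decompose the address
into a symbol reference plus a constant byte offset and stream two
integers, falling back to the tree path when the offset is not constant
(on the tree side the decomposition would presumably come from something
like get_addr_base_and_unit_offset).  This is only a sketch of the
stream shape; toy_stream and the field names are invented for
illustration, not the GCC streamer API:

#include <cassert>
#include <cstdint>
#include <vector>

/* Toy word stream standing in for output_block / lto_input_block.  */
struct toy_stream
{
  std::vector<uint64_t> words;
  size_t pos = 0;
  void write (uint64_t w) { words.push_back (w); }
  uint64_t read () { return words[pos++]; }
};

/* Hypothetical decomposed form of &sym + offset.  */
struct sym_plus_offset
{
  uint64_t symbol_id;  /* Index of the referenced symbol.  */
  int64_t offset;      /* Constant byte offset into the symbol.  */
};

static void
write_sym_plus_offset (toy_stream &s, sym_plus_offset a)
{
  s.write (a.symbol_id);
  s.write ((uint64_t) a.offset);
}

static sym_plus_offset
read_sym_plus_offset (toy_stream &s)
{
  sym_plus_offset a;
  a.symbol_id = s.read ();
  a.offset = (int64_t) s.read ();
  return a;
}

int
main ()
{
  toy_stream s;
  write_sym_plus_offset (s, { 42, 16 });
  sym_plus_offset a = read_sym_plus_offset (s);
  assert (a.symbol_id == 42 && a.offset == 16);
}

The win would be both a smaller stream and no tree allocation at all on
the reader side for non-prevailing nodes.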
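
The "fast forward" path in ipa_read_edge_info is also worth spelling
out: the LTO stream has no skip records, so dead data is still fully
decoded, just into a throwaway dummy instead of allocated summaries.  A
standalone illustration of the pattern with toy types (not the real
streamer):

#include <cassert>
#include <cstdint>
#include <vector>

struct toy_stream
{
  std::vector<uint64_t> words;
  size_t pos = 0;
  uint64_t read () { return words[pos++]; }
};

struct record { uint64_t a, b; };

/* Decode one record; only materialize it when KEEP is true, mirroring
   the "prevails" parameter of ipa_read_jump_function.  */
static void
read_record (toy_stream &s, std::vector<record> *out, bool keep)
{
  record r;
  r.a = s.read ();  /* Always consume the stream...  */
  r.b = s.read ();
  if (keep)         /* ...but only sometimes keep the data.  */
    out->push_back (r);
}

int
main ()
{
  toy_stream s;
  s.words = {1, 2, 3, 4};
  std::vector<record> kept;
  read_record (s, &kept, false);  /* Fast forward over a dead record.  */
  read_record (s, &kept, true);
  assert (kept.size () == 1 && kept[0].a == 3);
}
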
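Finally, note the bounded walk in possibly_call_in_translation_unit_p:
the prevailing symbol is first in the sharing-asm-name chain, so a few
pointer hops usually suffice, and a pathologically long chain falls back
to a single hash lookup instead of an O(chain length) walk.  A toy model
of that pattern (stand-in types, not the real symtab):

#include <cassert>
#include <string>
#include <unordered_map>

/* Stand-in for symtab_node with just the fields the walk needs.  */
struct toy_node
{
  std::string asm_name;
  toy_node *previous_sharing_asm_name;
};

/* Stand-in for symtab_node::get_for_asmname: maps a name to the head
   (prevailing) node of its sharing chain.  */
static std::unordered_map<std::string, toy_node *> chain_head;

static toy_node *
find_prevailing (toy_node *node)
{
  /* Walk at most 10 links; the prevailing symbol is first in the list.  */
  for (int n = 10; node->previous_sharing_asm_name && n; n--)
    node = node->previous_sharing_asm_name;
  /* Chain was longer than 10 links: do one O(1) lookup instead.  */
  if (node->previous_sharing_asm_name)
    node = chain_head[node->asm_name];
  return node;
}

int
main ()
{
  toy_node a{"foo", nullptr}, b{"foo", &a}, c{"foo", &b};
  chain_head["foo"] = &a;
  assert (find_prevailing (&c) == &a);
}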