Hi, this patch is motivated by finding that Firefox ipa summaries stream-in is fully dominated by streaming edge summaries that are not necessary because they lead to external calls or are from comdat functions that are unused.
Reading them, populating hashes and later removing them at once streaming is finished increases memory use by about 1GB. This patch simply adds logic to notice such unnecesary summaries early and not put them to memory. This includes ADDR_EXPR that seems to be most common constant values and are easy to rebuild. Our streams are not organized in a way it would be easy to skip something so I opted to simply "fast forward" reading the dead data but not producing datastructures. We may think of more systematic solution next stage1, but there are only two places where this matters - ipa-prop and ipa-fnsummary I will treat next. Bootstrapped/regtested x86_64-linux, will commit it shortly. Honza * cgraph.h (cgraph_node): Add predicate prevailing_p. (cgraph_edge): Add predicate possible_call_in_translation_unit_p. * ipa-prop.c (ipa_write_jump_function): Optimize streaming of ADDR_EXPR. (ipa_read_jump_function): Add prevails parameter; optimize streaming. (ipa_read_edge_info): Break out from ... (ipa_read_node_info): ... here; optimize streaming. * cgraph.c (cgraph_edge::possibly_call_in_translation_unit_p): New predicate. Index: cgraph.h =================================================================== --- cgraph.h (revision 267122) +++ cgraph.h (working copy) @@ -308,6 +308,10 @@ public: /* Return availability of NODE when referenced from REF. */ enum availability get_availability (symtab_node *ref = NULL); + /* During LTO stream-in this predicate can be used to check whether node + in question prevails in the linking to save some memory usage. */ + bool prevailing_p (void); + /* Return true if NODE binds to current definition in final executable when referenced from REF. If REF is NULL return conservative value for any reference. */ @@ -1730,6 +1734,10 @@ struct GTY((chain_next ("%h.next_caller" after passes that don't update the cgraph. */ static void rebuild_references (void); + /* During LTO stream in this can be used to check whether call can possibly + be internal to the current translation unit. */ + bool possibly_call_in_translation_unit_p (void); + /* Expected number of executions: calculated in profile.c. */ profile_count count; cgraph_node *caller; @@ -3357,6 +3365,15 @@ xstrdup_for_dump (const char *transient_ return ggc_strdup (transient_str); } +/* During LTO stream-in this predicate can be used to check whether node + in question prevails in the linking to save some memory usage. */ +inline bool +symtab_node::prevailing_p (void) +{ + return definition && ((!TREE_PUBLIC (decl) && !DECL_EXTERNAL (decl)) + || previous_sharing_asm_name == NULL); +} + extern GTY(()) symbol_table *saved_symtab; #if CHECKING_P Index: ipa-prop.c =================================================================== --- ipa-prop.c (revision 267122) +++ ipa-prop.c (working copy) @@ -4053,8 +4053,15 @@ ipa_write_jump_function (struct output_b struct ipa_agg_jf_item *item; struct bitpack_d bp; int i, count; + int flag = 0; - streamer_write_uhwi (ob, jump_func->type); + /* ADDR_EXPRs are very comon IP invariants; save some streamer data + as well as WPA memory by handling them specially. */ + if (jump_func->type == IPA_JF_CONST + && TREE_CODE (jump_func->value.constant.value) == ADDR_EXPR) + flag = 1; + + streamer_write_uhwi (ob, jump_func->type * 2 + flag); switch (jump_func->type) { case IPA_JF_UNKNOWN: @@ -4062,7 +4069,10 @@ ipa_write_jump_function (struct output_b case IPA_JF_CONST: gcc_assert ( EXPR_LOCATION (jump_func->value.constant.value) == UNKNOWN_LOCATION); - stream_write_tree (ob, jump_func->value.constant.value, true); + stream_write_tree (ob, + flag + ? TREE_OPERAND (jump_func->value.constant.value, 0) + : jump_func->value.constant.value, true); break; case IPA_JF_PASS_THROUGH: streamer_write_uhwi (ob, jump_func->value.pass_through.operation); @@ -4131,20 +4141,28 @@ static void ipa_read_jump_function (struct lto_input_block *ib, struct ipa_jump_func *jump_func, struct cgraph_edge *cs, - struct data_in *data_in) + struct data_in *data_in, + bool prevails) { enum jump_func_type jftype; enum tree_code operation; int i, count; + int val = streamer_read_uhwi (ib); + bool flag = val & 1; - jftype = (enum jump_func_type) streamer_read_uhwi (ib); + jftype = (enum jump_func_type) (val / 2); switch (jftype) { case IPA_JF_UNKNOWN: ipa_set_jf_unknown (jump_func); break; case IPA_JF_CONST: - ipa_set_jf_constant (jump_func, stream_read_tree (ib, data_in), cs); + { + tree t = stream_read_tree (ib, data_in); + if (flag && prevails) + t = build_fold_addr_expr (t); + ipa_set_jf_constant (jump_func, t, cs); + } break; case IPA_JF_PASS_THROUGH: operation = (enum tree_code) streamer_read_uhwi (ib); @@ -4177,10 +4195,13 @@ ipa_read_jump_function (struct lto_input ipa_set_ancestor_jf (jump_func, offset, formal_id, agg_preserved); break; } + default: + fatal_error (UNKNOWN_LOCATION, "invalid jump function in LTO stream"); } count = streamer_read_uhwi (ib); - vec_alloc (jump_func->agg.items, count); + if (prevails) + vec_alloc (jump_func->agg.items, count); if (count) { struct bitpack_d bp = streamer_read_bitpack (ib); @@ -4191,7 +4212,8 @@ ipa_read_jump_function (struct lto_input struct ipa_agg_jf_item item; item.offset = streamer_read_uhwi (ib); item.value = stream_read_tree (ib, data_in); - jump_func->agg.items->quick_push (item); + if (prevails) + jump_func->agg.items->quick_push (item); } struct bitpack_d bp = streamer_read_bitpack (ib); @@ -4200,7 +4222,8 @@ ipa_read_jump_function (struct lto_input { widest_int value = streamer_read_widest_int (ib); widest_int mask = streamer_read_widest_int (ib); - ipa_set_jfunc_bits (jump_func, value, mask); + if (prevails) + ipa_set_jfunc_bits (jump_func, value, mask); } else jump_func->bits = NULL; @@ -4213,7 +4236,8 @@ ipa_read_jump_function (struct lto_input VR_LAST); tree min = stream_read_tree (ib, data_in); tree max = stream_read_tree (ib, data_in); - ipa_set_jfunc_vr (jump_func, type, min, max); + if (prevails) + ipa_set_jfunc_vr (jump_func, type, min, max); } else jump_func->m_vr = NULL; @@ -4345,24 +4369,48 @@ ipa_write_node_info (struct output_block } } -/* If jump functions points to node we possibly can propagate into. - At this moment symbol table is still not merged, but the prevailing - symbol is always first in the list. */ +/* Stream in edge E from IB. */ -static bool -jump_function_useful_p (symtab_node *node) +static void +ipa_read_edge_info (struct lto_input_block *ib, + struct data_in *data_in, + struct cgraph_edge *e, bool prevails) { - /* While incremental linking we may end up getting function body later. */ - if (flag_incremental_link == INCREMENTAL_LINK_LTO) - return true; - if (!TREE_PUBLIC (node->decl) && !DECL_EXTERNAL (node->decl)) - return true; - for (int n = 10; node->previous_sharing_asm_name && n ; n--) - node = node->previous_sharing_asm_name; - if (node->previous_sharing_asm_name) - node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (node->decl)); - gcc_assert (TREE_PUBLIC (node->decl)); - return node->definition; + int count = streamer_read_uhwi (ib); + bool contexts_computed = count & 1; + + count /= 2; + if (!count) + return; + if (prevails && e->possibly_call_in_translation_unit_p ()) + { + struct ipa_edge_args *args = IPA_EDGE_REF (e); + vec_safe_grow_cleared (args->jump_functions, count); + if (contexts_computed) + vec_safe_grow_cleared (args->polymorphic_call_contexts, count); + for (int k = 0; k < count; k++) + { + ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e, + data_in, prevails); + if (contexts_computed) + ipa_get_ith_polymorhic_call_context (args, k)->stream_in + (ib, data_in); + } + } + else + { + for (int k = 0; k < count; k++) + { + struct ipa_jump_func dummy; + ipa_read_jump_function (ib, &dummy, e, + data_in, prevails); + if (contexts_computed) + { + struct ipa_polymorphic_call_context ctx; + ctx.stream_in (ib, data_in); + } + } + } } /* Stream in NODE info from IB. */ @@ -4371,82 +4419,50 @@ static void ipa_read_node_info (struct lto_input_block *ib, struct cgraph_node *node, struct data_in *data_in) { - struct ipa_node_params *info = IPA_NODE_REF (node); int k; struct cgraph_edge *e; struct bitpack_d bp; + bool prevails = node->prevailing_p (); + struct ipa_node_params *info = prevails ? IPA_NODE_REF (node) : NULL; - ipa_alloc_node_params (node, streamer_read_uhwi (ib)); - - for (k = 0; k < ipa_get_param_count (info); k++) - (*info->descriptors)[k].move_cost = streamer_read_uhwi (ib); + int param_count = streamer_read_uhwi (ib); + if (prevails) + { + ipa_alloc_node_params (node, param_count); + for (k = 0; k < param_count; k++) + (*info->descriptors)[k].move_cost = streamer_read_uhwi (ib); + if (ipa_get_param_count (info) != 0) + info->analysis_done = true; + info->node_enqueued = false; + } + else + for (k = 0; k < param_count; k++) + streamer_read_uhwi (ib); bp = streamer_read_bitpack (ib); - if (ipa_get_param_count (info) != 0) - info->analysis_done = true; - info->node_enqueued = false; - for (k = 0; k < ipa_get_param_count (info); k++) - ipa_set_param_used (info, k, bp_unpack_value (&bp, 1)); - for (k = 0; k < ipa_get_param_count (info); k++) + for (k = 0; k < param_count; k++) { - ipa_set_controlled_uses (info, k, streamer_read_hwi (ib)); - (*info->descriptors)[k].decl_or_type = stream_read_tree (ib, data_in); + bool used = bp_unpack_value (&bp, 1); + + if (prevails) + ipa_set_param_used (info, k, used); } - for (e = node->callees; e; e = e->next_callee) + for (k = 0; k < param_count; k++) { - struct ipa_edge_args *args = IPA_EDGE_REF (e); - int count = streamer_read_uhwi (ib); - bool contexts_computed = count & 1; - count /= 2; - - if (!count) - continue; - if (!jump_function_useful_p (e->callee)) - { - for (k = 0; k < count; k++) - { - struct ipa_jump_func dummy; - ipa_read_jump_function (ib, &dummy, e, data_in); - if (contexts_computed) - { - struct ipa_polymorphic_call_context ctx; - ctx.stream_in (ib, data_in); - } - } - continue; - } - vec_safe_grow_cleared (args->jump_functions, count); - if (contexts_computed) - vec_safe_grow_cleared (args->polymorphic_call_contexts, count); + int nuses = streamer_read_hwi (ib); + tree type = stream_read_tree (ib, data_in); - for (k = 0; k < ipa_get_cs_argument_count (args); k++) + if (prevails) { - ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e, - data_in); - if (contexts_computed) - ipa_get_ith_polymorhic_call_context (args, k)->stream_in (ib, data_in); + ipa_set_controlled_uses (info, k, nuses); + (*info->descriptors)[k].decl_or_type = type; } } + for (e = node->callees; e; e = e->next_callee) + ipa_read_edge_info (ib, data_in, e, prevails); for (e = node->indirect_calls; e; e = e->next_callee) { - struct ipa_edge_args *args = IPA_EDGE_REF (e); - int count = streamer_read_uhwi (ib); - bool contexts_computed = count & 1; - count /= 2; - - if (count) - { - vec_safe_grow_cleared (args->jump_functions, count); - if (contexts_computed) - vec_safe_grow_cleared (args->polymorphic_call_contexts, count); - for (k = 0; k < ipa_get_cs_argument_count (args); k++) - { - ipa_read_jump_function (ib, ipa_get_ith_jump_func (args, k), e, - data_in); - if (contexts_computed) - ipa_get_ith_polymorhic_call_context (args, k)->stream_in (ib, data_in); - } - } + ipa_read_edge_info (ib, data_in, e, prevails); ipa_read_indirect_edge_info (ib, data_in, e); } } Index: cgraph.c =================================================================== --- cgraph.c (revision 267122) +++ cgraph.c (working copy) @@ -3766,6 +3780,41 @@ cgraph_edge::sreal_frequency () : caller->count); } + +/* During LTO stream in this can be used to check whether call can possibly + be internal to the current translation unit. */ + +bool +cgraph_edge::possibly_call_in_translation_unit_p (void) +{ + gcc_checking_assert (in_lto_p && caller->prevailing_p ()); + + /* While incremental linking we may end up getting function body later. */ + if (flag_incremental_link == INCREMENTAL_LINK_LTO) + return true; + + /* We may be smarter here and avoid stremaing in indirect calls we can't + track, but that would require arranging stremaing the indirect call + summary first. */ + if (!callee) + return true; + + /* If calle is local to the original translation unit, it will be defined. */ + if (!TREE_PUBLIC (callee->decl) && !DECL_EXTERNAL (callee->decl)) + return true; + + /* Otherwise we need to lookup prevailing symbol (symbol table is not merged, + yet) and see if it is a definition. In fact we may also resolve aliases, + but that is probably not too important. */ + symtab_node *node = callee; + for (int n = 10; node->previous_sharing_asm_name && n ; n--) + node = node->previous_sharing_asm_name; + if (node->previous_sharing_asm_name) + node = symtab_node::get_for_asmname (DECL_ASSEMBLER_NAME (callee->decl)); + gcc_assert (TREE_PUBLIC (node->decl)); + return node->get_availability () >= AVAIL_AVAILABLE; +} + /* A stashed copy of "symtab" for use by selftest::symbol_table_test. This needs to be a global so that it can be a GC root, and thus prevent the stashed copy from being garbage-collected if the GC runs