Hi, this patch treads ages old problem that compile time needed to estimate call size is not constant, but a function of a number of calls of the callee. If there is a function with many callers and many callees this triggers quadratic behaviour.
This patch adds summary info for all callees of a node which is of same format as size_time_table used to account normal code. It is a different table since, unlike normal code, call statemetns are changing thorough the IPA ooptimization queue. In a followup patch I will add incremental update of this summary which will solve the second traditional quadratic bottleneck of greedy inliner (with inlinining very many calls into large function). This has became problem for compiling cc1 itself because with enablement of -finline-functions at -O2 we end up inlining a lot into the auto-generated insn-* pattern matchers (since a lot of predicates are small). Thus WPA inlining time regressed from 6s in gcc 9 to 65s in trunk two weeks back. The memory use of new tables is 64bytes per function summary and at most one entry in the allocated vector per call (whole point of the change is to glob calls with same predicates together and also cap number of predicates we care about). Overal 10MB for cc1 (out of 900MB we need at WPA time). It would be possible to merge both size_time_table and call_size_time_table to one saving some of overhead. This would however make it impossible to recalculate the data and do some sanity checking and I am affraid of making that very hard to maintain The profile of WPA cc1 shows: - 47.85% 0.30% lto1-wpa lto1 [.] inline_small_functions - 47.55% inline_small_functions - 24.20% update_callee_keys - 6.95% can_inline_edge_p 1.32% sanitize_flags_p + 1.32% is_tm_pure - 4.34% update_edge_key 3.48% edge_badness 4.24% want_inline_small_function_p - 3.51% can_inline_edge_by_limits_p 0.61% estimate_size_after_inlining 0.89% cgraph_node::get_availability - 13.44% inline_call + 10.57% ipa_update_overall_fn_summary + 1.23% clone_inlined_nodes 1.06% ipa_merge_fn_summary_after_inlining - 5.27% update_caller_keys + 4.46% want_inline_small_function_p 0.62% can_inline_edge_p 1.34% fibonacci_heap<sreal, cgraph_edge>::extract_minimum_node + 0.83% want_inline_small_function_p + 0.73% estimate_growth ipa_update_overall_fn_summary is the nonlinearity of updating summaries after inlining (each inline update is function of size of the inline tree of caller). Honza * ipa-fnsummary.c (ipa_fn_summary::account_size_time): Add CALL parameter and update call_size_time_table. (ipa_fn_summary::max_size_time_table_size): New constant. (estimate_calls_size_and_time_1): Break out from ... (estimate_calls_size_and_time): ... here; implement summary production. (summarize_calls_size_and_time): New function. (ipa_call_context::estimate_size_and_time): Bypass estimate_calls_size_and_time for leaf functions. (ipa_update_overall_fn_summary): Likewise. * ipa-fnsummary.h (call_size_time_table): New. (ipa_fn_summary::account_size_time): Update prototype. Index: ipa-fnsummary.c =================================================================== --- ipa-fnsummary.c (revision 278496) +++ ipa-fnsummary.c (working copy) @@ -146,17 +147,22 @@ ipa_dump_hints (FILE *f, ipa_hints hints /* Record SIZE and TIME to SUMMARY. The accounted code will be executed when EXEC_PRED is true. When NONCONST_PRED is false the code will evaulate to constant and - will get optimized out in specialized clones of the function. */ + will get optimized out in specialized clones of the function. + If CALL is true account to call_size_time_table rather than + size_time_table. */ void ipa_fn_summary::account_size_time (int size, sreal time, const predicate &exec_pred, - const predicate &nonconst_pred_in) + const predicate &nonconst_pred_in, + bool call) { size_time_entry *e; bool found = false; int i; predicate nonconst_pred; + vec<size_time_entry, va_gc> *table = call + ? call_size_time_table : size_time_table; if (exec_pred == false) return; @@ -168,23 +174,23 @@ ipa_fn_summary::account_size_time (int s /* We need to create initial empty unconitional clause, but otherwie we don't need to account empty times and sizes. */ - if (!size && time == 0 && size_time_table) + if (!size && time == 0 && table) return; gcc_assert (time >= 0); - for (i = 0; vec_safe_iterate (size_time_table, i, &e); i++) + for (i = 0; vec_safe_iterate (table, i, &e); i++) if (e->exec_predicate == exec_pred && e->nonconst_predicate == nonconst_pred) { found = true; break; } - if (i == 256) + if (i == max_size_time_table_size) { i = 0; found = true; - e = &(*size_time_table)[0]; + e = &(*table)[0]; if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "\t\tReached limit on number of entries, " @@ -212,7 +218,10 @@ ipa_fn_summary::account_size_time (int s new_entry.time = time; new_entry.exec_predicate = exec_pred; new_entry.nonconst_predicate = nonconst_pred; - vec_safe_push (size_time_table, new_entry); + if (call) + vec_safe_push (call_size_time_table, new_entry); + else + vec_safe_push (size_time_table, new_entry); } else { @@ -642,6 +651,7 @@ ipa_fn_summary::~ipa_fn_summary () edge_predicate_pool.remove (loop_stride); vec_free (conds); vec_free (size_time_table); + vec_free (call_size_time_table); } void @@ -2973,19 +2983,21 @@ estimate_edge_size_and_time (struct cgra } - /* Increase SIZE, MIN_SIZE and TIME for size and time needed to handle all calls in NODE. POSSIBLE_TRUTHS, KNOWN_VALS, KNOWN_AGGS and KNOWN_CONTEXTS - describe context of the call site. */ + describe context of the call site. + + Helper for estimate_calls_size_and_time which does the same but + (in most cases) faster. */ static void -estimate_calls_size_and_time (struct cgraph_node *node, int *size, - int *min_size, sreal *time, - ipa_hints *hints, - clause_t possible_truths, - vec<tree> known_vals, - vec<ipa_polymorphic_call_context> known_contexts, - vec<ipa_agg_value_set> known_aggs) +estimate_calls_size_and_time_1 (struct cgraph_node *node, int *size, + int *min_size, sreal *time, + ipa_hints *hints, + clause_t possible_truths, + vec<tree> known_vals, + vec<ipa_polymorphic_call_context> known_contexts, + vec<ipa_agg_value_set> known_aggs) { struct cgraph_edge *e; for (e = node->callees; e; e = e->next_callee) @@ -2993,11 +3005,11 @@ estimate_calls_size_and_time (struct cgr if (!e->inline_failed) { gcc_checking_assert (!ipa_call_summaries->get (e)); - estimate_calls_size_and_time (e->callee, size, min_size, time, - hints, - possible_truths, - known_vals, known_contexts, - known_aggs); + estimate_calls_size_and_time_1 (e->callee, size, min_size, time, + hints, + possible_truths, + known_vals, known_contexts, + known_aggs); continue; } class ipa_call_summary *es = ipa_call_summaries->get (e); @@ -3033,6 +3045,157 @@ estimate_calls_size_and_time (struct cgr } } +/* Populate sum->call_size_time_table for edges from NODE. */ + +static void +summarize_calls_size_and_time (struct cgraph_node *node, + ipa_fn_summary *sum) +{ + struct cgraph_edge *e; + for (e = node->callees; e; e = e->next_callee) + { + if (!e->inline_failed) + { + gcc_checking_assert (!ipa_call_summaries->get (e)); + summarize_calls_size_and_time (e->callee, sum); + continue; + } + int size = 0; + sreal time = 0; + + estimate_edge_size_and_time (e, &size, NULL, &time, + vNULL, vNULL, vNULL, NULL); + + struct predicate pred = true; + class ipa_call_summary *es = ipa_call_summaries->get (e); + + if (es->predicate) + pred = *es->predicate; + sum->account_size_time (size, time, pred, pred, true); + } + for (e = node->indirect_calls; e; e = e->next_callee) + { + int size = 0; + sreal time = 0; + + estimate_edge_size_and_time (e, &size, NULL, &time, + vNULL, vNULL, vNULL, NULL); + struct predicate pred = true; + class ipa_call_summary *es = ipa_call_summaries->get (e); + + if (es->predicate) + pred = *es->predicate; + sum->account_size_time (size, time, pred, pred, true); + } +} + +/* Increase SIZE, MIN_SIZE and TIME for size and time needed to handle all + calls in NODE. POSSIBLE_TRUTHS, KNOWN_VALS, KNOWN_AGGS and KNOWN_CONTEXTS + describe context of the call site. */ + +static void +estimate_calls_size_and_time (struct cgraph_node *node, int *size, + int *min_size, sreal *time, + ipa_hints *hints, + clause_t possible_truths, + vec<tree> known_vals, + vec<ipa_polymorphic_call_context> known_contexts, + vec<ipa_agg_value_set> known_aggs) +{ + class ipa_fn_summary *sum = ipa_fn_summaries->get (node); + bool use_table = true; + + gcc_assert (node->callees || node->indirect_calls); + + /* During early inlining we do not calculate info for very + large functions and thus there is no need for producing + summaries. */ + if (!ipa_node_params_sum) + use_table = false; + /* Do not calculate summaries for simple wrappers; it is waste + of memory. */ + else if (node->callees && node->indirect_calls + && node->callees->inline_failed && !node->callees->next_callee) + use_table = false; + /* If there is an indirect edge that may be optimized, we need + to go the slow way. */ + else if ((known_vals.length () + || known_contexts.length () + || known_aggs.length ()) && hints) + { + class ipa_node_params *params_summary = IPA_NODE_REF (node); + unsigned int nargs = params_summary + ? ipa_get_param_count (params_summary) : 0; + + for (unsigned int i = 0; i < nargs && use_table; i++) + { + if (ipa_is_param_used_by_indirect_call (params_summary, i) + && ((known_vals.length () > i && known_vals[i]) + || (known_aggs.length () > i + && known_aggs[i].items.length ()))) + use_table = false; + else if (ipa_is_param_used_by_polymorphic_call (params_summary, i) + && (known_contexts.length () > i + && !known_contexts[i].useless_p ())) + use_table = false; + } + } + + /* Fast path is via the call size time table. */ + if (use_table) + { + /* Build summary if it is absent. */ + if (!sum->call_size_time_table) + { + predicate true_pred = true; + sum->account_size_time (0, 0, true_pred, true_pred, true); + summarize_calls_size_and_time (node, sum); + } + + int old_size = *size; + sreal old_time = time ? *time : 0; + + if (min_size) + *min_size += (*sum->call_size_time_table)[0].size; + + unsigned int i; + size_time_entry *e; + + /* Walk the table and account sizes and times. */ + for (i = 0; vec_safe_iterate (sum->call_size_time_table, i, &e); + i++) + if (e->exec_predicate.evaluate (possible_truths)) + { + *size += e->size; + if (time) + *time += e->time; + } + + /* Be careful and see if both methods agree. */ + if (flag_checking || dump_file + /* Do not try to sanity check when we know we lost some + precision. */ + && sum->call_size_time_table->length () + < ipa_fn_summary::max_size_time_table_size) + { + estimate_calls_size_and_time_1 (node, &old_size, NULL, &old_time, NULL, + possible_truths, known_vals, + known_contexts, known_aggs); + gcc_assert (*size == old_size); + if (time && (*time - old_time > 1 || *time - old_time < -1) + && dump_file) + fprintf (dump_file, "Time mismatch in call summary %f!=%f", + old_time.to_double (), + time->to_double ()); + } + } + /* Slow path by walking all edges. */ + else + estimate_calls_size_and_time_1 (node, size, min_size, time, hints, + possible_truths, known_vals, known_contexts, + known_aggs); +} + /* Default constructor for ipa call context. Memory alloction of known_vals, known_contexts and known_aggs vectors is owned by the caller, but can @@ -3303,10 +3466,11 @@ ipa_call_context::estimate_size_and_time } } - estimate_calls_size_and_time (m_node, &size, &min_size, - ret_time ? &time : NULL, - ret_hints ? &hints : NULL, m_possible_truths, - m_known_vals, m_known_contexts, m_known_aggs); + if (m_node->callees || m_node->indirect_calls) + estimate_calls_size_and_time (m_node, &size, &min_size, + ret_time ? &time : NULL, + ret_hints ? &hints : NULL, m_possible_truths, + m_known_vals, m_known_contexts, m_known_aggs); sreal nonspecialized_time = time; @@ -3760,10 +3924,12 @@ ipa_update_overall_fn_summary (struct cg info->time += e->time; } info->min_size = (*info->size_time_table)[0].size; - estimate_calls_size_and_time (node, &size_info->size, &info->min_size, - &info->time, NULL, - ~(clause_t) (1 << predicate::false_condition), - vNULL, vNULL, vNULL); + vec_free (info->call_size_time_table); + if (node->callees || node->indirect_calls) + estimate_calls_size_and_time (node, &size_info->size, &info->min_size, + &info->time, NULL, + ~(clause_t) (1 << predicate::false_condition), + vNULL, vNULL, vNULL); size_info->size = RDIV (size_info->size, ipa_fn_summary::size_scale); info->min_size = RDIV (info->min_size, ipa_fn_summary::size_scale); } Index: ipa-fnsummary.h =================================================================== --- ipa-fnsummary.h (revision 278496) +++ ipa-fnsummary.h (working copy) @@ -117,8 +117,8 @@ public: inlinable (false), single_caller (false), fp_expressions (false), estimated_stack_size (false), time (0), conds (NULL), - size_time_table (NULL), loop_iterations (NULL), loop_stride (NULL), - growth (0), scc_no (0) + size_time_table (NULL), call_size_time_table (NULL), loop_iterations (NULL), + loop_stride (NULL), growth (0), scc_no (0) { } @@ -129,6 +129,7 @@ public: fp_expressions (s.fp_expressions), estimated_stack_size (s.estimated_stack_size), time (s.time), conds (s.conds), size_time_table (s.size_time_table), + call_size_time_table (NULL), loop_iterations (s.loop_iterations), loop_stride (s.loop_stride), growth (s.growth), scc_no (s.scc_no) {} @@ -161,7 +162,12 @@ public: /* Conditional size/time information. The summaries are being merged during inlining. */ conditions conds; + /* Normal code is acocunted in size_time_table, while calls are + accounted in call_size_time_table. This is because calls + are often adjusted by IPA optimizations and thus this summary + is generated from call summary information when needed. */ vec<size_time_entry, va_gc> *size_time_table; + vec<size_time_entry, va_gc> *call_size_time_table; /* Predicate on when some loop in the function becomes to have known bounds. */ @@ -179,10 +185,13 @@ public: int scc_no; /* Record time and size under given predicates. */ - void account_size_time (int, sreal, const predicate &, const predicate &); + void account_size_time (int, sreal, const predicate &, const predicate &, + bool call = false); /* We keep values scaled up, so fractional sizes can be accounted. */ static const int size_scale = 2; + /* Maximal size of size_time_table before we start to be conservative. */ + static const int max_size_time_table_size = 256; }; class GTY((user)) ipa_fn_summary_t: Index: ipa-inline.c =================================================================== --- ipa-inline.c (revision 278496) +++ ipa-inline.c (working copy) @@ -672,12 +672,27 @@ want_early_inline_function_p (struct cgr } else { - int growth = estimate_edge_growth (e); - int n; + /* First take care of very large functions. */ + int min_growth = estimate_min_edge_growth (e); int early_inlining_insns = opt_for_fn (e->caller->decl, optimize) >= 3 ? param_early_inlining_insns : param_early_inlining_insns_o2; + if (min_growth > early_inlining_insns) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, + " will not early inline: %C->%C, " + "call is cold and code would grow " + "at least by %i\n", + e->caller, callee, + min_growth); + want_inline = false; + } + + int growth = estimate_edge_growth (e); + int n; + if (growth <= param_max_inline_insns_size) ; Index: ipa-inline.h =================================================================== --- ipa-inline.h (revision 278496) +++ ipa-inline.h (working copy) @@ -82,6 +82,16 @@ estimate_edge_size (struct cgraph_edge * /* Return estimated callee growth after inlining EDGE. */ static inline int +estimate_min_edge_growth (struct cgraph_edge *edge) +{ + ipa_call_summary *s = ipa_call_summaries->get (edge); + struct cgraph_node *callee = edge->callee->ultimate_alias_target (); + return (ipa_fn_summaries->get (callee)->min_size - s->call_stmt_size); +} + +/* Return estimated callee growth after inlining EDGE. */ + +static inline int estimate_edge_growth (struct cgraph_edge *edge) { ipa_call_summary *s = ipa_call_summaries->get (edge);