> On Sun, 8 Dec 2019, Jan Hubicka wrote:
>
> > Other explanation would be that our new qsort with broken comparator due to
> > overflow can actually remove some entries in the array, but that sounds a bit
> > crazy.
>
> gcc_qsort only reorders elements, making it possible for gcc_qsort_chk (that
> runs afterwards) to catch crazy comparators in a sound manner.
I understand this problem (and it is very weird). It was caused by
the optimize attribute incorrectly overwriting
flag_profile_reorder_functions, which is not supposed to change globally
mid-compilation. So we ended up putting some symbols for ordered
output and then forgetting about them.
> > /* Functions with time profile must be before these without profile. */
> > - if (!a->tp_first_run || !b->tp_first_run)
> > - return a->tp_first_run - b->tp_first_run;
> > + if (!tp_first_run_a || !tp_first_run_b)
> > + return tp_first_run_a ? 1 : -1;
>
> The code does the opposite of the comment: when tp_first_run_b is 0, it will
> return 1, indicating a > b, causing b to appear in front of a in the sorted
> array.
You are right - I noticed that and fixed it as an (apparently
forgotten) last-minute change. Trunk says tp_first_run_b.
>
> I would recommend to make these variables uint64_t, then you can simply do
>
> tp_first_run_a--;
> tp_first_run_b--;
>
> making 0 wrap around to UINT64_MAX. Then they will naturally sort after all
> other nodes.
Then we would still have to watch for overflow before returning? I
actually find the conditional somewhat more readable than an intentional
wrap-around of the range, so I kept it in the code, especially because I made
the value 32-bit again, and without this trick I no longer need to watch for
overflows.
>
> > + /* Output functions in RPO so callers get optimized before callees. This
> > + makes ipa-ra and other propagators to work.
> > + FIXME: This is far from optimal code layout. */
>
> I think this should have said "callees get optimized before callers".
Indeed.
Here is a patch fixing the issues, which I have bootstrapped&regtested.
I will wait a bit for comments before committing.
Honza
* cgraph.c (cgraph_node::verify_node): Verify tp_first_run.
* cgraph.h (cgraph_node): Turn tp_first_run back to int.
* cgraphunit.c (tp_first_run_node_cmp): Do not watch for overflows.
(expand_all_functions): First expand ordered section and then
unordered.
* lto-partition.c (lto_balanced_map): Fix printing of tp_first_run.
* profile.c (compute_value_histograms): Error on out of range
tp_first_runs.
Index: cgraph.c
===================================================================
--- cgraph.c (revision 279093)
+++ cgraph.c (working copy)
@@ -3074,6 +3074,11 @@ cgraph_node::verify_node (void)
inlined_to->count.debug ();
error_found = true;
}
+ if (tp_first_run < 0)
+ {
+ error ("tp_first_run must be positive");
+ error_found = true;
+ }
if (!definition && !in_other_partition && local)
{
error ("local symbols must be defined");
Index: cgraph.h
===================================================================
--- cgraph.h (revision 279093)
+++ cgraph.h (working copy)
@@ -1430,8 +1430,6 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg
/* Expected number of executions: calculated in profile.c. */
profile_count count;
- /* Time profiler: first run of function. */
- gcov_type tp_first_run;
/* How to scale counts at materialization time; used to merge
LTO units with different number of profile runs. */
int count_materialization_scale;
@@ -1439,6 +1437,8 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg
unsigned int profile_id;
/* ID of the translation unit. */
int unit_id;
+ /* Time profiler: first run of function. */
+ int tp_first_run;
/* Set when decl is an abstract function pointed to by the
ABSTRACT_DECL_ORIGIN of a reachable function. */
Index: lto/lto-partition.c
===================================================================
--- lto/lto-partition.c (revision 279093)
+++ lto/lto-partition.c (working copy)
@@ -514,11 +514,11 @@ lto_balanced_map (int n_lto_partitions,
if (dump_file)
{
for (unsigned i = 0; i < order.length (); i++)
- fprintf (dump_file, "Balanced map symbol order:%s:%" PRId64 "\n",
- order[i]->name (), (int64_t) order[i]->tp_first_run);
+ fprintf (dump_file, "Balanced map symbol order:%s:%u\n",
+ order[i]->name (), order[i]->tp_first_run);
for (unsigned i = 0; i < noreorder.length (); i++)
- fprintf (dump_file, "Balanced map symbol no_reorder:%s:%" PRId64 "\n",
- noreorder[i]->name (), (int64_t) noreorder[i]->tp_first_run);
+ fprintf (dump_file, "Balanced map symbol no_reorder:%s:%u\n",
+ noreorder[i]->name (), noreorder[i]->tp_first_run);
}
/* Collect all variables that should not be reordered. */
Index: profile.c
===================================================================
--- profile.c (revision 279093)
+++ profile.c (working copy)
@@ -871,11 +871,18 @@ compute_value_histograms (histogram_valu
if (hist->type == HIST_TYPE_TIME_PROFILE)
{
node = cgraph_node::get (hist->fun->decl);
- node->tp_first_run = hist->hvalue.counters[0];
+ if (hist->hvalue.counters[0] >= 0
+ && hist->hvalue.counters[0] < INT_MAX / 2)
+ node->tp_first_run = hist->hvalue.counters[0];
+ else
+ {
+ if (flag_profile_correction)
+ error ("corrupted profile info: invalid time profile");
+ node->tp_first_run = 0;
+ }
if (dump_file)
- fprintf (dump_file, "Read tp_first_run: %" PRId64 "\n",
- (int64_t) node->tp_first_run);
+ fprintf (dump_file, "Read tp_first_run: %d\n", node->tp_first_run);
}
}
Index: cgraphunit.c
===================================================================
--- cgraphunit.c (revision 279093)
+++ cgraphunit.c (working copy)
@@ -2364,8 +2364,8 @@ tp_first_run_node_cmp (const void *pa, c
{
const cgraph_node *a = *(const cgraph_node * const *) pa;
const cgraph_node *b = *(const cgraph_node * const *) pb;
- gcov_type tp_first_run_a = a->tp_first_run;
- gcov_type tp_first_run_b = b->tp_first_run;
+ int tp_first_run_a = a->tp_first_run;
+ int tp_first_run_b = b->tp_first_run;
if (!opt_for_fn (a->decl, flag_profile_reorder_functions)
|| a->no_reorder)
@@ -2381,11 +2381,7 @@ tp_first_run_node_cmp (const void *pa, c
if (!tp_first_run_a || !tp_first_run_b)
return tp_first_run_b ? 1 : -1;
- /* Watch for overlflow - tp_first_run is 64bit. */
- if (tp_first_run_a > tp_first_run_b)
- return 1;
- else
- return -1;
+ return tp_first_run_a - tp_first_run_b;
}
/* Expand all functions that must be output.
@@ -2425,43 +2421,45 @@ expand_all_functions (void)
order[new_order_pos++] = order[i];
}
- /* Output functions in RPO so callers get optimized before callees. This
- makes ipa-ra and other propagators to work.
- FIXME: This is far from optimal code layout. */
- for (i = new_order_pos - 1; i >= 0; i--)
+ /* First output functions with time profile in specified order. */
+ qsort (tp_first_run_order, tp_first_run_order_pos,
+ sizeof (cgraph_node *), tp_first_run_node_cmp);
+ for (i = 0; i < tp_first_run_order_pos; i++)
{
- node = order[i];
+ node = tp_first_run_order[i];
if (node->process)
{
expanded_func_count++;
+ profiled_func_count++;
+
+ if (symtab->dump_file)
+ fprintf (symtab->dump_file,
+ "Time profile order in expand_all_functions:%s:%d\n",
+ node->asm_name (), node->tp_first_run);
node->process = 0;
node->expand ();
}
}
- qsort (tp_first_run_order, tp_first_run_order_pos,
- sizeof (cgraph_node *), tp_first_run_node_cmp);
- for (i = 0; i < tp_first_run_order_pos; i++)
+
+ /* Output functions in RPO so callees get optimized before callers. This
+ makes ipa-ra and other propagators to work.
+ FIXME: This is far from optimal code layout. */
+ for (i = new_order_pos - 1; i >= 0; i--)
{
- node = tp_first_run_order[i];
+ node = order[i];
if (node->process)
{
expanded_func_count++;
- profiled_func_count++;
-
- if (symtab->dump_file)
- fprintf (symtab->dump_file,
- "Time profile order in expand_all_functions:%s:%" PRId64
- "\n", node->asm_name (), (int64_t) node->tp_first_run);
node->process = 0;
node->expand ();
}
}
- if (dump_file)
- fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
- main_input_filename, profiled_func_count, expanded_func_count);
+ if (dump_file)
+ fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+ main_input_filename, profiled_func_count, expanded_func_count);
if (symtab->dump_file && tp_first_run_order_pos)
fprintf (symtab->dump_file, "Expanded functions with time profile:%u/%u\n",