> On Tue, 10 Dec 2019, Jan Hubicka wrote: > > > > I would recommend to make these variables uint64_t, then you can simply do > > > > > > tp_first_run_a--; > > > tp_first_run_b--; > > > > > > making 0 wrap around to UINT64_MAX. Then they will naturally sort after > > > all > > > other nodes. > > > > Then we would still have to watch the overflow before returning? I > > actually find the condtional sort of more readable than intentional wrap > > around the range, so I kept it in the code espeically because I made the > > value 32bit again and without this trick I no longer need to watch > > overflows. > > For int-typed timestamps, you'd need to warp 0 to INT_MAX, e.g. like this: > > tp_first_run_a = (tp_first_run_a - 1) & INT_MAX; > tp_first_run_b = (tp_first_run_b - 1) & INT_MAX; > > which keeps them in 0..INT_MAX range so 'return tp_first_run_a - > tp_first_run_b' > still works.
OK, updatd code for that :) > > > > > + /* Output functions in RPO so callers get optimized before callees. > > > > This > > > > + makes ipa-ra and other propagators to work. > > > > + FIXME: This is far from optimal code layout. */ > > > > > > I think this should have said "callees get optimized before callers". > > > > Indeed. > > So technically this would be just postorder, not RPO :) Well, we it is computed by ipa_reverse_postorder :) > > > --- cgraph.c (revision 279093) > > +++ cgraph.c (working copy) > > @@ -3074,6 +3074,11 @@ cgraph_node::verify_node (void) > > inlined_to->count.debug (); > > error_found = true; > > } > > + if (tp_first_run < 0) > > + { > > + error ("tp_first_run must be positive"); > > + error_found = true; > > + } > > "non-negative"? Fixed too. I have comitted the following variant of patch after re-testing. * cgraph.c (cgraph_node::verify_node): Verify tp_first_run. * cgraph.h (cgrpah_node): Turn tp_first_run back to int. * cgraphunit.c (tp_first_run_node_cmp): Do not watch for overflows. (expand_all_functions): First expand ordered section and then unordered. * lto-partition.c (lto_balanced_map): Fix printing of tp_first_run. * profile.c (compute_value_histograms): Error on out of range tp_first_runs. Index: cgraph.c =================================================================== --- cgraph.c (revision 279167) +++ cgraph.c (working copy) @@ -3066,6 +3066,11 @@ cgraph_node::verify_node (void) inlined_to->count.debug (); error_found = true; } + if (tp_first_run < 0) + { + error ("tp_first_run must be non-negative"); + error_found = true; + } if (!definition && !in_other_partition && local) { error ("local symbols must be defined"); Index: cgraph.h =================================================================== --- cgraph.h (revision 279167) +++ cgraph.h (working copy) @@ -926,9 +926,9 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg clone_of (NULL), call_site_hash (NULL), former_clone_of (NULL), simdclone (NULL), simd_clones (NULL), ipa_transforms_to_apply (vNULL), inlined_to (NULL), rtl (NULL), clone (), thunk (), - count (profile_count::uninitialized ()), tp_first_run (false), + count (profile_count::uninitialized ()), count_materialization_scale (REG_BR_PROB_BASE), profile_id (0), - unit_id (0), used_as_abstract_origin (false), + unit_id (0), tp_first_run (0), used_as_abstract_origin (false), lowered (false), process (false), frequency (NODE_FREQUENCY_NORMAL), only_called_at_startup (false), only_called_at_exit (false), tm_clone (false), dispatcher_function (false), calls_comdat_local (false), @@ -1469,8 +1469,6 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg /* Expected number of executions: calculated in profile.c. */ profile_count count; - /* Time profiler: first run of function. */ - gcov_type tp_first_run; /* How to scale counts at materialization time; used to merge LTO units with different number of profile runs. */ int count_materialization_scale; @@ -1478,6 +1476,8 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg unsigned int profile_id; /* ID of the translation unit. */ int unit_id; + /* Time profiler: first run of function. */ + int tp_first_run; /* Set when decl is an abstract function pointed to by the ABSTRACT_DECL_ORIGIN of a reachable function. */ Index: cgraphunit.c =================================================================== --- cgraphunit.c (revision 279167) +++ cgraphunit.c (working copy) @@ -2364,8 +2364,8 @@ tp_first_run_node_cmp (const void *pa, c { const cgraph_node *a = *(const cgraph_node * const *) pa; const cgraph_node *b = *(const cgraph_node * const *) pb; - gcov_type tp_first_run_a = a->tp_first_run; - gcov_type tp_first_run_b = b->tp_first_run; + unsigned int tp_first_run_a = a->tp_first_run; + unsigned int tp_first_run_b = b->tp_first_run; if (!opt_for_fn (a->decl, flag_profile_reorder_functions) || a->no_reorder) @@ -2378,14 +2378,10 @@ tp_first_run_node_cmp (const void *pa, c return a->order - b->order; /* Functions with time profile must be before these without profile. */ - if (!tp_first_run_a || !tp_first_run_b) - return tp_first_run_b ? 1 : -1; + tp_first_run_a = (tp_first_run_a - 1) & INT_MAX; + tp_first_run_b = (tp_first_run_b - 1) & INT_MAX; - /* Watch for overlflow - tp_first_run is 64bit. */ - if (tp_first_run_a > tp_first_run_b) - return 1; - else - return -1; + return tp_first_run_a - tp_first_run_b; } /* Expand all functions that must be output. @@ -2425,43 +2421,45 @@ expand_all_functions (void) order[new_order_pos++] = order[i]; } - /* Output functions in RPO so callers get optimized before callees. This - makes ipa-ra and other propagators to work. - FIXME: This is far from optimal code layout. */ - for (i = new_order_pos - 1; i >= 0; i--) + /* First output functions with time profile in specified order. */ + qsort (tp_first_run_order, tp_first_run_order_pos, + sizeof (cgraph_node *), tp_first_run_node_cmp); + for (i = 0; i < tp_first_run_order_pos; i++) { - node = order[i]; + node = tp_first_run_order[i]; if (node->process) { expanded_func_count++; + profiled_func_count++; + + if (symtab->dump_file) + fprintf (symtab->dump_file, + "Time profile order in expand_all_functions:%s:%d\n", + node->asm_name (), node->tp_first_run); node->process = 0; node->expand (); } } - qsort (tp_first_run_order, tp_first_run_order_pos, - sizeof (cgraph_node *), tp_first_run_node_cmp); - for (i = 0; i < tp_first_run_order_pos; i++) + + /* Output functions in RPO so callees get optimized before callers. This + makes ipa-ra and other propagators to work. + FIXME: This is far from optimal code layout. */ + for (i = new_order_pos - 1; i >= 0; i--) { - node = tp_first_run_order[i]; + node = order[i]; if (node->process) { expanded_func_count++; - profiled_func_count++; - - if (symtab->dump_file) - fprintf (symtab->dump_file, - "Time profile order in expand_all_functions:%s:%" PRId64 - "\n", node->asm_name (), (int64_t) node->tp_first_run); node->process = 0; node->expand (); } } - if (dump_file) - fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n", - main_input_filename, profiled_func_count, expanded_func_count); + if (dump_file) + fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n", + main_input_filename, profiled_func_count, expanded_func_count); if (symtab->dump_file && tp_first_run_order_pos) fprintf (symtab->dump_file, "Expanded functions with time profile:%u/%u\n", Index: lto/lto-partition.c =================================================================== --- lto/lto-partition.c (revision 279167) +++ lto/lto-partition.c (working copy) @@ -514,11 +514,11 @@ lto_balanced_map (int n_lto_partitions, if (dump_file) { for (unsigned i = 0; i < order.length (); i++) - fprintf (dump_file, "Balanced map symbol order:%s:%" PRId64 "\n", - order[i]->name (), (int64_t) order[i]->tp_first_run); + fprintf (dump_file, "Balanced map symbol order:%s:%u\n", + order[i]->name (), order[i]->tp_first_run); for (unsigned i = 0; i < noreorder.length (); i++) - fprintf (dump_file, "Balanced map symbol no_reorder:%s:%" PRId64 "\n", - noreorder[i]->name (), (int64_t) noreorder[i]->tp_first_run); + fprintf (dump_file, "Balanced map symbol no_reorder:%s:%u\n", + noreorder[i]->name (), noreorder[i]->tp_first_run); } /* Collect all variables that should not be reordered. */ Index: profile.c =================================================================== --- profile.c (revision 279167) +++ profile.c (working copy) @@ -871,11 +871,18 @@ compute_value_histograms (histogram_valu if (hist->type == HIST_TYPE_TIME_PROFILE) { node = cgraph_node::get (hist->fun->decl); - node->tp_first_run = hist->hvalue.counters[0]; + if (hist->hvalue.counters[0] >= 0 + && hist->hvalue.counters[0] < INT_MAX / 2) + node->tp_first_run = hist->hvalue.counters[0]; + else + { + if (flag_profile_correction) + error ("corrupted profile info: invalid time profile"); + node->tp_first_run = 0; + } if (dump_file) - fprintf (dump_file, "Read tp_first_run: %" PRId64 "\n", - (int64_t) node->tp_first_run); + fprintf (dump_file, "Read tp_first_run: %d\n", node->tp_first_run); } }