> On Tue, 10 Dec 2019, Jan Hubicka wrote:
> 
> > > I would recommend to make these variables uint64_t, then you can simply do
> > > 
> > >   tp_first_run_a--;
> > >   tp_first_run_b--;
> > > 
> > > making 0 wrap around to UINT64_MAX. Then they will naturally sort after 
> > > all
> > > other nodes.
> > 
> > Then we would still have to watch the overflow before returning? I
> > actually find the condtional sort of more readable than intentional wrap
> > around the range, so I kept it in the code espeically because I made the
> > value 32bit again and without this trick I no longer need to watch
> > overflows.
> 
> For int-typed timestamps, you'd need to warp 0 to INT_MAX, e.g. like this:
> 
>   tp_first_run_a = (tp_first_run_a - 1) & INT_MAX;
>   tp_first_run_b = (tp_first_run_b - 1) & INT_MAX;
> 
> which keeps them in 0..INT_MAX range so 'return tp_first_run_a - 
> tp_first_run_b'
> still works.

OK, updatd code for that :)
> 
> > > > +  /* Output functions in RPO so callers get optimized before callees.  
> > > > This
> > > > +     makes ipa-ra and other propagators to work.
> > > > +     FIXME: This is far from optimal code layout.  */
> > > 
> > > I think this should have said "callees get optimized before callers".
> > 
> > Indeed.
> 
> So technically this would be just postorder, not RPO :)

Well, we it is computed by ipa_reverse_postorder :)
> 
> > --- cgraph.c        (revision 279093)
> > +++ cgraph.c        (working copy)
> > @@ -3074,6 +3074,11 @@ cgraph_node::verify_node (void)
> >        inlined_to->count.debug ();
> >        error_found = true;
> >      }
> > +  if (tp_first_run < 0)
> > +    {
> > +      error ("tp_first_run must be positive");
> > +      error_found = true;
> > +    }
> 
> "non-negative"?
Fixed too.

I have comitted the following variant of patch after re-testing.


        * cgraph.c (cgraph_node::verify_node): Verify tp_first_run.
        * cgraph.h (cgrpah_node): Turn tp_first_run back to int.
        * cgraphunit.c (tp_first_run_node_cmp): Do not watch for overflows.
        (expand_all_functions): First expand ordered section and then
        unordered.
        * lto-partition.c (lto_balanced_map): Fix printing of tp_first_run.
        * profile.c (compute_value_histograms): Error on out of range
        tp_first_runs.
Index: cgraph.c
===================================================================
--- cgraph.c    (revision 279167)
+++ cgraph.c    (working copy)
@@ -3066,6 +3066,11 @@ cgraph_node::verify_node (void)
       inlined_to->count.debug ();
       error_found = true;
     }
+  if (tp_first_run < 0)
+    {
+      error ("tp_first_run must be non-negative");
+      error_found = true;
+    }
   if (!definition && !in_other_partition && local)
     {
       error ("local symbols must be defined");
Index: cgraph.h
===================================================================
--- cgraph.h    (revision 279167)
+++ cgraph.h    (working copy)
@@ -926,9 +926,9 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg
       clone_of (NULL), call_site_hash (NULL), former_clone_of (NULL),
       simdclone (NULL), simd_clones (NULL), ipa_transforms_to_apply (vNULL),
       inlined_to (NULL), rtl (NULL), clone (), thunk (),
-      count (profile_count::uninitialized ()), tp_first_run (false),
+      count (profile_count::uninitialized ()),
       count_materialization_scale (REG_BR_PROB_BASE), profile_id (0),
-      unit_id (0), used_as_abstract_origin (false),
+      unit_id (0), tp_first_run (0), used_as_abstract_origin (false),
       lowered (false), process (false), frequency (NODE_FREQUENCY_NORMAL),
       only_called_at_startup (false), only_called_at_exit (false),
       tm_clone (false), dispatcher_function (false), calls_comdat_local 
(false),
@@ -1469,8 +1469,6 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg
 
   /* Expected number of executions: calculated in profile.c.  */
   profile_count count;
-  /* Time profiler: first run of function.  */
-  gcov_type tp_first_run;
   /* How to scale counts at materialization time; used to merge
      LTO units with different number of profile runs.  */
   int count_materialization_scale;
@@ -1478,6 +1476,8 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cg
   unsigned int profile_id;
   /* ID of the translation unit.  */
   int unit_id;
+  /* Time profiler: first run of function.  */
+  int tp_first_run;
 
   /* Set when decl is an abstract function pointed to by the
      ABSTRACT_DECL_ORIGIN of a reachable function.  */
Index: cgraphunit.c
===================================================================
--- cgraphunit.c        (revision 279167)
+++ cgraphunit.c        (working copy)
@@ -2364,8 +2364,8 @@ tp_first_run_node_cmp (const void *pa, c
 {
   const cgraph_node *a = *(const cgraph_node * const *) pa;
   const cgraph_node *b = *(const cgraph_node * const *) pb;
-  gcov_type tp_first_run_a = a->tp_first_run;
-  gcov_type tp_first_run_b = b->tp_first_run;
+  unsigned int tp_first_run_a = a->tp_first_run;
+  unsigned int tp_first_run_b = b->tp_first_run;
 
   if (!opt_for_fn (a->decl, flag_profile_reorder_functions)
       || a->no_reorder)
@@ -2378,14 +2378,10 @@ tp_first_run_node_cmp (const void *pa, c
     return a->order - b->order;
 
   /* Functions with time profile must be before these without profile.  */
-  if (!tp_first_run_a || !tp_first_run_b)
-    return tp_first_run_b ? 1 : -1;
+  tp_first_run_a = (tp_first_run_a - 1) & INT_MAX;
+  tp_first_run_b = (tp_first_run_b - 1) & INT_MAX;
 
-  /* Watch for overlflow - tp_first_run is 64bit.  */
-  if (tp_first_run_a > tp_first_run_b)
-    return 1;
-  else
-    return -1;
+  return tp_first_run_a - tp_first_run_b;
 }
 
 /* Expand all functions that must be output.
@@ -2425,43 +2421,45 @@ expand_all_functions (void)
           order[new_order_pos++] = order[i];
       }
 
-  /* Output functions in RPO so callers get optimized before callees.  This
-     makes ipa-ra and other propagators to work.
-     FIXME: This is far from optimal code layout.  */
-  for (i = new_order_pos - 1; i >= 0; i--)
+  /* First output functions with time profile in specified order.  */
+  qsort (tp_first_run_order, tp_first_run_order_pos,
+        sizeof (cgraph_node *), tp_first_run_node_cmp);
+  for (i = 0; i < tp_first_run_order_pos; i++)
     {
-      node = order[i];
+      node = tp_first_run_order[i];
 
       if (node->process)
        {
          expanded_func_count++;
+         profiled_func_count++;
+
+         if (symtab->dump_file)
+           fprintf (symtab->dump_file,
+                    "Time profile order in expand_all_functions:%s:%d\n",
+                    node->asm_name (), node->tp_first_run);
          node->process = 0;
          node->expand ();
        }
     }
-  qsort (tp_first_run_order, tp_first_run_order_pos,
-        sizeof (cgraph_node *), tp_first_run_node_cmp);
-  for (i = 0; i < tp_first_run_order_pos; i++)
+
+  /* Output functions in RPO so callees get optimized before callers.  This
+     makes ipa-ra and other propagators to work.
+     FIXME: This is far from optimal code layout.  */
+  for (i = new_order_pos - 1; i >= 0; i--)
     {
-      node = tp_first_run_order[i];
+      node = order[i];
 
       if (node->process)
        {
          expanded_func_count++;
-         profiled_func_count++;
-
-         if (symtab->dump_file)
-           fprintf (symtab->dump_file,
-                    "Time profile order in expand_all_functions:%s:%" PRId64
-                    "\n", node->asm_name (), (int64_t) node->tp_first_run);
          node->process = 0;
          node->expand ();
        }
     }
 
-    if (dump_file)
-      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
-               main_input_filename, profiled_func_count, expanded_func_count);
+  if (dump_file)
+    fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+            main_input_filename, profiled_func_count, expanded_func_count);
 
   if (symtab->dump_file && tp_first_run_order_pos)
     fprintf (symtab->dump_file, "Expanded functions with time profile:%u/%u\n",
Index: lto/lto-partition.c
===================================================================
--- lto/lto-partition.c (revision 279167)
+++ lto/lto-partition.c (working copy)
@@ -514,11 +514,11 @@ lto_balanced_map (int n_lto_partitions,
   if (dump_file)
     {
       for (unsigned i = 0; i < order.length (); i++)
-       fprintf (dump_file, "Balanced map symbol order:%s:%" PRId64 "\n",
-                order[i]->name (), (int64_t) order[i]->tp_first_run);
+       fprintf (dump_file, "Balanced map symbol order:%s:%u\n",
+                order[i]->name (), order[i]->tp_first_run);
       for (unsigned i = 0; i < noreorder.length (); i++)
-       fprintf (dump_file, "Balanced map symbol no_reorder:%s:%" PRId64 "\n",
-                noreorder[i]->name (), (int64_t) noreorder[i]->tp_first_run);
+       fprintf (dump_file, "Balanced map symbol no_reorder:%s:%u\n",
+                noreorder[i]->name (), noreorder[i]->tp_first_run);
     }
 
   /* Collect all variables that should not be reordered.  */
Index: profile.c
===================================================================
--- profile.c   (revision 279167)
+++ profile.c   (working copy)
@@ -871,11 +871,18 @@ compute_value_histograms (histogram_valu
       if (hist->type == HIST_TYPE_TIME_PROFILE)
         {
          node = cgraph_node::get (hist->fun->decl);
-         node->tp_first_run = hist->hvalue.counters[0];
+         if (hist->hvalue.counters[0] >= 0
+             && hist->hvalue.counters[0] < INT_MAX / 2)
+           node->tp_first_run = hist->hvalue.counters[0];
+         else
+           {
+             if (flag_profile_correction)
+               error ("corrupted profile info: invalid time profile");
+             node->tp_first_run = 0;
+           }
 
           if (dump_file)
-            fprintf (dump_file, "Read tp_first_run: %" PRId64 "\n",
-                    (int64_t) node->tp_first_run);
+            fprintf (dump_file, "Read tp_first_run: %d\n", node->tp_first_run);
         }
     }
 

Reply via email to