https://gcc.gnu.org/g:348c623d7dc43caafa9ae7878133df4e12edf54b

commit r16-6920-g348c623d7dc43caafa9ae7878133df4e12edf54b
Author: Prachi Godbole <[email protected]>
Date:   Mon Jan 19 20:49:38 2026 -0800

    ipa-reorder-for-locality - Introduce C++ template heuristics
    
    This patch introduces a new heuristics for reordering functions, to be used
    in the absense of profile information.  This approach uses C++ template
    instantiation types to group functions together.  Entry functions are sorted
    in the beginning, and callees are sorted as part of partition_callchain ().
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Prachi Godbole <[email protected]>
    
    config/ChangeLog:
    
            * bootstrap-lto-locality-cpp-template.mk: New file.
    
    gcc/ChangeLog:
    
            * flag-types.h (enum lto_locality_heuristics): New enum.
            * ipa-locality-cloning.cc (struct templ_info): New struct.
            (struct locality_info): Add templ_info field.
            (templ_hash_map): New hash_map.
            (callee_templ_cmp): Ditto.
            (static_profile_templ_cmp): Ditto.
            (sort_templ_hashes_cmp): Ditto.
            (order_templ_hashes): Ditto.
            (locality_dc_template_p): Ditto.
            (populate_templ_info): Ditto.
            (create_locality_info): Call populate_templ_info.
            (partition_callchain): Call callee_templ_cmp.
            (locality_determine_static_order): Populate and sort templ_hash_map.
            (locality_partition_and_clone): Handle lto_locality_heuristics.
            (lc_execute): Initialize templ_hash_map.
            * params.opt: New param.

Diff:
---
 config/bootstrap-lto-locality-cpp-template.mk |  22 +++
 gcc/flag-types.h                              |   6 +
 gcc/ipa-locality-cloning.cc                   | 243 ++++++++++++++++++++++++--
 gcc/params.opt                                |  10 ++
 4 files changed, 271 insertions(+), 10 deletions(-)

diff --git a/config/bootstrap-lto-locality-cpp-template.mk 
b/config/bootstrap-lto-locality-cpp-template.mk
new file mode 100644
index 000000000000..8cf804765dc9
--- /dev/null
+++ b/config/bootstrap-lto-locality-cpp-template.mk
@@ -0,0 +1,22 @@
+# This option enables LTO and locality partitioning for stage2 and stage3 in 
slim mode
+
+STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality \
+                --param=lto-partition-locality-heuristics=cpp_template
+STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality \
+                --param=lto-partition-locality-heuristics=cpp_template
+STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1
+STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1
+STAGEfeedback_CFLAGS += -flto=jobserver -frandom-seed=1 
-fipa-reorder-for-locality
+
+# assumes the host supports the linker plugin
+LTO_AR = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ar$(exeext) 
-B$$r/$(HOST_SUBDIR)/prev-gcc/
+LTO_RANLIB = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ranlib$(exeext) 
-B$$r/$(HOST_SUBDIR)/prev-gcc/
+LTO_NM = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-nm$(exeext) 
-B$$r/$(HOST_SUBDIR)/prev-gcc/
+
+LTO_EXPORTS = AR="$(LTO_AR)"; export AR; \
+             RANLIB="$(LTO_RANLIB)"; export RANLIB; \
+             NM="$(LTO_NM)"; export NM;
+LTO_FLAGS_TO_PASS = AR="$(LTO_AR)" RANLIB="$(LTO_RANLIB)" NM="$(LTO_NM)"
+
+do-compare = $(SHELL) $(srcdir)/contrib/compare-lto $$f1 $$f2
+extra-compare = gcc/lto1$(exeext)
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 4d2f08c43992..77dc1f658fbd 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -422,6 +422,12 @@ enum lto_locality_cloning_model {
   LTO_LOCALITY_MAXIMAL_CLONING = 2,
 };
 
+/* flag_lto_locality_heuristics initialization values.  */
+enum lto_locality_heuristics {
+  LTO_LOCALITY_DEFAULT_HEURISTIC = 0,
+  LTO_LOCALITY_CPP_TEMPLATE = 1,
+};
+
 /* flag_lto_linker_output initialization values.  */
 enum lto_linker_output {
   LTO_LINKER_OUTPUT_UNKNOWN,
diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc
index fa461518da43..283f76d048bb 100644
--- a/gcc/ipa-locality-cloning.cc
+++ b/gcc/ipa-locality-cloning.cc
@@ -43,7 +43,15 @@ along with GCC; see the file COPYING3.  If not see
         Create a local clone in current partition, if cloning criteria is
         satisfied.
       3. Redirect any new caller to a local clone if one exists.
-   Partition size is param controlled to fine tune per program behavior.  */
+
+   Another heuristics can be used in the absense of profile information.  This
+   approach uses C++ template instantiation types to group functions together.
+   Entry functions are sorted in the beginning by most used template types, and
+   callees are sorted as part of partition_callchain (), again by template
+   types.
+
+   For bothe schemes, partition size is param controlled to fine tune per
+   program behavior.  */
 
 #include "config.h"
 #define INCLUDE_ALGORITHM
@@ -65,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-modref-tree.h"
 #include "ipa-modref.h"
 #include "symtab-clones.h"
+#include "demangle.h"
 #include "ipa-locality-cloning.h"
 
 /* Locality partitions, assigns nodes to partitions.  These are used later in
@@ -88,6 +97,14 @@ struct locality_order
   {}
 };
 
+/* Data structure to hold information about unique template instantiations.  */
+struct templ_info
+{
+  hashval_t templ_hash;
+  int count;
+  int order;
+};
+
 /* Data structure to hold accumulated edge frequencies for unique callees.
    Frequencies of aliased callees are accumulated with the ultimate target
    alias represented by ultimate_callee.
@@ -114,9 +131,13 @@ struct locality_info
   /* Accumulated node->callee edge frequencies for unique callees.  */
   hash_map<loc_map_hash, locality_callee_info> callee_info;
 
+  /* Template instantiation info.  */
+  hashval_t templ_hash;
+
   locality_info ()
     {
       all_callees.create (1);
+      templ_hash = 0;
     }
   ~locality_info ()
     {
@@ -124,6 +145,10 @@ struct locality_info
     }
 };
 
+/* Template instantiation hash_map.  */
+typedef hash_map<int_hash<hashval_t, 0>, templ_info> hash_freq_map_t;
+static std::unique_ptr<hash_freq_map_t> templ_hash_map;
+
 /* Pool allocation for locality_info.  */
 static object_allocator<locality_info> loc_infos ("IPA locality callchain");
 static
@@ -157,6 +182,72 @@ callee_default_cmp (const void *pa, const void *pb)
   return compare_node_uids (a->ultimate_callee, b->ultimate_callee);
 }
 
+/* Sort all_callees of NODE by template hashes and edge hotness.
+
+   all_callees is already sorted by call frequency.  If templ_hash of the
+   caller is same as a callee's templ_hash then that callee is ordered first.
+   Callees having same templ_hash are sorted by call frequency, callees with
+   different templ_hash are sorted by the template order.
+
+   if (Both PA and PB have templates && PA's template == PB'template) ||
+   (Both A and B don't have templates): sort by frequency
+
+   if (PA doesn't have template && PB has) || (Both have different templates &&
+    caller's template == PB's template): PB before PA
+
+   if (PA has template && PB doesn't) || (Both have different templates &&
+    caller's template == PA's template): PA before PB
+
+   if (Both have different templates && both are different than caller's
+    template: sort by template order.  */
+static int
+callee_templ_cmp (const void *pa, const void *pb)
+{
+  const locality_callee_info *a = *static_cast<const locality_callee_info
+    *const *> (pa);
+  const locality_callee_info *b = *static_cast<const locality_callee_info
+    *const *> (pb);
+
+  locality_info *caller_info = get_locality_info (a->ultimate_caller);
+  hashval_t caller_hash = 0;
+  if (caller_info)
+    caller_hash = caller_info->templ_hash;
+
+  locality_info *ainfo = get_locality_info (a->edge->callee);
+  locality_info *binfo = get_locality_info (b->edge->callee);
+  templ_info *atinfo = NULL, *btinfo = NULL;
+  hashval_t a_hash = 0, b_hash = 0;
+  if (ainfo)
+    {
+      a_hash = ainfo->templ_hash;
+      atinfo = templ_hash_map->get (a_hash);
+    }
+  if (binfo)
+    {
+      b_hash = binfo->templ_hash;
+      btinfo = templ_hash_map->get (b_hash);
+    }
+
+  if (a_hash == b_hash)
+    {
+      /* We get here if both have same template type or both have 0 as hash.  
*/
+      return callee_default_cmp (pa, pb);
+    }
+  else if (a_hash && (!b_hash || a_hash == caller_hash))
+    return -1;
+  else if (b_hash && (!a_hash || b_hash == caller_hash))
+    return 1;
+  else
+    {
+      gcc_assert (atinfo && btinfo);
+      if (atinfo->order < btinfo->order)
+       return -1;
+      /* Order being same is already handled above.  */
+      return 1;
+    }
+  return callee_default_cmp (pa, pb);
+}
+
 /* Sort all_callees in INFO by edge hotness.  */
 static void
 sort_all_callees_default (locality_info *info)
@@ -198,6 +289,69 @@ populate_callee_locality_info (cgraph_node *node, 
cgraph_node *n,
     }
 }
 
+/* Forward declaration.  */
+static int static_profile_cmp (const void *pa, const void *pb);
+
+/* Helper function for qsort; sort nodes in the descending order derived of
+   template instantiation types by frequency count.  */
+static int
+static_profile_templ_cmp (const void *pa, const void *pb)
+{
+  const locality_order *a = *static_cast<const locality_order *const *> (pa);
+  const locality_order *b = *static_cast<const locality_order *const *> (pb);
+  locality_info *ainfo = get_locality_info (a->node);
+  templ_info *atinfo = templ_hash_map->get (ainfo->templ_hash);
+  locality_info *binfo = get_locality_info (b->node);
+  templ_info *btinfo = templ_hash_map->get (binfo->templ_hash);
+  if ((atinfo && !btinfo)
+      || (atinfo && btinfo && atinfo->order < btinfo->order))
+    return -1;
+  else if ((btinfo && !atinfo)
+          || (atinfo && btinfo && atinfo->order > btinfo->order))
+    return 1;
+  return static_profile_cmp (pa, pb);
+}
+
+/* Helper function for qsort; sort template instantiation types in descending
+   order of their frequency count.  */
+static int
+sort_templ_hashes_cmp (const void *pa, const void *pb)
+{
+  const std::pair<hashval_t, int> *a = static_cast<const std::pair<hashval_t,
+       int> *> (pa);
+  const std::pair<hashval_t, int> *b = static_cast<const std::pair<hashval_t,
+       int> *> (pb);
+  if (a->second < b->second)
+    return 1;
+  else if (a->second > b->second)
+    return -1;
+  else
+    {
+      long long int res = (long long int)a->first - (long long int)b->first;
+      /* Hashes from templ_hash_map cannot be equal.  */
+      gcc_assert (res != 0);
+      return res > 0 ? 1 : -1;
+    }
+}
+
+/* Sort template instantiation types and record the sorted order.  */
+static void
+order_templ_hashes ()
+{
+  auto_vec<std::pair<hashval_t, int>> templ_hashes;
+  hash_freq_map_t::iterator iter = templ_hash_map->begin ();
+  for (; iter != templ_hash_map->end (); ++iter)
+    templ_hashes.safe_push (std::make_pair ((*iter).first,
+                                            (*iter).second.count));
+  templ_hashes.qsort (sort_templ_hashes_cmp);
+  for (unsigned i = 0; i < templ_hashes.length (); i++)
+    {
+      templ_info *tinfo = templ_hash_map->get (templ_hashes[i].first);
+      gcc_assert (tinfo);
+      tinfo->order = i;
+    }
+}
+
 /* Populate locality_info for NODE from its direct callers.  */
 static void
 populate_caller_locality_info (cgraph_node *node, locality_info *info)
@@ -216,13 +370,62 @@ populate_caller_locality_info (cgraph_node *node, 
locality_info *info)
     }
 }
 
+/* Return true if template component is found in the demangled function name in
+   DC.  */
+static bool
+locality_dc_template_p (struct demangle_component *dc)
+{
+  switch (dc->type)
+    {
+    case DEMANGLE_COMPONENT_QUAL_NAME:
+      return locality_dc_template_p (dc->u.s_binary.left);
+    case DEMANGLE_COMPONENT_TEMPLATE:
+      return true;
+    default:
+      return false;
+    }
+  return false;
+}
+
+/* Generate hash for the template type from NODE's name if NODE represents a
+   templated functions.  If present, record in INFO.  */
+
+static void
+populate_templ_info (cgraph_node *node, locality_info *info)
+{
+  if (!info)
+    return;
+  void *alloc = NULL;
+  const char *asm_name = IDENTIFIER_POINTER  (DECL_ASSEMBLER_NAME
+                                             (node->decl));
+  const char *demangled_name = cplus_demangle_v3 (asm_name, 0);
+  struct demangle_component *dc =
+    cplus_demangle_v3_components (asm_name, DMGL_NO_OPTS, &alloc);
+
+  if (demangled_name && dc && locality_dc_template_p (dc))
+    {
+      const char *templ = strchr (demangled_name, '<');
+      hashval_t t_hash = htab_hash_string (templ);
+      info->templ_hash = t_hash;
+
+      bool existed;
+      templ_info &tinfo = templ_hash_map->get_or_insert (t_hash, &existed);
+      if (existed)
+       tinfo.count = tinfo.count + 1;
+      else
+       tinfo.count = 1;
+    }
+  free (alloc);
+}
+
 /* Initialize locality_info for node V.  If CLONE_P is true, V is a locality
    clone; populate callee information for locality clones because caller info
    is needed for cloning decisions and clones are not cloned again.  Populate
    both caller and callee info for non-clone nodes.  */
 
 static inline void
-create_locality_info (cgraph_node *v, bool clone_p = false)
+create_locality_info (cgraph_node *v, bool templ_p = false,
+                     bool clone_p = false)
 {
   locality_info **info = node_to_ch_info->get (v->get_uid ());
   gcc_assert (!info);
@@ -236,6 +439,8 @@ create_locality_info (cgraph_node *v, bool clone_p = false)
     populate_caller_locality_info (v, vinfo);
   populate_callee_locality_info (v, v, vinfo);
   sort_all_callees_default (vinfo);
+  if (templ_p)
+    populate_templ_info (v, vinfo);
 }
 
 /* Return true if NODE is already in some partition.  */
@@ -976,7 +1181,8 @@ static void
 partition_callchain (cgraph_node *node, locality_partition &partition,
                     lto_locality_cloning_model cloning_model,
                     double freq_cutoff, int size, int &cl_num,
-                    int &npartitions, int64_t partition_size)
+                    int &npartitions, int64_t partition_size,
+                    lto_locality_heuristics scheme)
 {
   /* Aliases are added in the same partition as their targets.
      Aliases are not cloned and their callees are not processed separately.  */
@@ -989,6 +1195,8 @@ partition_callchain (cgraph_node *node, locality_partition 
&partition,
      inlined nodes which themselves are already partitioned along with their
      inlined_to nodes.  */
   locality_info *info = get_locality_info (node);
+  if (scheme == LTO_LOCALITY_CPP_TEMPLATE)
+    info->all_callees.qsort (callee_templ_cmp);
   for (unsigned i = 0; i < info->all_callees.length (); i++)
     {
       cgraph_edge *e = info->all_callees[i]->edge;
@@ -1002,7 +1210,8 @@ partition_callchain (cgraph_node *node, 
locality_partition &partition,
              fprintf (dump_file, "Partitioned node: %s\n",
                       n->dump_asm_name ());
              partition_callchain (n, partition, cloning_model, freq_cutoff,
-                                  size, cl_num, npartitions, partition_size);
+                                  size, cl_num, npartitions, partition_size,
+                                  scheme);
            }
          else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING
                   && (!e->callee->alias)
@@ -1042,11 +1251,13 @@ partition_callchain (cgraph_node *node, 
locality_partition &partition,
                                                  cloning_model);
                  if (cl_node)
                    {
-                     create_locality_info (cl_node, true);
+                     create_locality_info (cl_node,
+                                           scheme == LTO_LOCALITY_CPP_TEMPLATE,
+                                           true);
                      add_node_to_partition (partition, cl_node);
                      partition_callchain (cl_node, partition, cloning_model,
                                           freq_cutoff, size, cl_num,
-                                          npartitions, partition_size);
+                                          npartitions, partition_size, scheme);
                    }
                }
            }
@@ -1131,13 +1342,14 @@ locality_determine_ipa_order (auto_vec<locality_order 
*> *order)
    Store the order in ORDER.  */
 
 static bool
-locality_determine_static_order (auto_vec<locality_order *> *order)
+locality_determine_static_order (auto_vec<locality_order *> *order,
+                                lto_locality_heuristics scheme)
 {
   struct cgraph_node *node;
   FOR_EACH_DEFINED_FUNCTION (node)
     if (node->get_partitioning_class () == SYMBOL_PARTITION)
       {
-       create_locality_info (node);
+       create_locality_info (node, scheme == LTO_LOCALITY_CPP_TEMPLATE);
        if (node->no_reorder)
          {
            if (dump_file)
@@ -1157,6 +1369,14 @@ locality_determine_static_order (auto_vec<locality_order 
*> *order)
            order->safe_push (cl);
          }
       }
+
+  if (scheme == LTO_LOCALITY_CPP_TEMPLATE)
+    {
+      order_templ_hashes ();
+      order->qsort (static_profile_templ_cmp);
+      return true;
+    }
+
   order->qsort (static_profile_cmp);
   return true;
 }
@@ -1172,6 +1392,7 @@ locality_determine_static_order (auto_vec<locality_order 
*> *order)
 static void
 locality_partition_and_clone (int max_locality_partition_size,
                              lto_locality_cloning_model cloning_model,
+                             lto_locality_heuristics scheme,
                              int freq_denominator, int size)
 {
   locality_partition partition;
@@ -1192,7 +1413,7 @@ locality_partition_and_clone (int 
max_locality_partition_size,
   if (n && n->count.ipa_p ())
     order_p = locality_determine_ipa_order (&order);
   else
-    order_p = locality_determine_static_order (&order);
+    order_p = locality_determine_static_order (&order, scheme);
   if (!order_p)
     {
       if (dump_file)
@@ -1225,7 +1446,7 @@ locality_partition_and_clone (int 
max_locality_partition_size,
        fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ());
 
       partition_callchain (node, partition, cloning_model, real_freq, size,
-                          cl_num, npartitions, partition_size);
+                          cl_num, npartitions, partition_size, scheme);
     }
 
   for (unsigned i = 0; i < order.length (); i++)
@@ -1247,9 +1468,11 @@ lc_execute (void)
     ();
   node_to_clone = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> ();
   clone_to_node = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> ();
+  templ_hash_map = std::make_unique<hash_freq_map_t> ();
 
   locality_partition_and_clone (param_max_locality_partition_size,
                                flag_lto_locality_cloning,
+                               flag_lto_locality_heuristics,
                                param_lto_locality_frequency,
                                param_lto_locality_size);
 
diff --git a/gcc/params.opt b/gcc/params.opt
index 2cd7717f6fa7..c35216f03698 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -520,6 +520,16 @@ Size cut-off for callee including inlined calls to be 
cloned for a particular ca
 Common Joined UInteger Var(param_max_locality_partition_size) Init(1000000) 
Param
 Maximal size of a locality partition for LTO (in estimated instructions). 
Value of 0 results in default value being used.
 
+Enum
+Name(lto_locality_heuristics) Type(enum lto_locality_heuristics) 
UnknownError(unknown LTO locality partitioning heuristics %qs)
+
+EnumValue
+Enum(lto_locality_heuristics) String(cpp_template) 
Value(LTO_LOCALITY_CPP_TEMPLATE)
+
+-param=lto-partition-locality-heuristics=
+Common Joined RejectNegative Enum(lto_locality_heuristics) 
Var(flag_lto_locality_heuristics) Init(LTO_LOCALITY_DEFAULT_HEURISTIC) 
Optimization
+Choose static heuristics for locality layout. Option cpp_template groups by 
C++ template instantiation type.
+
 -param=max-average-unrolled-insns=
 Common Joined UInteger Var(param_max_average_unrolled_insns) Init(80) Param 
Optimization
 The maximum number of instructions to consider to unroll in a loop on average.

Reply via email to