https://gcc.gnu.org/g:348c623d7dc43caafa9ae7878133df4e12edf54b
commit r16-6920-g348c623d7dc43caafa9ae7878133df4e12edf54b Author: Prachi Godbole <[email protected]> Date: Mon Jan 19 20:49:38 2026 -0800 ipa-reorder-for-locality - Introduce C++ template heuristics This patch introduces a new heuristics for reordering functions, to be used in the absense of profile information. This approach uses C++ template instantiation types to group functions together. Entry functions are sorted in the beginning, and callees are sorted as part of partition_callchain (). Bootstrapped and tested on aarch64-none-linux-gnu. Signed-off-by: Prachi Godbole <[email protected]> config/ChangeLog: * bootstrap-lto-locality-cpp-template.mk: New file. gcc/ChangeLog: * flag-types.h (enum lto_locality_heuristics): New enum. * ipa-locality-cloning.cc (struct templ_info): New struct. (struct locality_info): Add templ_info field. (templ_hash_map): New hash_map. (callee_templ_cmp): Ditto. (static_profile_templ_cmp): Ditto. (sort_templ_hashes_cmp): Ditto. (order_templ_hashes): Ditto. (locality_dc_template_p): Ditto. (populate_templ_info): Ditto. (create_locality_info): Call populate_templ_info. (partition_callchain): Call callee_templ_cmp. (locality_determine_static_order): Populate and sort templ_hash_map. (locality_partition_and_clone): Handle lto_locality_heuristics. (lc_execute): Initialize templ_hash_map. * params.opt: New param. Diff: --- config/bootstrap-lto-locality-cpp-template.mk | 22 +++ gcc/flag-types.h | 6 + gcc/ipa-locality-cloning.cc | 243 ++++++++++++++++++++++++-- gcc/params.opt | 10 ++ 4 files changed, 271 insertions(+), 10 deletions(-) diff --git a/config/bootstrap-lto-locality-cpp-template.mk b/config/bootstrap-lto-locality-cpp-template.mk new file mode 100644 index 000000000000..8cf804765dc9 --- /dev/null +++ b/config/bootstrap-lto-locality-cpp-template.mk @@ -0,0 +1,22 @@ +# This option enables LTO and locality partitioning for stage2 and stage3 in slim mode + +STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality \ + --param=lto-partition-locality-heuristics=cpp_template +STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality \ + --param=lto-partition-locality-heuristics=cpp_template +STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1 +STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1 +STAGEfeedback_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality + +# assumes the host supports the linker plugin +LTO_AR = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ar$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/ +LTO_RANLIB = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ranlib$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/ +LTO_NM = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-nm$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/ + +LTO_EXPORTS = AR="$(LTO_AR)"; export AR; \ + RANLIB="$(LTO_RANLIB)"; export RANLIB; \ + NM="$(LTO_NM)"; export NM; +LTO_FLAGS_TO_PASS = AR="$(LTO_AR)" RANLIB="$(LTO_RANLIB)" NM="$(LTO_NM)" + +do-compare = $(SHELL) $(srcdir)/contrib/compare-lto $$f1 $$f2 +extra-compare = gcc/lto1$(exeext) diff --git a/gcc/flag-types.h b/gcc/flag-types.h index 4d2f08c43992..77dc1f658fbd 100644 --- a/gcc/flag-types.h +++ b/gcc/flag-types.h @@ -422,6 +422,12 @@ enum lto_locality_cloning_model { LTO_LOCALITY_MAXIMAL_CLONING = 2, }; +/* flag_lto_locality_heuristics initialization values. */ +enum lto_locality_heuristics { + LTO_LOCALITY_DEFAULT_HEURISTIC = 0, + LTO_LOCALITY_CPP_TEMPLATE = 1, +}; + /* flag_lto_linker_output initialization values. */ enum lto_linker_output { LTO_LINKER_OUTPUT_UNKNOWN, diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc index fa461518da43..283f76d048bb 100644 --- a/gcc/ipa-locality-cloning.cc +++ b/gcc/ipa-locality-cloning.cc @@ -43,7 +43,15 @@ along with GCC; see the file COPYING3. If not see Create a local clone in current partition, if cloning criteria is satisfied. 3. Redirect any new caller to a local clone if one exists. - Partition size is param controlled to fine tune per program behavior. */ + + Another heuristics can be used in the absense of profile information. This + approach uses C++ template instantiation types to group functions together. + Entry functions are sorted in the beginning by most used template types, and + callees are sorted as part of partition_callchain (), again by template + types. + + For bothe schemes, partition size is param controlled to fine tune per + program behavior. */ #include "config.h" #define INCLUDE_ALGORITHM @@ -65,6 +73,7 @@ along with GCC; see the file COPYING3. If not see #include "ipa-modref-tree.h" #include "ipa-modref.h" #include "symtab-clones.h" +#include "demangle.h" #include "ipa-locality-cloning.h" /* Locality partitions, assigns nodes to partitions. These are used later in @@ -88,6 +97,14 @@ struct locality_order {} }; +/* Data structure to hold information about unique template instantiations. */ +struct templ_info +{ + hashval_t templ_hash; + int count; + int order; +}; + /* Data structure to hold accumulated edge frequencies for unique callees. Frequencies of aliased callees are accumulated with the ultimate target alias represented by ultimate_callee. @@ -114,9 +131,13 @@ struct locality_info /* Accumulated node->callee edge frequencies for unique callees. */ hash_map<loc_map_hash, locality_callee_info> callee_info; + /* Template instantiation info. */ + hashval_t templ_hash; + locality_info () { all_callees.create (1); + templ_hash = 0; } ~locality_info () { @@ -124,6 +145,10 @@ struct locality_info } }; +/* Template instantiation hash_map. */ +typedef hash_map<int_hash<hashval_t, 0>, templ_info> hash_freq_map_t; +static std::unique_ptr<hash_freq_map_t> templ_hash_map; + /* Pool allocation for locality_info. */ static object_allocator<locality_info> loc_infos ("IPA locality callchain"); static @@ -157,6 +182,72 @@ callee_default_cmp (const void *pa, const void *pb) return compare_node_uids (a->ultimate_callee, b->ultimate_callee); } +/* Sort all_callees of NODE by template hashes and edge hotness. + + all_callees is already sorted by call frequency. If templ_hash of the + caller is same as a callee's templ_hash then that callee is ordered first. + Callees having same templ_hash are sorted by call frequency, callees with + different templ_hash are sorted by the template order. + + if (Both PA and PB have templates && PA's template == PB'template) || + (Both A and B don't have templates): sort by frequency + + if (PA doesn't have template && PB has) || (Both have different templates && + caller's template == PB's template): PB before PA + + if (PA has template && PB doesn't) || (Both have different templates && + caller's template == PA's template): PA before PB + + if (Both have different templates && both are different than caller's + template: sort by template order. */ +static int +callee_templ_cmp (const void *pa, const void *pb) +{ + const locality_callee_info *a = *static_cast<const locality_callee_info + *const *> (pa); + const locality_callee_info *b = *static_cast<const locality_callee_info + *const *> (pb); + + locality_info *caller_info = get_locality_info (a->ultimate_caller); + hashval_t caller_hash = 0; + if (caller_info) + caller_hash = caller_info->templ_hash; + + locality_info *ainfo = get_locality_info (a->edge->callee); + locality_info *binfo = get_locality_info (b->edge->callee); + templ_info *atinfo = NULL, *btinfo = NULL; + hashval_t a_hash = 0, b_hash = 0; + if (ainfo) + { + a_hash = ainfo->templ_hash; + atinfo = templ_hash_map->get (a_hash); + } + if (binfo) + { + b_hash = binfo->templ_hash; + btinfo = templ_hash_map->get (b_hash); + } + + if (a_hash == b_hash) + { + /* We get here if both have same template type or both have 0 as hash. */ + return callee_default_cmp (pa, pb); + } + else if (a_hash && (!b_hash || a_hash == caller_hash)) + return -1; + else if (b_hash && (!a_hash || b_hash == caller_hash)) + return 1; + else + { + gcc_assert (atinfo && btinfo); + if (atinfo->order < btinfo->order) + return -1; + /* Order being same is already handled above. */ + return 1; + } + return callee_default_cmp (pa, pb); +} + /* Sort all_callees in INFO by edge hotness. */ static void sort_all_callees_default (locality_info *info) @@ -198,6 +289,69 @@ populate_callee_locality_info (cgraph_node *node, cgraph_node *n, } } +/* Forward declaration. */ +static int static_profile_cmp (const void *pa, const void *pb); + +/* Helper function for qsort; sort nodes in the descending order derived of + template instantiation types by frequency count. */ +static int +static_profile_templ_cmp (const void *pa, const void *pb) +{ + const locality_order *a = *static_cast<const locality_order *const *> (pa); + const locality_order *b = *static_cast<const locality_order *const *> (pb); + locality_info *ainfo = get_locality_info (a->node); + templ_info *atinfo = templ_hash_map->get (ainfo->templ_hash); + locality_info *binfo = get_locality_info (b->node); + templ_info *btinfo = templ_hash_map->get (binfo->templ_hash); + if ((atinfo && !btinfo) + || (atinfo && btinfo && atinfo->order < btinfo->order)) + return -1; + else if ((btinfo && !atinfo) + || (atinfo && btinfo && atinfo->order > btinfo->order)) + return 1; + return static_profile_cmp (pa, pb); +} + +/* Helper function for qsort; sort template instantiation types in descending + order of their frequency count. */ +static int +sort_templ_hashes_cmp (const void *pa, const void *pb) +{ + const std::pair<hashval_t, int> *a = static_cast<const std::pair<hashval_t, + int> *> (pa); + const std::pair<hashval_t, int> *b = static_cast<const std::pair<hashval_t, + int> *> (pb); + if (a->second < b->second) + return 1; + else if (a->second > b->second) + return -1; + else + { + long long int res = (long long int)a->first - (long long int)b->first; + /* Hashes from templ_hash_map cannot be equal. */ + gcc_assert (res != 0); + return res > 0 ? 1 : -1; + } +} + +/* Sort template instantiation types and record the sorted order. */ +static void +order_templ_hashes () +{ + auto_vec<std::pair<hashval_t, int>> templ_hashes; + hash_freq_map_t::iterator iter = templ_hash_map->begin (); + for (; iter != templ_hash_map->end (); ++iter) + templ_hashes.safe_push (std::make_pair ((*iter).first, + (*iter).second.count)); + templ_hashes.qsort (sort_templ_hashes_cmp); + for (unsigned i = 0; i < templ_hashes.length (); i++) + { + templ_info *tinfo = templ_hash_map->get (templ_hashes[i].first); + gcc_assert (tinfo); + tinfo->order = i; + } +} + /* Populate locality_info for NODE from its direct callers. */ static void populate_caller_locality_info (cgraph_node *node, locality_info *info) @@ -216,13 +370,62 @@ populate_caller_locality_info (cgraph_node *node, locality_info *info) } } +/* Return true if template component is found in the demangled function name in + DC. */ +static bool +locality_dc_template_p (struct demangle_component *dc) +{ + switch (dc->type) + { + case DEMANGLE_COMPONENT_QUAL_NAME: + return locality_dc_template_p (dc->u.s_binary.left); + case DEMANGLE_COMPONENT_TEMPLATE: + return true; + default: + return false; + } + return false; +} + +/* Generate hash for the template type from NODE's name if NODE represents a + templated functions. If present, record in INFO. */ + +static void +populate_templ_info (cgraph_node *node, locality_info *info) +{ + if (!info) + return; + void *alloc = NULL; + const char *asm_name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME + (node->decl)); + const char *demangled_name = cplus_demangle_v3 (asm_name, 0); + struct demangle_component *dc = + cplus_demangle_v3_components (asm_name, DMGL_NO_OPTS, &alloc); + + if (demangled_name && dc && locality_dc_template_p (dc)) + { + const char *templ = strchr (demangled_name, '<'); + hashval_t t_hash = htab_hash_string (templ); + info->templ_hash = t_hash; + + bool existed; + templ_info &tinfo = templ_hash_map->get_or_insert (t_hash, &existed); + if (existed) + tinfo.count = tinfo.count + 1; + else + tinfo.count = 1; + } + free (alloc); +} + /* Initialize locality_info for node V. If CLONE_P is true, V is a locality clone; populate callee information for locality clones because caller info is needed for cloning decisions and clones are not cloned again. Populate both caller and callee info for non-clone nodes. */ static inline void -create_locality_info (cgraph_node *v, bool clone_p = false) +create_locality_info (cgraph_node *v, bool templ_p = false, + bool clone_p = false) { locality_info **info = node_to_ch_info->get (v->get_uid ()); gcc_assert (!info); @@ -236,6 +439,8 @@ create_locality_info (cgraph_node *v, bool clone_p = false) populate_caller_locality_info (v, vinfo); populate_callee_locality_info (v, v, vinfo); sort_all_callees_default (vinfo); + if (templ_p) + populate_templ_info (v, vinfo); } /* Return true if NODE is already in some partition. */ @@ -976,7 +1181,8 @@ static void partition_callchain (cgraph_node *node, locality_partition &partition, lto_locality_cloning_model cloning_model, double freq_cutoff, int size, int &cl_num, - int &npartitions, int64_t partition_size) + int &npartitions, int64_t partition_size, + lto_locality_heuristics scheme) { /* Aliases are added in the same partition as their targets. Aliases are not cloned and their callees are not processed separately. */ @@ -989,6 +1195,8 @@ partition_callchain (cgraph_node *node, locality_partition &partition, inlined nodes which themselves are already partitioned along with their inlined_to nodes. */ locality_info *info = get_locality_info (node); + if (scheme == LTO_LOCALITY_CPP_TEMPLATE) + info->all_callees.qsort (callee_templ_cmp); for (unsigned i = 0; i < info->all_callees.length (); i++) { cgraph_edge *e = info->all_callees[i]->edge; @@ -1002,7 +1210,8 @@ partition_callchain (cgraph_node *node, locality_partition &partition, fprintf (dump_file, "Partitioned node: %s\n", n->dump_asm_name ()); partition_callchain (n, partition, cloning_model, freq_cutoff, - size, cl_num, npartitions, partition_size); + size, cl_num, npartitions, partition_size, + scheme); } else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING && (!e->callee->alias) @@ -1042,11 +1251,13 @@ partition_callchain (cgraph_node *node, locality_partition &partition, cloning_model); if (cl_node) { - create_locality_info (cl_node, true); + create_locality_info (cl_node, + scheme == LTO_LOCALITY_CPP_TEMPLATE, + true); add_node_to_partition (partition, cl_node); partition_callchain (cl_node, partition, cloning_model, freq_cutoff, size, cl_num, - npartitions, partition_size); + npartitions, partition_size, scheme); } } } @@ -1131,13 +1342,14 @@ locality_determine_ipa_order (auto_vec<locality_order *> *order) Store the order in ORDER. */ static bool -locality_determine_static_order (auto_vec<locality_order *> *order) +locality_determine_static_order (auto_vec<locality_order *> *order, + lto_locality_heuristics scheme) { struct cgraph_node *node; FOR_EACH_DEFINED_FUNCTION (node) if (node->get_partitioning_class () == SYMBOL_PARTITION) { - create_locality_info (node); + create_locality_info (node, scheme == LTO_LOCALITY_CPP_TEMPLATE); if (node->no_reorder) { if (dump_file) @@ -1157,6 +1369,14 @@ locality_determine_static_order (auto_vec<locality_order *> *order) order->safe_push (cl); } } + + if (scheme == LTO_LOCALITY_CPP_TEMPLATE) + { + order_templ_hashes (); + order->qsort (static_profile_templ_cmp); + return true; + } + order->qsort (static_profile_cmp); return true; } @@ -1172,6 +1392,7 @@ locality_determine_static_order (auto_vec<locality_order *> *order) static void locality_partition_and_clone (int max_locality_partition_size, lto_locality_cloning_model cloning_model, + lto_locality_heuristics scheme, int freq_denominator, int size) { locality_partition partition; @@ -1192,7 +1413,7 @@ locality_partition_and_clone (int max_locality_partition_size, if (n && n->count.ipa_p ()) order_p = locality_determine_ipa_order (&order); else - order_p = locality_determine_static_order (&order); + order_p = locality_determine_static_order (&order, scheme); if (!order_p) { if (dump_file) @@ -1225,7 +1446,7 @@ locality_partition_and_clone (int max_locality_partition_size, fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ()); partition_callchain (node, partition, cloning_model, real_freq, size, - cl_num, npartitions, partition_size); + cl_num, npartitions, partition_size, scheme); } for (unsigned i = 0; i < order.length (); i++) @@ -1247,9 +1468,11 @@ lc_execute (void) (); node_to_clone = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> (); clone_to_node = std::make_unique<hash_map<loc_map_hash, cgraph_node *>> (); + templ_hash_map = std::make_unique<hash_freq_map_t> (); locality_partition_and_clone (param_max_locality_partition_size, flag_lto_locality_cloning, + flag_lto_locality_heuristics, param_lto_locality_frequency, param_lto_locality_size); diff --git a/gcc/params.opt b/gcc/params.opt index 2cd7717f6fa7..c35216f03698 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -520,6 +520,16 @@ Size cut-off for callee including inlined calls to be cloned for a particular ca Common Joined UInteger Var(param_max_locality_partition_size) Init(1000000) Param Maximal size of a locality partition for LTO (in estimated instructions). Value of 0 results in default value being used. +Enum +Name(lto_locality_heuristics) Type(enum lto_locality_heuristics) UnknownError(unknown LTO locality partitioning heuristics %qs) + +EnumValue +Enum(lto_locality_heuristics) String(cpp_template) Value(LTO_LOCALITY_CPP_TEMPLATE) + +-param=lto-partition-locality-heuristics= +Common Joined RejectNegative Enum(lto_locality_heuristics) Var(flag_lto_locality_heuristics) Init(LTO_LOCALITY_DEFAULT_HEURISTIC) Optimization +Choose static heuristics for locality layout. Option cpp_template groups by C++ template instantiation type. + -param=max-average-unrolled-insns= Common Joined UInteger Var(param_max_average_unrolled_insns) Init(80) Param Optimization The maximum number of instructions to consider to unroll in a loop on average.
