Richard Biener <rguent...@suse.de> writes:
> We currently fail to re-CSE SLP nodes after optimizing permutes
> which results in inaccurate cost estimates.  For gcc.dg/vect/bb-slp-32.c
> this shows in not re-using the SLP node with the load and arithmetic
> for both the store and the reduction.  The following implements
> CSE by re-bst-mapping nodes as a finalization step of vect_optimize_slp.
>
> I've tried to make the CSE part of permute materialization but it
> isn't a very good fit there.  I've not bothered to implement something
> more complete, also handling external defs or defs without
> SLP_TREE_SCALAR_STMTS.
>
> I realize this might result in more BB SLP which in turn might slow
> down code given costing for BB SLP is difficult (even the fact that we
> now vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might not be a good
> idea).  This is nevertheless feeding more accurate info to costing,
> which is good.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Does this look OK?
>
> Thanks,
> Richard.
>
> 	PR tree-optimization/114413
> 	* tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
> 	New function, split out from ...
> 	(vect_analyze_slp): ... here.  Call it.
> 	(vect_cse_slp_nodes): New function.
> 	(vect_optimize_slp): Call it.
>
> 	* gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
>  gcc/tree-vect-slp.cc                  | 77 ++++++++++++++++++++++-----
>  2 files changed, 71 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> index 4f72727b694..475b241c36e 100644
> --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> @@ -38,3 +38,9 @@ int main()
>        abort ();
>    return 0;
>  }
> +
> +/* This is a weak test but we want to re-use the arithmetic for both the
> +   store and the reduction.  */
> +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { x86_64-*-* i?86-*-* } } } } */
> +/* On i386 we vectorize both the store and the reduction.  */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 "slp2" { target { x86_64-*-* i?86-*-* } } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 2552dacbd69..980d1e7267d 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type candidate)
>    return true;
>  }
>  
> +typedef hash_map <vec <stmt_vec_info>, slp_tree,
> +		  simple_hashmap_traits <bst_traits, slp_tree> >
> +  scalar_stmts_to_slp_tree_map_t;
> +
> +/* Release BST_MAP.  */
> +
> +static void
> +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
> +{
> +  /* The map keeps a reference on SLP nodes built, release that.  */
> +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> +       it != bst_map->end (); ++it)
> +    if ((*it).second)
> +      vect_free_slp_tree ((*it).second);
> +  delete bst_map;
> +}
> +
>  /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
>     but then vec::insert does memmove and that's not compatible with
>     std::pair.  */
> @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
>      }
>  }
>  
> -typedef hash_map <vec <stmt_vec_info>, slp_tree,
> -		  simple_hashmap_traits <bst_traits, slp_tree> >
> -  scalar_stmts_to_slp_tree_map_t;
> -
>  static slp_tree
>  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
> 		       vec<stmt_vec_info> stmts, unsigned int group_size,
> @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
>  	}
>      }
>  
> -
> -
> -  /* The map keeps a reference on SLP nodes built, release that.  */
> -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> -       it != bst_map->end (); ++it)
> -    if ((*it).second)
> -      vect_free_slp_tree ((*it).second);
> -  delete bst_map;
> +  release_scalar_stmts_to_slp_tree_map (bst_map);
>  
>    if (pattern_found && dump_enabled_p ())
>      {
> @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
>    free_graph (m_slpg);
>  }
>  
> +/* Apply CSE to NODE and its children using BST_MAP.  */
> +
> +static void
> +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
> +{
> +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
> +      && SLP_TREE_CODE (node) != VEC_PERM_EXPR
> +      /* Besides some VEC_PERM_EXPR, two-operator nodes also
> +	 lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
> +	 we'd have sth that works for all internal and external nodes.  */
> +      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
> +    {
> +      if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
> +	{
> +	  if (*leader != node)
> +	    {
> +	      if (dump_enabled_p ())
> +		dump_printf_loc (MSG_NOTE, vect_location,
> +				 "re-using SLP tree %p for %p\n",
> +				 (void *)*leader, (void *)node);
> +	      vect_free_slp_tree (node);
> +	      (*leader)->refcnt += 1;
> +	      node = *leader;
> +	    }
> +	  return;
> +	}
> +
> +      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
> +      node->refcnt += 1;
Since we're not starting with zeroed refcnts, won't this end up double
counting the references?

> +      /* And recurse.  */
> +    }
> +
> +  for (slp_tree &child : SLP_TREE_CHILDREN (node))
> +    if (child)
> +      vect_cse_slp_nodes (bst_map, child);
> +}
> +
>  /* Optimize the SLP graph of VINFO.  */
>  
>  void
> @@ -6381,6 +6424,16 @@ vect_optimize_slp (vec_info *vinfo)
>    if (vinfo->slp_instances.is_empty ())
>      return;
>    vect_optimize_slp_pass (vinfo).run ();
> +
> +

Super minor nit, but: one newline too many :)

LGTM otherwise FWIW.

Thanks,
Richard

> +  /* Apply CSE again to nodes after permute optimization.  */
> +  scalar_stmts_to_slp_tree_map_t *bst_map
> +    = new scalar_stmts_to_slp_tree_map_t ();
> +
> +  for (auto inst : vinfo->slp_instances)
> +    vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
> +
> +  release_scalar_stmts_to_slp_tree_map (bst_map);
>  }
>  
>  /* Gather loads reachable from the individual SLP graph entries.  */
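
To make the ownership model the patch appears to rely on easier to follow,
here is a minimal self-contained sketch of the same CSE-over-a-map pattern
with reference counting.  It is not GCC code and all names (toy_node,
toy_cse, toy_release, toy_release_map) are hypothetical; it only mirrors the
shape of vect_cse_slp_nodes and release_scalar_stmts_to_slp_tree_map above:
the map owns one reference per recorded leader, and a parent redirected to a
leader drops its reference to the duplicate and takes one on the leader.

// Toy model (hypothetical names, not GCC internals) of CSE over a
// key -> node map with reference counting.

#include <map>
#include <string>
#include <vector>

struct toy_node
{
  std::string key;                  // stands in for SLP_TREE_SCALAR_STMTS
  std::vector<toy_node *> children; // stands in for SLP_TREE_CHILDREN
  unsigned refcnt = 1;              // creator/parent holds the initial reference
};

// Counterpart of vect_free_slp_tree: drop one reference, free at zero.
static void
toy_release (toy_node *n)
{
  if (--n->refcnt == 0)
    {
      for (toy_node *c : n->children)
	if (c)
	  toy_release (c);
      delete n;
    }
}

// Counterpart of vect_cse_slp_nodes: redirect NODE to an existing leader
// with the same key, or record NODE as the leader and recurse.
static void
toy_cse (std::map<std::string, toy_node *> &bst_map, toy_node *&node)
{
  auto it = bst_map.find (node->key);
  if (it != bst_map.end ())
    {
      if (it->second != node)
	{
	  toy_release (node);       // parent drops its reference to the duplicate
	  it->second->refcnt += 1;  // ... and takes one on the leader
	  node = it->second;
	}
      return;
    }
  bst_map[node->key] = node;
  node->refcnt += 1;                // the map's own reference, dropped below
  for (toy_node *&child : node->children)
    if (child)
      toy_cse (bst_map, child);
}

// Counterpart of release_scalar_stmts_to_slp_tree_map: drop the map's
// references once CSE is done.
static void
toy_release_map (std::map<std::string, toy_node *> &bst_map)
{
  for (auto &entry : bst_map)
    toy_release (entry.second);
  bst_map.clear ();
}

The sketch does not settle the double-counting question raised above about
the real SLP nodes' pre-existing refcnts; it only lays out the bookkeeping
that the patch's refcnt increments and releases seem intended to follow.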