Richard Biener <rguent...@suse.de> writes:
> On Wed, 19 Jun 2024, Richard Sandiford wrote:
>
>> Richard Biener <rguent...@suse.de> writes:
>> > We currently fail to re-CSE SLP nodes after optimizing permutes
>> > which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
>> > this shows in not re-using the SLP node with the load and arithmetic
>> > for both the store and the reduction.  The following implements
>> > CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
>> >
>> > I've tried to make the CSE part of permute materialization but it
>> > isn't a very good fit there.  I've not bothered to implement something
>> > more complete, also handling external defs or defs without
>> > SLP_TREE_SCALAR_STMTS.
>> >
>> > I realize this might result in more BB SLP which in turn might slow
>> > down code given costing for BB SLP is difficult (even that we now
>> > vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
>> > This is nevertheless feeding more accurate info to costing which is
>> > good.
>> >
>> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
>> >
>> > Does this look OK?
>> >
>> > Thanks,
>> > Richard.
>> >
>> >    PR tree-optimization/114413
>> >    * tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
>> >    New function, split out from ...
>> >    (vect_analyze_slp): ... here.  Call it.
>> >    (vect_cse_slp_nodes): New function.
>> >    (vect_optimize_slp): Call it.
>> >
>> >    * gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
>> > ---
>> >  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
>> >  gcc/tree-vect-slp.cc                  | 77 ++++++++++++++++++++++-----
>> >  2 files changed, 71 insertions(+), 12 deletions(-)
>> >
>> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c 
>> > b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
>> > index 4f72727b694..475b241c36e 100644
>> > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
>> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
>> > @@ -38,3 +38,9 @@ int main()
>> >      abort ();
>> >    return 0;
>> >  }
>> > +
>> > +/* This is a weak test but we want to re-use the arithmetic for both the
>> > +   store and the reduction.  */
>> > +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { 
>> > x86_64-*-* i?86-*-* } } } } */
>> > +/* On i386 we vectorize both the store and the reduction.  */
>> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 
>> > "slp2" { target { x86_64-*-* i?86-*-* } } } } */
>> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
>> > index 2552dacbd69..980d1e7267d 100644
>> > --- a/gcc/tree-vect-slp.cc
>> > +++ b/gcc/tree-vect-slp.cc
>> > @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type 
>> > candidate)
>> >    return true;
>> >  }
>> >  
>> > +typedef hash_map <vec <stmt_vec_info>, slp_tree,
>> > +            simple_hashmap_traits <bst_traits, slp_tree> >
>> > +  scalar_stmts_to_slp_tree_map_t;
>> > +
>> > +/* Release BST_MAP.  */
>> > +
>> > +static void
>> > +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t 
>> > *bst_map)
>> > +{
>> > +  /* The map keeps a reference on SLP nodes built, release that.  */
>> > +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
>> > +       it != bst_map->end (); ++it)
>> > +    if ((*it).second)
>> > +      vect_free_slp_tree ((*it).second);
>> > +  delete bst_map;
>> > +}
>> > +
>> >  /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
>> >     but then vec::insert does memmove and that's not compatible with
>> >     std::pair.  */
>> > @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
>> >      }
>> >  }
>> >  
>> > -typedef hash_map <vec <stmt_vec_info>, slp_tree,
>> > -            simple_hashmap_traits <bst_traits, slp_tree> >
>> > -  scalar_stmts_to_slp_tree_map_t;
>> > -
>> >  static slp_tree
>> >  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>> >                   vec<stmt_vec_info> stmts, unsigned int group_size,
>> > @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
>> > max_tree_size)
>> >    }
>> >      }
>> >  
>> > -
>> > -
>> > -  /* The map keeps a reference on SLP nodes built, release that.  */
>> > -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
>> > -       it != bst_map->end (); ++it)
>> > -    if ((*it).second)
>> > -      vect_free_slp_tree ((*it).second);
>> > -  delete bst_map;
>> > +  release_scalar_stmts_to_slp_tree_map (bst_map);
>> >  
>> >    if (pattern_found && dump_enabled_p ())
>> >      {
>> > @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
>> >    free_graph (m_slpg);
>> >  }
>> >  
>> > +/* Apply CSE to NODE and its children using BST_MAP.  */
>> > +
>> > +static void
>> > +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& 
>> > node)
>> > +{
>> > +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
>> > +      && SLP_TREE_CODE (node) != VEC_PERM_EXPR
>> > +      /* Besides some VEC_PERM_EXPR, two-operator nodes also
>> > +   lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
>> > +   we'd have sth that works for all internal and external nodes.  */
>> > +      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
>> > +    {
>> > +      if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
>> > +  {
>> > +    if (*leader != node)
>> > +      {
>> > +        if (dump_enabled_p ())
>> > +          dump_printf_loc (MSG_NOTE, vect_location,
>> > +                           "re-using SLP tree %p for %p\n",
>> > +                           (void *)*leader, (void *)node);
>> > +        vect_free_slp_tree (node);
>> > +        (*leader)->refcnt += 1;
>> > +        node = *leader;
>> > +      }
>> > +    return;
>> > +  }
>> > +
>> > +      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
>> > +      node->refcnt += 1;
>> 
>> Since we're not starting with zeroed refcnts, won't this end up double
>> counting the references?
>
> This is for the reference bst_map holds, we're releasing that when
> destroying bst_map.  I copied this from other bst-map populating
> code (and release_scalar_stmts_to_slp_tree_map performs the
> corresponding decrement).

Ah, ok, thanks!

Reply via email to