Richard Biener <rguent...@suse.de> writes:
> We currently fail to re-CSE SLP nodes after optimizing permutes
> which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
> this shows in not re-using the SLP node with the load and arithmetic
> for both the store and the reduction.  The following implements
> CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
>
> I've tried to make the CSE part of permute materialization but it
> isn't a very good fit there.  I've not bothered to implement something
> more complete, also handling external defs or defs without
> SLP_TREE_SCALAR_STMTS.
>
> I realize this might result in more BB SLP which in turn might slow
> down code given costing for BB SLP is difficult (even that we now
> vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
> This is nevertheless feeding more accurate info to costing which is
> good.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Does this look OK?
>
> Thanks,
> Richard.
>
>       PR tree-optimization/114413
>       * tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
>       New function, split out from ...
>       (vect_analyze_slp): ... here.  Call it.
>       (vect_cse_slp_nodes): New function.
>       (vect_optimize_slp): Call it.
>
>       * gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
>  gcc/tree-vect-slp.cc                  | 77 ++++++++++++++++++++++-----
>  2 files changed, 71 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> index 4f72727b694..475b241c36e 100644
> --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> @@ -38,3 +38,9 @@ int main()
>      abort ();
>    return 0;
>  }
> +
> +/* This is a weak test but we want to re-use the arithmetic for both the
> +   store and the reduction.  */
> +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { 
> x86_64-*-* i?86-*-* } } } } */
> +/* On i386 we vectorize both the store and the reduction.  */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 "slp2" 
> { target { x86_64-*-* i?86-*-* } } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 2552dacbd69..980d1e7267d 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type 
> candidate)
>    return true;
>  }
>  
> +typedef hash_map <vec <stmt_vec_info>, slp_tree,
> +               simple_hashmap_traits <bst_traits, slp_tree> >
> +  scalar_stmts_to_slp_tree_map_t;
> +
> +/* Release BST_MAP.  */
> +
> +static void
> +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t 
> *bst_map)
> +{
> +  /* The map keeps a reference on SLP nodes built, release that.  */
> +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> +       it != bst_map->end (); ++it)
> +    if ((*it).second)
> +      vect_free_slp_tree ((*it).second);
> +  delete bst_map;
> +}
> +
>  /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
>     but then vec::insert does memmove and that's not compatible with
>     std::pair.  */
> @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
>      }
>  }
>  
> -typedef hash_map <vec <stmt_vec_info>, slp_tree,
> -               simple_hashmap_traits <bst_traits, slp_tree> >
> -  scalar_stmts_to_slp_tree_map_t;
> -
>  static slp_tree
>  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>                      vec<stmt_vec_info> stmts, unsigned int group_size,
> @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
> max_tree_size)
>       }
>      }
>  
> -
> -
> -  /* The map keeps a reference on SLP nodes built, release that.  */
> -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> -       it != bst_map->end (); ++it)
> -    if ((*it).second)
> -      vect_free_slp_tree ((*it).second);
> -  delete bst_map;
> +  release_scalar_stmts_to_slp_tree_map (bst_map);
>  
>    if (pattern_found && dump_enabled_p ())
>      {
> @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
>    free_graph (m_slpg);
>  }
>  
> +/* Apply CSE to NODE and its children using BST_MAP.  */
> +
> +static void
> +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
> +{
> +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
> +      && SLP_TREE_CODE (node) != VEC_PERM_EXPR
> +      /* Besides some VEC_PERM_EXPR, two-operator nodes also
> +      lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
> +      we'd have sth that works for all internal and external nodes.  */
> +      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
> +    {
> +      if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
> +     {
> +       if (*leader != node)
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_NOTE, vect_location,
> +                              "re-using SLP tree %p for %p\n",
> +                              (void *)*leader, (void *)node);
> +           vect_free_slp_tree (node);
> +           (*leader)->refcnt += 1;
> +           node = *leader;
> +         }
> +       return;
> +     }
> +
> +      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
> +      node->refcnt += 1;

Since we're not starting with zeroed refcnts, won't this end up double
counting the references?

> +      /* And recurse.  */
> +    }
> +
> +  for (slp_tree &child : SLP_TREE_CHILDREN (node))
> +    if (child)
> +      vect_cse_slp_nodes (bst_map, child);
> +}
> +
>  /* Optimize the SLP graph of VINFO.  */
>  
>  void
> @@ -6381,6 +6424,16 @@ vect_optimize_slp (vec_info *vinfo)
>    if (vinfo->slp_instances.is_empty ())
>      return;
>    vect_optimize_slp_pass (vinfo).run ();
> +
> +

Super minor nit, but: one newline too many :)

LGTM otherwise FWIW.

Thanks,
Richard

> +  /* Apply CSE again to nodes after permute optimization.  */
> +  scalar_stmts_to_slp_tree_map_t *bst_map
> +    = new scalar_stmts_to_slp_tree_map_t ();
> +
> +  for (auto inst : vinfo->slp_instances)
> +    vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
> +
> +  release_scalar_stmts_to_slp_tree_map (bst_map);
>  }
>  
>  /* Gather loads reachable from the individual SLP graph entries.  */

Reply via email to