On Mon, 16 Nov 2020, Richard Biener wrote:

> On Sat, 14 Nov 2020, Tamar Christina wrote:
> 
> > Hi All,
> > 
> > This patch adds the pre-requisites and general scaffolding for supporting 
> > doing
> > SLP pattern matching.
> > 
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > 
> > Ok for master?
> 
> Comments below.
> 
> > Thanks,
> > Tamar
> > 
> > gcc/ChangeLog:
> > 
> >     * tree-vect-loop.c (vect_dissolve_slp_only_patterns): New.
> >     (vect_dissolve_slp_only_groups): Call here.
> >     * tree-vect-slp.c (vect_free_slp_tree, vect_create_new_slp_node): Export
> >     from file.
> >     (vect_build_slp_tree_2): Set vectype for externals.
> >     (vect_print_slp_tree): Print SLP only patterns.
> >     (optimize_load_redistribution_1, optimize_load_redistribution,
> >     vect_match_slp_patterns_2, vect_match_slp_patterns): New.
> >     (vect_analyze_slp): Call matcher.
> >     * tree-vectorizer.c (vec_info::add_pattern_stmt): Save relevancy.
> >     * tree-vectorizer.h (STMT_VINFO_SAVED_RELEVANT, vect_pop_relevancy,
> >     vect_dissolve_pattern_relevancy, vect_save_relevancy,
> >     vect_push_relevancy, vect_free_slp_tree, enum _complex_operation,
> >     class vect_pattern): New.
> > 
> > --- inline copy of patch --
> > 
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index 
> > 39b7319e8253c351a4f6fbdd8c154330f08f2b1b..791d9c6cb0649862a84fd3c80efc89fefedbb085
> >  100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -1979,6 +1979,61 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block 
> > *bbs,
> >    return opt_result::success ();
> >  }
> >  
> > +/* For every SLP only pattern created by the pattern matched rooted in ROOT
> > +   restore the relevancy of the original statements over those of the 
> > pattern
> > +   and destroy the pattern relationship.  This restores the SLP tree to a 
> > state
> > +   where it can be used when SLP build is cancelled or re-tried.  */
> > +
> > +static void
> > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo,
> > +                            hash_set<slp_tree> *visited, slp_tree root)
> > +{
> > +  if (!root || visited->add (root))
> > +    return;
> > +
> > +  unsigned int i;
> > +  slp_tree node;
> > +  stmt_vec_info related_stmt_info;
> > +  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root);
> > +
> > +  if (stmt_info && STMT_VINFO_SLP_VECT_ONLY (stmt_info))
> > +    {
> > +      vect_pop_relevancy (stmt_info);
> > +      if ((related_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info)) != 
> > NULL)
> > +   {
> > +     if (dump_enabled_p ())
> > +       dump_printf_loc (MSG_NOTE, vect_location,
> > +                        "dissolving relevancy of %G",
> > +                        STMT_VINFO_STMT (stmt_info));
> > +     vect_dissolve_pattern_relevancy (related_stmt_info);
> > +   }
> > +    }
> > +
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > +    vect_dissolve_slp_only_patterns (loop_vinfo, visited, node);
> > +}
> > +
> > +/* Lookup any SLP Only Pattern statements created by the SLP pattern 
> > matcher in
> > +   all slp_instances in LOOP_VINFO and undo the relevancy of statements 
> > such
> > +   that the original SLP tree before the pattern matching is used.  */
> > +
> > +static void
> > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo)
> > +{
> > +
> > +  unsigned int i;
> > +  hash_set<slp_tree> visited;
> > +
> > +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_patterns");
> > +
> > +  /* Unmark any SLP only patterns as relevant and restore the STMT_INFO of 
> > the
> > +     related instruction.  */
> > +  slp_instance instance;
> > +  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
> > +    vect_dissolve_slp_only_patterns (loop_vinfo, &visited,
> > +                                SLP_INSTANCE_TREE (instance));
> > +}
> > +
> >  /* Look for SLP-only access groups and turn each individual access into 
> > its own
> >     group.  */
> >  static void
> > @@ -2583,6 +2638,9 @@ again:
> >    /* Ensure that "ok" is false (with an opt_problem if dumping is 
> > enabled).  */
> >    gcc_assert (!ok);
> >  
> > +  /* Dissolve any SLP patterns created by the SLP pattern matcher.  */
> > +  vect_dissolve_slp_only_patterns (loop_vinfo);
> > +
> >    /* Try again with SLP forced off but if we didn't do any SLP there is
> >       no point in re-trying.  */
> >    if (!slp)
> > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> > index 
> > 0c065e835ad13ad32d222e2590e05ef56849c411..3be565a2e566e09a9e42d6c77ba402b9499b06b6
> >  100644
> > --- a/gcc/tree-vect-slp.c
> > +++ b/gcc/tree-vect-slp.c
> > @@ -105,7 +105,7 @@ _slp_tree::~_slp_tree ()
> >  
> >  /* Recursively free the memory allocated for the SLP tree rooted at NODE.  
> > */
> >  
> > -static void
> > +void
> >  vect_free_slp_tree (slp_tree node)
> >  {
> >    int i;
> > @@ -148,7 +148,7 @@ vect_free_slp_instance (slp_instance instance)
> >  
> >  /* Create an SLP node for SCALAR_STMTS.  */
> >  
> > -slp_tree
> > +static slp_tree
> >  vect_create_new_slp_node (slp_tree node,
> >                       vec<stmt_vec_info> scalar_stmts, unsigned nops)
> >  {
> > @@ -165,7 +165,7 @@ vect_create_new_slp_node (slp_tree node,
> >  
> >  /* Create an SLP node for SCALAR_STMTS.  */
> >  
> > -static slp_tree
> > +slp_tree
> >  vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
> >  {
> >    return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
> > @@ -1646,6 +1646,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
> >     {
> >       slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
> >       SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
> > +     SLP_TREE_VECTYPE (invnode) = vectype;
> 
> This is wrong - the vector type of invariants is determined by
> the consuming SLP stmts in vectorizable_*
> 
> >       oprnd_info->ops = vNULL;
> >       children.safe_push (invnode);
> >       continue;
> > @@ -1929,6 +1930,13 @@ vect_print_slp_tree (dump_flags_t dump_kind, 
> > dump_location_t loc,
> >     dump_printf (dump_kind, " %u", j);
> >        dump_printf (dump_kind, " }\n");
> >      }
> > +  if (SLP_TREE_REPRESENTATIVE (node)
> > +      && STMT_VINFO_SLP_VECT_ONLY (SLP_TREE_REPRESENTATIVE (node)))
> > +    {
> > +      dump_printf_loc (metadata, user_loc, "\tSLP Only pattern:\n");
> > +      dump_printf_loc (dump_kind, user_loc, "\t %G",
> > +                  STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node)));
> > +    }
> >    if (SLP_TREE_LANE_PERMUTATION (node).exists ())
> >      {
> >        dump_printf_loc (metadata, user_loc, "\tlane permutation {");
> > @@ -2174,6 +2182,219 @@ calculate_unrolling_factor (poly_uint64 nunits, 
> > unsigned int group_size)
> >    return exact_div (common_multiple (nunits, group_size), group_size);
> >  }
> >  
> > +/* Helper function of optimize_load_redistribution that performs the 
> > operation
> > +   recursively.  */
> > +
> > +bool optimize_load_redistribution_1 (hash_set<slp_tree> *loads,
> > +                                scalar_stmts_to_slp_tree_map_t *bst_map,
> > +                                hash_set<slp_tree> *visited,
> > +                                slp_tree parent, unsigned idx,
> > +                                slp_tree root)
> > +{
> > +  if (visited->contains (root))
> > +    return true;
> > +  visited->add (root);
> 
>     if (visited->add (root))
>       return true;
> 
> > +
> > +  slp_tree node;
> > +  unsigned i;
> > +  stmt_vec_info dr_stmt = NULL;
> > +
> > +  /* For now, we don't know anything about externals so do not do 
> > anything.  */
> > +  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> > +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
> > +    return false;
> > +
> > +  if (gimple_assign_load_p (STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE 
> > (root))))
> 
> The appropriate check is STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE 
> (root)) && DR_IS_READ (STMT_VINFO_DATA_REF (...))
> 
> let's not mix gimple & SLP checks when not necessary.
> 
> > +    loads->add (root);
> > +
> > +  if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> 
> else if
> 
> > +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> > +      && !SLP_TREE_SCALAR_STMTS (root).exists ())
> 
> why do !SLP_TREE_SCALAR_STMTS matter?
> 
> > +    {
> > +
> 
> extra vertical space
> 
> > +      /* First convert this node into a load node and add it to the leaves
> > +         list and flatten the permute from a lane to a load one.  If it's
> > +         unneeded it will be elided later.  */
> > +      auto_vec<stmt_vec_info> stmts;
> > +      stmts.create (SLP_TREE_LANES (root));
> > +      load_permutation_t load_perm;
> > +      load_perm.create (SLP_TREE_LANES (root));
> > +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
> > +      for (unsigned j = 0; j < lane_perm.length (); j++)
> > +        {
> > +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> > +     /* This isn't strictly needed, but this function is a temporary
> > +        one for specifically pattern matching, so don't want it to
> > +        optimize things the remainder of the pipeline will.  */
> > +     if (perm.first != j)
> > +       goto next;
> > +          node = SLP_TREE_CHILDREN (root)[perm.first];
> > +     if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > +      {
> > +         load_perm.release ();
> > +         return false;
> 
> you're possibly prematurely exiting the DFS walk on a two-operator
> permute.  Ah, guess that's what the above check was for?  I guess
> it's better to pre-check all children of the VEC_PERM node
> to be proper, two(?) leafs with load permutation.
> 
> > +      }
> > +
> > +     stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node);
> > +     if (!STMT_VINFO_GROUPED_ACCESS (rep_stmt))
> > +       goto next;
> 
> I think for a node with a load permutation this never triggers.
> 
> > +
> > +     if (!dr_stmt)
> > +       dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt);
> > +
> > +     if (dr_stmt != DR_GROUP_FIRST_ELEMENT (rep_stmt))
> > +       goto next;
> 
> So all of the above could be done on the children w/o allocating
> and the rest on the loop iterating over the lanes.
> 
> > +     stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
> > +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION 
> > (node)[perm.second]);
> > +        }
> > +
> > +      if (dump_enabled_p ())
> > +   dump_printf_loc (MSG_NOTE, vect_location,
> > +                    "converting loads on permute node %p\n", root);
> > +
> > +      slp_tree *value = bst_map->get (stmts);
> > +      if (value)
> > +   node = *value;
> > +      else
> > +   {
> > +     /* One last iteration to free the nodes.  */
> > +     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > +       {
> > +         /* If we are the only reference to the node, remove the vertex.
> > +            We don't have to modify the graph since vertices lead the
> > +            graph traversal.  */
> > +         vect_free_slp_tree (node);
> > +       }
> 
> You should do this unconditionally (also if the bst_map lookup
> succeeded), but after bumping the refcount for the use in this
> node.
> 
> > +
> > +     vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> > +     node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> > +     bst_map->put (stmts_cpy, node);
> > +   }
> > +      SLP_TREE_CHILDREN (parent)[idx] = node;
> 
> hmm, what is 'parent' - shouldn't this be 'root'?  Ah, I see what
> you are doing.  I think you should do what optimize_slp does,
> namely replace 'root' in-place so you do not have to adjust
> parents (or even care for the case of multiple parents refering to
> 'root').  I see that doesn't easily work when attempting to CSE so
> the actual CSE needs to happen below (*)
> 
> > +      SLP_TREE_REF_COUNT (node)++;
> > +      SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> > +      SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> > +      loads->add (node);
> 
> Note with the load_lane check delayed we no longer need
> to vect_gather_slp_loads so early so I suggest to simply
> remove the existing early call and do it after
> pattern matching instead.

I'm testing the patch below for this.

Richard.

>From 51b89070fcf8eacb188439e6c1b777fd9db4b2ae Mon Sep 17 00:00:00 2001
From: Richard Biener <rguent...@suse.de>
Date: Mon, 16 Nov 2020 14:26:20 +0100
Subject: [PATCH] Delay SLP instance loads gathering
To: gcc-patches@gcc.gnu.org

This delays filling SLP_INSTANCE_LOADS.

2020-11-16  Richard Biener  <rguent...@suse.de>

        * tree-vectorizer.h (vect_gather_slp_loads): Declare.
        * tree-vect-loop.c (vect_analyze_loop_2): Call
        vect_gather_slp_loads.
        * tree-vect-slp.c (vect_build_slp_instance): Do not gather
        SLP loads here.
        (vect_gather_slp_loads): Remove wrapper, new function.
        (vect_slp_analyze_bb_1): Call it.
---
 gcc/tree-vect-loop.c  |  3 +++
 gcc/tree-vect-slp.c   | 26 ++++++++++++++++++--------
 gcc/tree-vectorizer.h |  1 +
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 4d5532f71d0..ecaaf0116d3 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2298,6 +2298,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
&fatal, unsigned *n_stmts)
 
       /* Optimize the SLP graph with the vectorization factor fixed.  */
       vect_optimize_slp (loop_vinfo);
+
+      /* Gather the loads reachable from the SLP graph entries.  */
+      vect_gather_slp_loads (loop_vinfo);
     }
 
   bool saved_can_use_partial_vectors_p
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index b98d5db9f76..d2f2407ac92 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2071,13 +2071,6 @@ vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree 
node,
     }
 }
 
-static void
-vect_gather_slp_loads (slp_instance inst, slp_tree node)
-{
-  hash_set<slp_tree> visited;
-  vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
-}
-
 
 /* Find the last store in SLP INSTANCE.  */
 
@@ -2252,7 +2245,6 @@ vect_build_slp_instance (vec_info *vinfo,
          new_instance->cost_vec = vNULL;
          new_instance->subgraph_entries = vNULL;
 
-         vect_gather_slp_loads (new_instance, node);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "SLP size %u vs. limit %u.\n",
@@ -3071,6 +3063,21 @@ vect_optimize_slp (vec_info *vinfo)
     }
 }
 
+/* Gather loads reachable from the individual SLP graph entries.  */
+
+void
+vect_gather_slp_loads (vec_info *vinfo)
+{
+  unsigned i;
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+    {
+      hash_set<slp_tree> visited;
+      vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
+                            SLP_INSTANCE_TREE (instance), visited);
+    }
+}
+
 
 /* For each possible SLP instance decide whether to SLP it and calculate 
overall
    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
@@ -4152,6 +4159,9 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, 
bool &fatal,
   /* Optimize permutations.  */
   vect_optimize_slp (bb_vinfo);
 
+  /* Gather the loads reachable from the SLP graph entries.  */
+  vect_gather_slp_loads (bb_vinfo);
+
   vect_record_base_alignments (bb_vinfo);
 
   /* Analyze and verify the alignment of data references and the
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 3ccd0fd552d..0ee4ef32eb2 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1974,6 +1974,7 @@ extern opt_result vect_analyze_slp (vec_info *, unsigned);
 extern bool vect_make_slp_decision (loop_vec_info);
 extern void vect_detect_hybrid_slp (loop_vec_info);
 extern void vect_optimize_slp (vec_info *);
+extern void vect_gather_slp_loads (vec_info *);
 extern void vect_get_slp_defs (slp_tree, vec<tree> *);
 extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *,
                               unsigned n = -1U);
-- 
2.26.2

Reply via email to