From tamar.christ...@arm.com Mon Dec 28 14:36:32 2020
Date: Mon, 28 Dec 2020 13:35:56 +0000
From: Tamar Christina <tamar.christ...@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: n...@arm.com, rguent...@suse.de, o...@ucw.cz
Subject: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution

Hi All,

This introduces a post processing step for the pattern matcher to flatten
permutes introduced by the complex multiplications patterns.
This performs a blend early such that SLP is not cancelled by the LOAD_LANES
permute.  This is a temporary workaround to the fact that loads are not CSEd
during building and is required to produce efficient code.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-slp.c (optimize_load_redistribution_1): New.
        (optimize_load_redistribution): New.
        (vect_match_slp_patterns): Use it.

--- inline copy of patch -- diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 
2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c
 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, 
unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }

+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+                               hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
+      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)

use a single != vect_internal_def test please

+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
+      && SLP_TREE_LANE_PERMUTATION (root).exists ()
+      && !SLP_TREE_SCALAR_STMTS (root).exists ())

I think both last tests are unnecessary

+    {
+      /* First convert this node into a load node and add it to the leaves
+         list and flatten the permute from a lane to a load one.  If it's
+         unneeded it will be elided later.  */
+      auto_vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      load_permutation_t load_perm;
+      load_perm.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);

load_perm leaks when any of the below outs is taken

+      for (unsigned j = 0; j < lane_perm.length (); j++)
+        {
+          std::pair<unsigned, unsigned> perm = lane_perm[j];
+         /* This isn't strictly needed, but this function is a temporary
+            one for specifically pattern matching, so don't want it to
+            optimize things the remainder of the pipeline will.  */
+         if (perm.first != j)
+           goto next;

but please elide it nevertheless

+          node = SLP_TREE_CHILDREN (root)[perm.first];
+
+         if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+           return NULL;

so you want to check whether this is a load, I think more to the point
would be a vect_internal_def + zero SLP children check.  And a comment
on what we test (we do lack classification of SLP nodes, so a helper
like vect_is_slp_load_node or so would be OK as well)

+
+         stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);

As you're doing here lacks a check that we are actually loading from
the same DR group.  I think it might be easier to just collect scalar
stmts and throw them at vect_build_slp_tree?  That should perform
the necessary verification, build the appropriate lane permute and
perform the CSE.  Which leads to the question why the VEC_PERM node
doesn't have scalar stmts set while we are actually be able to compute
them here ... that is, the CSE opportunity could have been noticed
during pattern matching itself?

+        }
+
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "converting stmts on permute node %p\n", root);
+
+      slp_tree *value = bst_map->get (stmts);
+      if (value)
+       node = *value;
+      else
+       {
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+           SLP_TREE_REF_COUNT (node)++;
+
+         vec<stmt_vec_info> stmts_cpy = stmts.copy ();
+         node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
+         SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
+         SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
+         bst_map->put (stmts_cpy, node);
+       }
+      SLP_TREE_REF_COUNT (node)++;

Adjusting the refcount here but doing the replacement in the caller
is a bit awkward to follow - how about passing a reference so you
can adjust the edge here?

+
+      return node;
+    }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
+      if (value)
+       {
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+       }
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+                             slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, 
node);
+      if (value)
+       {
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+       }
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.

    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2385,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
                         hash_set<slp_tree> *visited,
                         slp_tree_to_load_perm_map_t *perm_cache,
-                        scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+                        scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info 
*vinfo,

   if (found_p)
     {
+      optimize_load_redistribution (bst_map, *ref_node);
+
       if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,


--


    [ Part 2, Text/X-DIFF 140 lines. ]
    [ Unable to print this part. ]

Reply via email to