Hi All,

This optimizes sequential permutes. i.e. if there are two permutes back to back
this function applies the permute of the parent to the child and removed the
parent.

If the resulting permute in the child is now a no-op.  Then the child is also
dropped from the graph and the parent's parent attached to the child's child.

This relies on the materialization point calculation in optimize SLP.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Tests are included as part of the final patch as they need the SLP pattern
matcher to insert permutes in between.

This allows us to remove useless permutes such as

        ldr     q0, [x0, x3]
        ldr     q2, [x1, x3]
        trn1    v1.4s, v0.4s, v0.4s
        trn2    v0.4s, v0.4s, v0.4s
        trn1    v0.4s, v1.4s, v0.4s
        mov     v1.16b, v3.16b
        fcmla   v1.4s, v0.4s, v2.4s, #0
        fcmla   v1.4s, v0.4s, v2.4s, #90
        str     q1, [x2, x3]

from the sequence the vectorizer puts out and give

        ldr     q0, [x0, x3]
        ldr     q2, [x1, x3]
        mov     v1.16b, v3.16b
        fcmla   v1.4s, v0.4s, v2.4s, #0
        fcmla   v1.4s, v0.4s, v2.4s, #90
        str     q1, [x2, x3]

instead

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-slp.c (vect_slp_tree_permute_noop_p): New.
        (vect_optimize_slp): Optimize permutes.
        (vectorizable_slp_permutation): Fix typo.

-- 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index c03fc2fbecad1a2219504ac9daae75495e691775..48f615e1952707de4827f0e69e443c0a7db27d81 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -290,6 +290,27 @@ vect_slp_tree_uniform_p (slp_tree node)
   return true;
 }
 
+/* Return true when the node is a permute node and the permutation the node
+   contains is a no-op.  */
+
+static bool
+vect_slp_tree_permute_noop_p (slp_tree node)
+{
+  gcc_assert (SLP_TREE_CODE (node) == VEC_PERM_EXPR);
+
+  if (!SLP_TREE_LANE_PERMUTATION (node).exists ())
+    return true;
+
+  unsigned x, seed;
+  lane_permutation_t perms = SLP_TREE_LANE_PERMUTATION (node);
+  seed = perms[0].second;
+  for (x = 1; x < perms.length (); x++)
+    if (perms[x].first != perms[0].first || perms[x].second != ++seed)
+      return false;
+
+  return true;
+}
+
 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
    of the chain.  */
@@ -3150,6 +3171,41 @@ vect_optimize_slp (vec_info *vinfo)
 	    /* For loads simply drop the permutation, the load permutation
 	       already performs the desired permutation.  */
 	    ;
+	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+	    {
+	      /* If the node if already a permute node we just need to apply
+		 the permutation to the permute node itself.  */
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "simplifying permute node %p\n",
+				 node);
+
+	      vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
+				true);
+
+	      /* If the remaining permute is a no-op then we can just drop the
+		 node instead of materializing it.  */
+	      if (vect_slp_tree_permute_noop_p (node))
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "removing unneeded permute node %p\n",
+				     node);
+
+		   unsigned idx = SLP_TREE_LANE_PERMUTATION (node)[0].first;
+		   slp_tree value = SLP_TREE_CHILDREN (node)[idx];
+		   unsigned src = slpg->vertices[node->vertex].pred->src;
+		   slp_tree prev = vertices[src];
+		   unsigned dest;
+		   slp_tree tmp;
+		   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (prev), dest, tmp)
+		     if (tmp == node)
+		       {
+			  SLP_TREE_CHILDREN (prev)[dest] = value;
+			  break;
+		        }
+		}
+	    }
 	  else
 	    {
 	      if (dump_enabled_p ())
@@ -5361,7 +5417,7 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			     "permutation requires at "
-			     "least three vectors");
+			     "least three vectors\n");
 	  gcc_assert (!gsi);
 	  return false;
 	}

Reply via email to