https://gcc.gnu.org/g:1c4d39ada33d3655db088a0e5c90a296da794a55

commit r15-5563-g1c4d39ada33d3655db088a0e5c90a296da794a55
Author: Christoph Müllner <christoph.muell...@vrull.eu>
Date:   Wed Nov 13 00:44:43 2024 +0100

    forwprop: Try to blend two isomorphic VEC_PERM sequences
    
    This extends forwprop by yet another VEC_PERM optimization:
    it attempts to blend two isomorphic vector sequences by exploiting
    redundancy in their lane utilization.
    This redundancy comes from the way certain scalar statements end up
    being vectorized: two VEC_PERMs on top, binary operations on both of
    them, and a final VEC_PERM to create the result.
    Here is an example of this sequence:
    
      v_in = {e0, e1, e2, e3}
      v_1 = VEC_PERM <v_in, v_in, {0, 2, 0, 2}>
      // v_1 = {e0, e2, e0, e2}
      v_2 = VEC_PERM <v_in, v_in, {1, 3, 1, 3}>
      // v_2 = {e1, e3, e1, e3}
    
      v_x = v_1 + v_2
      // v_x = {e0+e1, e2+e3, e0+e1, e2+e3}
      v_y = v_1 - v_2
      // v_y = {e0-e1, e2-e3, e0-e1, e2-e3}
    
      v_out = VEC_PERM <v_x, v_y, {0, 1, 6, 7}>
      // v_out = {e0+e1, e2+e3, e0-e1, e2-e3}
    
    To remove the redundancy, lanes 2 and 3 can be freed, which allows us
    to change the last statement into:
      v_out' = VEC_PERM <v_x, v_y, {0, 1, 4, 5}>
      // v_out' = {e0+e1, e2+e3, e0-e1, e2-e3}
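
    As a self-contained illustration (not part of this patch; it uses GCC's
    generic vector extensions, as the new tests below do, and the function
    name is made up), the narrowing is value-preserving precisely because
    v_y duplicates its lanes, so selecting lanes {6, 7} and lanes {4, 5}
    yields the same elements:

      typedef int v4si __attribute__ ((vector_size (4 * sizeof (int))));

      /* If v_y = {c0, c1, c0, c1}, both shuffles produce
         {v_x[0], v_x[1], c0, c1}, so the difference is the zero vector.  */
      v4si
      narrowing_is_value_preserving (v4si v_x, v4si v_y)
      {
        v4si sel_old = { 0, 1, 6, 7 };
        v4si sel_new = { 0, 1, 4, 5 };
        return __builtin_shuffle (v_x, v_y, sel_old)
               - __builtin_shuffle (v_x, v_y, sel_new);
      }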
    
    The cost of eliminating this redundancy in the lane utilization is
    that lowering the VEC_PERM expression could become more expensive
    because of the tighter packing of the lanes.  Therefore, this
    optimization is not performed on its own, but only if we identify
    two such sequences that can be blended.
    
    Once all candidate sequences have been identified, we try to blend them,
    so that we can use the freed lanes for the second sequence.
    On success we convert 2x (2x VEC_PERM + 2x BINOP + 1x VEC_PERM),
    i.e. 10 statements, into 4x VEC_PERM + 2x BINOP, i.e. 6 statements.
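
    For illustration, here is a sketch (with hypothetical inputs v_in_a
    and v_in_b) of what the blended result of two such sequences could
    look like:

      v_1 = VEC_PERM <v_in_a, v_in_b, {0, 2, 4, 6}>
      // v_1 = {a0, a2, b0, b2}
      v_2 = VEC_PERM <v_in_a, v_in_b, {1, 3, 5, 7}>
      // v_2 = {a1, a3, b1, b3}

      v_x = v_1 + v_2
      v_y = v_1 - v_2

      v_out_1 = VEC_PERM <v_x, v_y, {0, 1, 4, 5}>
      // v_out_1 = {a0+a1, a2+a3, a0-a1, a2-a3}
      v_out_2 = VEC_PERM <v_x, v_y, {2, 3, 6, 7}>
      // v_out_2 = {b0+b1, b2+b3, b0-b1, b2-b3}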
    
    The implemented transformation reuses (rewrites) the statements
    of the first sequence and the last VEC_PERM of the second sequence.
    The remaining four statements of the second sequence are left untouched
    and will be eliminated by DCE later.
    
    This targets x264_pixel_satd_8x4, which calculates the sum of absolute
    transformed differences (SATD) using a Hadamard transformation.
    We have seen an 8% speedup on SPEC's x264 on a 5950X (x86-64) and a 7%
    speedup on an AArch64 machine.
    
    Bootstrapped and reg-tested on x86-64 and AArch64 (all languages).
    
    gcc/ChangeLog:
    
            * tree-ssa-forwprop.cc (struct _vec_perm_simplify_seq): New data
            structure to store analysis results of a vec perm simplify sequence.
            (get_vect_selector_index_map): Helper to get an index map from the
            provided vector permute selector.
            (recognise_vec_perm_simplify_seq): Helper to recognise a
            vec perm simplify sequence.
            (narrow_vec_perm_simplify_seq): Helper to pack the lanes more
            tightly.
            (can_blend_vec_perm_simplify_seqs_p): Test if two vec perm
            sequences can be blended.
            (calc_perm_vec_perm_simplify_seqs): Helper to calculate the new
            permutation indices.
            (blend_vec_perm_simplify_seqs): Helper to blend two vec perm
            simplify sequences.
            (process_vec_perm_simplify_seq_list): Helper to process a list
            of vec perm simplify sequences.
            (append_vec_perm_simplify_seq_list): Helper to add a vec perm
            simplify sequence to the list.
            (pass_forwprop::execute): Integrate new functionality.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/tree-ssa/satd-hadamard.c: New test.
            * gcc.dg/tree-ssa/vector-10.c: New test.
            * gcc.dg/tree-ssa/vector-8.c: New test.
            * gcc.dg/tree-ssa/vector-9.c: New test.
            * gcc.target/aarch64/sve/satd-hadamard.c: New test.
    
    Signed-off-by: Christoph Müllner <christoph.muell...@vrull.eu>

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c      |  43 ++
 gcc/testsuite/gcc.dg/tree-ssa/vector-10.c          | 122 +++++
 gcc/testsuite/gcc.dg/tree-ssa/vector-8.c           |  34 ++
 gcc/testsuite/gcc.dg/tree-ssa/vector-9.c           |  34 ++
 .../gcc.target/aarch64/sve/satd-hadamard.c         |   3 +
 gcc/tree-ssa-forwprop.cc                           | 584 ++++++++++++++++++++-
 6 files changed, 819 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c b/gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c
new file mode 100644
index 000000000000..576ef01628cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop4-details" } */
+
+#include <stdint.h>
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d1 = t1 + t3;\
+    d2 = t0 - t2;\
+    d3 = t1 - t3;\
+}
+
+int
+x264_pixel_satd_8x4_simplified (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+  uint32_t tmp[4][4];
+  uint32_t a0, a1, a2, a3;
+  int sum = 0;
+  int i;
+
+  for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+      sum += a0 + a1 + a2 + a3;
+    }
+
+  return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
+}
+
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 3, 6, 7 }" "forwprop4" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-10.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-10.c
new file mode 100644
index 000000000000..d5caebdf1742
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-10.c
@@ -0,0 +1,122 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f1 (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Won't blend because v_in_2 is defined after v_1 above.  */
+  v_in_2 = *p_v_in_2;
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+void f2 (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+  /* Won't blend because of this store between the sequences.  */
+  *p_v_out_1 = v_out_1;
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_2 = v_out_2;
+}
+
+void f3 (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  /* Won't blend because v_2 is RHS1 here.  */
+  v_y = v_2 - v_1;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+extern vec foo (vec v);
+void f4 (vec *p_v_in_1, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Won't merge because of dependency.  */
+  v_in_2 = foo (v_out_1);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump-not "Vec perm simplify sequences have been blended" "forwprop1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-8.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-8.c
new file mode 100644
index 000000000000..bc2269065e4f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-8.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" "forwprop1" } } */
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 3, 6, 7 }" "forwprop1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-9.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-9.c
new file mode 100644
index 000000000000..e5f898e02814
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-9.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 * v_2;
+  v_y = v_2 - v_1;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_2 * v_1;
+  v_y = v_2 - v_1;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" "forwprop1" } } */
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 3, 6, 7 }" "forwprop1" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/satd-hadamard.c b/gcc/testsuite/gcc.target/aarch64/sve/satd-hadamard.c
new file mode 100644
index 000000000000..fcd140e3584c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/satd-hadamard.c
@@ -0,0 +1,3 @@
+#include "../../../gcc.dg/tree-ssa/satd-hadamard.c"
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 5c690a2b03e2..8088cc190d11 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -50,6 +50,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "optabs-tree.h"
 #include "insn-config.h"
 #include "recog.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
 #include "internal-fn.h"
@@ -184,6 +186,25 @@ along with GCC; see the file COPYING3.  If not see
 
    This will (of course) be extended as other needs arise.  */
 
+/* Data structure that contains simplifiable vectorized permute sequences.
+   See recognise_vec_perm_simplify_seq () for a description of the sequence.  */
+
+struct _vec_perm_simplify_seq
+{
+  /* Defining stmts of vectors in the sequence.  */
+  gassign *v_1_stmt;
+  gassign *v_2_stmt;
+  gassign *v_x_stmt;
+  gassign *v_y_stmt;
+  /* Final permute statement.  */
+  gassign *stmt;
+  /* New selector indices for stmt.  */
+  tree new_sel;
+  /* Elements of each vector and selector.  */
+  unsigned int nelts;
+};
+typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
+
 static bool forward_propagate_addr_expr (tree, tree, bool);
 
 /* Set to true if we delete dead edges during the optimization.  */
@@ -225,7 +246,6 @@ fwprop_invalidate_lattice (tree name)
     lattice[SSA_NAME_VERSION (name)] = NULL_TREE;
 }
 
-
 /* Get the statement we can propagate from into NAME skipping
    trivial copies.  Returns the statement which defines the
    propagation source or NULL_TREE if there is no such one.
@@ -3460,6 +3480,546 @@ fwprop_ssa_val (tree name)
   return name;
 }
 
+/* Get an index map from the provided vector permute selector
+   and return the number of unique indices.
+   E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2
+        { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2
+        { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4.  */
+
+static unsigned int
+get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map)
+{
+  gcc_assert (VECTOR_CST_NELTS (sel).is_constant ());
+  unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
+  unsigned int n = 0;
+
+  for (unsigned int i = 0; i < nelts; i++)
+    {
+      /* Extract the i-th value from the selector.  */
+      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
+      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
+
+      unsigned int j = 0;
+      for (; j <= i; j++)
+       {
+         tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j);
+         unsigned int prev_sel_cst
+           = TREE_INT_CST_LOW (prev_sel_cst_tree);
+         if (prev_sel_cst == sel_cst)
+           break;
+       }
+      index_map->quick_push (j);
+      n += (i == j) ? 1 : 0;
+    }
+
+  return n;
+}
+
+/* Search for opportunities to free half of the lanes in the following pattern:
+
+     v_in = {e0, e1, e2, e3}
+     v_1 = VEC_PERM <v_in, v_in, {0, 2, 0, 2}>
+     // v_1 = {e0, e2, e0, e2}
+     v_2 = VEC_PERM <v_in, v_in, {1, 3, 1, 3}>
+     // v_2 = {e1, e3, e1, e3}
+
+     v_x = v_1 + v_2
+     // v_x = {e0+e1, e2+e3, e0+e1, e2+e3}
+     v_y = v_1 - v_2
+     // v_y = {e0-e1, e2-e3, e0-e1, e2-e3}
+
+     v_out = VEC_PERM <v_x, v_y, {0, 1, 6, 7}>
+     // v_out = {e0+e1, e2+e3, e0-e1, e2-e3}
+
+   The last statement could be simplified to:
+     v_out' = VEC_PERM <v_x, v_y, {0, 1, 4, 5}>
+     // v_out' = {e0+e1, e2+e3, e0-e1, e2-e3}
+
+   Characteristic properties:
+   - v_1 and v_2 are created from the same input vector v_in and introduce the
+     lane duplication (in the selection operand) that we can eliminate.
+   - v_x and v_y are results from lane-preserving operations that use v_1 and
+     v_2 as inputs.
+   - v_out is created by selecting from duplicated lanes.  */
+
+static bool
+recognise_vec_perm_simplify_seq (gassign *stmt, vec_perm_simplify_seq *seq)
+{
+  gcc_checking_assert (stmt);
+  gcc_checking_assert (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR);
+  basic_block bb = gimple_bb (stmt);
+
+  /* Decompose the final vec permute statement.  */
+  tree v_x = gimple_assign_rhs1 (stmt);
+  tree v_y = gimple_assign_rhs2 (stmt);
+  tree sel = gimple_assign_rhs3 (stmt);
+
+  if (!VECTOR_CST_NELTS (sel).is_constant ()
+      || TREE_CODE (v_x) != SSA_NAME
+      || TREE_CODE (v_y) != SSA_NAME
+      || !has_single_use (v_x)
+      || !has_single_use (v_y))
+    return false;
+
+  unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
+
+  /* Lookup the definition of v_x and v_y.  */
+  gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x));
+  gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y));
+  if (!v_x_stmt || gimple_bb (v_x_stmt) != bb
+      || !v_y_stmt || gimple_bb (v_y_stmt) != bb)
+    return false;
+
+  /* Check the operations that define v_x and v_y.  */
+  if (TREE_CODE_CLASS (gimple_assign_rhs_code (v_x_stmt)) != tcc_binary
+      || TREE_CODE_CLASS (gimple_assign_rhs_code (v_y_stmt)) != tcc_binary)
+    return false;
+
+  tree v_x_1 = gimple_assign_rhs1 (v_x_stmt);
+  tree v_x_2 = gimple_assign_rhs2 (v_x_stmt);
+  tree v_y_1 = gimple_assign_rhs1 (v_y_stmt);
+  tree v_y_2 = gimple_assign_rhs2 (v_y_stmt);
+
+  if (v_x_stmt == v_y_stmt
+      || TREE_CODE (v_x_1) != SSA_NAME
+      || TREE_CODE (v_x_2) != SSA_NAME
+      || num_imm_uses (v_x_1) != 2
+      || num_imm_uses (v_x_2) != 2)
+    return false;
+
+  if (v_x_1 != v_y_1 || v_x_2 != v_y_2)
+    {
+      /* Allow operands of commutative operators to swap.  */
+      if (commutative_tree_code (gimple_assign_rhs_code (v_x_stmt)))
+       {
+         /* Keep v_x_1 the first operand for non-commutative operators.  */
+         v_x_1 = gimple_assign_rhs2 (v_x_stmt);
+         v_x_2 = gimple_assign_rhs1 (v_x_stmt);
+         if (v_x_1 != v_y_1 || v_x_2 != v_y_2)
+           return false;
+       }
+      else if (commutative_tree_code (gimple_assign_rhs_code (v_y_stmt)))
+       {
+         if (v_x_1 != v_y_2 || v_x_2 != v_y_1)
+           return false;
+       }
+      else
+       return false;
+    }
+  gassign *v_1_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x_1));
+  gassign *v_2_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x_2));
+  if (!v_1_stmt || gimple_bb (v_1_stmt) != bb
+      || !v_2_stmt || gimple_bb (v_2_stmt) != bb)
+    return false;
+
+  if (gimple_assign_rhs_code (v_1_stmt) != VEC_PERM_EXPR
+      || gimple_assign_rhs_code (v_2_stmt) != VEC_PERM_EXPR)
+    return false;
+
+  /* Decompose initial VEC_PERM_EXPRs.  */
+  tree v_in = gimple_assign_rhs1 (v_1_stmt);
+  tree v_1_sel = gimple_assign_rhs3 (v_1_stmt);
+  tree v_2_sel = gimple_assign_rhs3 (v_2_stmt);
+  if (v_in != gimple_assign_rhs2 (v_1_stmt)
+      || v_in != gimple_assign_rhs1 (v_2_stmt)
+      || v_in != gimple_assign_rhs2 (v_2_stmt))
+    return false;
+
+  if (!VECTOR_CST_NELTS (v_1_sel).is_constant ()
+      || !VECTOR_CST_NELTS (v_2_sel).is_constant ())
+    return false;
+
+  if (nelts != VECTOR_CST_NELTS (v_1_sel).to_constant ()
+      || nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ())
+    return false;
+
+  /* Now check permutation selection operands.  */
+  auto_vec<unsigned int> v_1_lane_map, v_2_lane_map;
+  v_1_lane_map.reserve (nelts);
+  v_2_lane_map.reserve (nelts);
+  unsigned int v_1_lanes, v_2_lanes;
+  v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map);
+  v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map);
+
+  /* Check if we could free up half of the lanes.  */
+  if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2))
+    return false;
+
+  /* Create the new selector.  */
+  vec_perm_builder new_sel_perm (nelts, nelts, 1);
+  for (unsigned int i = 0; i < nelts; i++)
+    {
+      /* Extract the i-th value from the selector.  */
+      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
+      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
+
+      unsigned int j;
+      if (sel_cst < nelts)
+       j = v_1_lane_map[sel_cst];
+      else
+       j = v_2_lane_map[sel_cst - nelts] + nelts;
+
+      new_sel_perm.quick_push (j);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       {
+         fprintf (dump_file, "%u", j);
+         if (i != (nelts -1))
+           fprintf (dump_file, ", ");
+       }
+    }
+
+  vec_perm_indices new_indices (new_sel_perm, 2, nelts);
+  tree vectype = TREE_TYPE (gimple_assign_lhs (stmt));
+  machine_mode vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, new_indices, false))
+      return false;
+
+  *seq = XNEW (struct _vec_perm_simplify_seq);
+  (*seq)->stmt = stmt;
+  (*seq)->v_1_stmt = v_1_stmt;
+  (*seq)->v_2_stmt = v_2_stmt;
+  (*seq)->v_x_stmt = v_x_stmt;
+  (*seq)->v_y_stmt = v_y_stmt;
+  (*seq)->nelts = nelts;
+  (*seq)->new_sel = vect_gen_perm_mask_checked (vectype, new_indices);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Found vec perm simplify sequence ending with: ");
+      print_gimple_stmt (dump_file, stmt, 0);
+    }
+
+  return true;
+}
+
+/* Reduce the lane consumption of a simplifiable vec perm sequence.  */
+
+static void
+narrow_vec_perm_simplify_seq (const vec_perm_simplify_seq &seq)
+{
+  gassign *stmt = seq->stmt;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, stmt, 0);
+    }
+
+  /* Update the last VEC_PERM statement.  */
+  gimple_assign_set_rhs3 (stmt, seq->new_sel);
+  update_stmt (stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, stmt, 0);
+    }
+}
+
+/* Test if we can blend two simplifiable vec permute sequences.
+   NEED_SWAP will be set if the sequences must be swapped for blending.  */
+
+static bool
+can_blend_vec_perm_simplify_seqs_p (vec_perm_simplify_seq seq1,
+                                   vec_perm_simplify_seq seq2,
+                                   bool *need_swap)
+{
+  unsigned int nelts = seq1->nelts;
+  basic_block bb = gimple_bb (seq1->stmt);
+
+  gcc_assert (gimple_bb (seq2->stmt) == bb);
+
+  /* BBs and number of elements must be equal.  */
+  if (gimple_bb (seq2->stmt) != bb || seq2->nelts != nelts)
+    return false;
+
+  /* We need vectors of the same type.  */
+  if (TREE_TYPE (gimple_assign_lhs (seq1->stmt))
+      != TREE_TYPE (gimple_assign_lhs (seq2->stmt)))
+    return false;
+
+  /* We require isomorphic operators.  */
+  if (((gimple_assign_rhs_code (seq1->v_x_stmt)
+       != gimple_assign_rhs_code (seq2->v_x_stmt))
+       || (gimple_assign_rhs_code (seq1->v_y_stmt)
+          != gimple_assign_rhs_code (seq2->v_y_stmt))))
+    return false;
+
+  /* We cannot have any dependencies between the sequences.
+
+     For merging, we will reuse seq1->v_1_stmt and seq1->v_2_stmt.
+     seq1's v_in is defined before these statements, but we need
+     to check if seq2's v_in is defined before them as well.
+
+     Further, we will reuse seq2->stmt.  We need to ensure that
+     seq1->v_x_stmt and seq1->v_y_stmt are before it.
+
+     Note, that we don't need to check the BBs here, because all
+     statements of both sequences have to be in the same BB.
+     */
+
+  tree seq2_v_in = gimple_assign_rhs1 (seq2->v_1_stmt);
+  if (TREE_CODE (seq2_v_in) != SSA_NAME)
+    return false;
+
+  gassign *seq2_v_in_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (seq2_v_in));
+  if (!seq2_v_in_stmt || gimple_bb (seq2_v_in_stmt) != bb
+      || (gimple_uid (seq2_v_in_stmt) > gimple_uid (seq1->v_1_stmt))
+      || (gimple_uid (seq1->v_x_stmt) > gimple_uid (seq2->stmt))
+      || (gimple_uid (seq1->v_y_stmt) > gimple_uid (seq2->stmt)))
+    {
+      tree seq1_v_in = gimple_assign_rhs1 (seq1->v_1_stmt);
+      if (TREE_CODE (seq1_v_in) != SSA_NAME)
+       return false;
+
+      gassign *seq1_v_in_stmt
+       = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (seq1_v_in));
+      /* Let's try to see if we succeed when swapping the sequences.  */
+      if (!seq1_v_in_stmt || gimple_bb (seq1_v_in_stmt) != bb
+         || (gimple_uid (seq1_v_in_stmt) > gimple_uid (seq2->v_1_stmt))
+         || (gimple_uid (seq2->v_x_stmt) > gimple_uid (seq1->stmt))
+         || (gimple_uid (seq2->v_y_stmt) > gimple_uid (seq1->stmt)))
+       return false;
+      *need_swap = true;
+    }
+  else
+    *need_swap = false;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Found vec perm simplify sequence pair.\n");
+
+  return true;
+}
+
+/* Calculate the permutations for blending the two given vec permute
+   sequences.  This may fail if the resulting permutation is not
+   supported.  */
+
+static bool
+calc_perm_vec_perm_simplify_seqs (vec_perm_simplify_seq seq1,
+                                 vec_perm_simplify_seq seq2,
+                                 vec_perm_indices *seq2_stmt_indices,
+                                 vec_perm_indices *seq1_v_1_stmt_indices,
+                                 vec_perm_indices *seq1_v_2_stmt_indices)
+{
+  unsigned int i;
+  unsigned int nelts = seq1->nelts;
+  auto_vec<int> lane_assignment;
+  lane_assignment.create (2 * nelts);
+
+  /* Mark all lanes as free.  */
+  lane_assignment.quick_grow_cleared (2 * nelts);
+
+  /* Reserve lanes for seq1.  */
+  for (i = 0; i < nelts; i++)
+    {
+      unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i));
+      lane_assignment[l] = 1;
+    }
+
+  /* Reserve lanes for seq2 and calculate selector for seq2->stmt.  */
+  vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1);
+  for (i = 0; i < nelts; i++)
+    {
+      unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
+      while (lane_assignment[l] != 0)
+       l++;
+      lane_assignment[l] = 2;
+      seq2_stmt_sel_perm.quick_push (l);
+    }
+
+  seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts);
+  tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
+  machine_mode vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, *seq2_stmt_indices, false))
+    return false;
+
+  /* Calculate selectors for seq1->v_1_stmt and seq1->v_2_stmt.  */
+  vec_perm_builder seq1_v_1_stmt_sel_perm (nelts, nelts, 1);
+  vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1);
+  for (i = 0; i < nelts; i++)
+    {
+      bool use_seq1 = lane_assignment[i] == 1;
+      tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt
+                                            : seq2->v_1_stmt);
+      tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt
+                                            : seq2->v_2_stmt);
+      unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts;
+      unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts;
+
+      seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts));
+      seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 0 : nelts));
+    }
+
+  seq1_v_1_stmt_indices->new_vector (seq1_v_1_stmt_sel_perm, 2, nelts);
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_1_stmt));
+  vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, *seq1_v_1_stmt_indices, false))
+    return false;
+
+  seq1_v_2_stmt_indices->new_vector (seq1_v_2_stmt_sel_perm, 2, nelts);
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_2_stmt));
+  vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, *seq1_v_2_stmt_indices, false))
+    return false;
+
+  return true;
+}
+
+/* Blend the two given simplifiable vec permute sequences using the
+   given permutations.  */
+
+static void
+blend_vec_perm_simplify_seqs (vec_perm_simplify_seq seq1,
+                             vec_perm_simplify_seq seq2,
+                             const vec_perm_indices &seq2_stmt_indices,
+                             const vec_perm_indices &seq1_v_1_stmt_indices,
+                             const vec_perm_indices &seq1_v_2_stmt_indices)
+{
+  /* We don't need to adjust seq1->stmt because its lanes consumption
+     was already narrowed before entering this function.  */
+
+  /* Adjust seq2->stmt: copy RHS1/RHS2 from seq1->stmt and set new sel.  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, seq2->stmt, 0);
+    }
+
+  gimple_assign_set_rhs1 (seq2->stmt, gimple_assign_rhs1 (seq1->stmt));
+  gimple_assign_set_rhs2 (seq2->stmt, gimple_assign_rhs2 (seq1->stmt));
+  tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
+  tree sel = vect_gen_perm_mask_checked (vectype, seq2_stmt_indices);
+  gimple_assign_set_rhs3 (seq2->stmt, sel);
+  update_stmt (seq2->stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, seq2->stmt, 0);
+    }
+
+  /* Adjust seq1->v_1_stmt: copy RHS2 from seq2->v_1_stmt and set new sel.  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_1_stmt, 0);
+    }
+
+  gimple_assign_set_rhs2 (seq1->v_1_stmt, gimple_assign_rhs1 (seq2->v_1_stmt));
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_1_stmt));
+  sel = vect_gen_perm_mask_checked (vectype, seq1_v_1_stmt_indices);
+  gimple_assign_set_rhs3 (seq1->v_1_stmt, sel);
+  update_stmt (seq1->v_1_stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_1_stmt, 0);
+    }
+
+  /* Adjust seq1->v_2_stmt: copy RHS2 from seq2->v_2_stmt and set new sel.  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_2_stmt, 0);
+    }
+
+  gimple_assign_set_rhs2 (seq1->v_2_stmt, gimple_assign_rhs1 (seq2->v_2_stmt));
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_2_stmt));
+  sel = vect_gen_perm_mask_checked (vectype, seq1_v_2_stmt_indices);
+  gimple_assign_set_rhs3 (seq1->v_2_stmt, sel);
+  update_stmt (seq1->v_2_stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_2_stmt, 0);
+    }
+
+  /* At this point, we have four unmodified seq2 stmts, which will be
+     eliminated by DCE.  */
+
+  if (dump_file)
+    fprintf (dump_file, "Vec perm simplify sequences have been blended.\n\n");
+}
+
+/* Try to blend narrowed vec_perm_simplify_seqs pairwise.
+   The provided list will be empty after this call.  */
+
+static void
+process_vec_perm_simplify_seq_list (vec<vec_perm_simplify_seq> *l)
+{
+  unsigned int i, j;
+  vec_perm_simplify_seq seq1, seq2;
+
+  if (l->is_empty ())
+    return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Processing %u vec perm simplify sequences.\n",
+            l->length ());
+
+  FOR_EACH_VEC_ELT (*l, i, seq1)
+    {
+      if (i + 1 < l->length ())
+       {
+         FOR_EACH_VEC_ELT_FROM (*l, j, seq2, i + 1)
+           {
+             bool swap = false;
+             if (can_blend_vec_perm_simplify_seqs_p (seq1, seq2, &swap))
+               {
+                 vec_perm_indices seq2_stmt_indices;
+                 vec_perm_indices seq1_v_1_stmt_indices;
+                 vec_perm_indices seq1_v_2_stmt_indices;
+                 if (calc_perm_vec_perm_simplify_seqs (swap ? seq2 : seq1,
+                                                       swap ? seq1 : seq2,
+                                                       &seq2_stmt_indices,
+                                                       &seq1_v_1_stmt_indices,
+                                                       &seq1_v_2_stmt_indices))
+                   {
+                     /* Narrow lane usage.  */
+                     narrow_vec_perm_simplify_seq (seq1);
+                     narrow_vec_perm_simplify_seq (seq2);
+
+                     /* Blend sequences.  */
+                     blend_vec_perm_simplify_seqs (swap ? seq2 : seq1,
+                                                   swap ? seq1 : seq2,
+                                                   seq2_stmt_indices,
+                                                   seq1_v_1_stmt_indices,
+                                                   seq1_v_2_stmt_indices);
+
+                     /* We can use unordered_remove as we break the loop.  */
+                     l->unordered_remove (j);
+                     XDELETE (seq2);
+                     break;
+                   }
+               }
+           }
+       }
+
+      /* We don't need to call l->remove for seq1.  */
+      XDELETE (seq1);
+    }
+
+  l->truncate (0);
+}
+
+static void
+append_vec_perm_simplify_seq_list (vec<vec_perm_simplify_seq> *l,
+                                  const vec_perm_simplify_seq &seq)
+{
+  /* If no space on list left, then process the list.  */
+  if (!l->space (1))
+      process_vec_perm_simplify_seq_list (l);
+
+  l->quick_push (seq);
+}
+
 /* Main entry point for the forward propagation and statement combine
    optimizer.  */
 
@@ -3526,6 +4086,7 @@ pass_forwprop::execute (function *fun)
   auto_bitmap simple_dce_worklist;
   auto_bitmap need_ab_cleanup;
   to_purge = BITMAP_ALLOC (NULL);
+  auto_vec<vec_perm_simplify_seq, 8> vec_perm_simplify_seq_list;
   for (int i = 0; i < postorder_num; ++i)
     {
       gimple_stmt_iterator gsi;
@@ -3605,14 +4166,18 @@ pass_forwprop::execute (function *fun)
 
       /* Apply forward propagation to all stmts in the basic-block.
         Note we update GSI within the loop as necessary.  */
+      unsigned int uid = 1;
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); )
        {
          gimple *stmt = gsi_stmt (gsi);
          tree lhs, rhs;
          enum tree_code code;
 
+         gimple_set_uid (stmt, uid++);
+
          if (!is_gimple_assign (stmt))
            {
+             process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
              gsi_next (&gsi);
              continue;
            }
@@ -3620,9 +4185,11 @@ pass_forwprop::execute (function *fun)
          lhs = gimple_assign_lhs (stmt);
          rhs = gimple_assign_rhs1 (stmt);
          code = gimple_assign_rhs_code (stmt);
+
          if (TREE_CODE (lhs) != SSA_NAME
              || has_zero_uses (lhs))
            {
+             process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
              gsi_next (&gsi);
              continue;
            }
@@ -3901,10 +4468,25 @@ pass_forwprop::execute (function *fun)
              else
                gsi_next (&gsi);
            }
+         else if (code == VEC_PERM_EXPR)
+           {
+             /* Find vectorized sequences where we can reduce the lane
+                utilization.  The narrowing will be done later and only
+                if we find a pair of sequences that can be blended.  */
+             gassign *assign = dyn_cast <gassign *> (stmt);
+             vec_perm_simplify_seq seq;
+             if (recognise_vec_perm_simplify_seq (assign, &seq))
+               append_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list,
+                                                  seq);
+
+             gsi_next (&gsi);
+           }
          else
            gsi_next (&gsi);
        }
 
+      process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
+
       /* Combine stmts with the stmts defining their operands.
         Note we update GSI within the loop as necessary.  */
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
