For the testcase in question, which uses a fold-left vectorized
reduction of a reverse-iterating loop, we'd need two forwprop
invocations: the first to bypass the permute emitted for the
reverse-iterating loop and the second to decompose the vector load
that only feeds element extracts.  The following moves the first
transform to a match.pd pattern and makes sure we fold the element
extracts already when the vectorizer emits them, so the single
forwprop pass can then pick up the vector load decomposition,
avoiding the forwarding failure this would otherwise cause.
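
For illustration, the kind of IL the first transform handles is the
example from the comment on the former simplify_bitfield_ref (shown
here as an illustrative sketch, not the exact IL of the PR90579
testcase):

  _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
  _11 = BIT_FIELD_REF <_7, 64, 0>;

which is folded to

  _11 = BIT_FIELD_REF <_1, 64, 64>;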

Moving simplify_bitfield_ref to match.pd also makes forwprop remove
the now dead VEC_PERM_EXPR via the simple DCE it performs - this was
previously missing.
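
On the same illustrative IL as above, once the extract is rewritten
to reference _1 directly, the VEC_PERM_EXPR defining _7 has no uses
left and the simple DCE removes it:

  _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;   <- now dead, removed
  _11 = BIT_FIELD_REF <_1, 64, 64>;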

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

Thanks,
Richard.

        PR tree-optimization/90579
        * tree-ssa-forwprop.cc (simplify_bitfield_ref): Move to
        match.pd.
        (pass_forwprop::execute): Adjust.
        * match.pd (bit_field_ref (vec_perm ...)): New pattern
        modeled after simplify_bitfield_ref.
        * tree-vect-loop.cc (vect_expand_fold_left): Fold the
        element extract stmt, combining it with the vector def.

        * gcc.target/i386/pr90579.c: New testcase.
---
 gcc/match.pd                            |  56 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90579.c |  23 ++++++
 gcc/tree-ssa-forwprop.cc                | 103 +-----------------------
 gcc/tree-vect-loop.cc                   |   5 ++
 4 files changed, 85 insertions(+), 102 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90579.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 20b2aec6f37..ea44201f2eb 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9538,6 +9538,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        (BIT_FIELD_REF { CONSTRUCTOR_ELT (ctor, idx / const_k)->value; }
                       @1 { bitsize_int ((idx % const_k) * width); })))))))))
 
+(simplify
+ (BIT_FIELD_REF (vec_perm@0 @1 @2 VECTOR_CST@3) @rsize @rpos)
+ (with
+  {
+    tree elem_type = TREE_TYPE (TREE_TYPE (@0));
+    poly_uint64 elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
+    poly_uint64 size = tree_to_poly_uint64 (TYPE_SIZE (type));
+    unsigned HOST_WIDE_INT nelts, idx;
+    unsigned HOST_WIDE_INT nelts_op = 0;
+  }
+  (if (constant_multiple_p (tree_to_poly_uint64 (@rpos), elem_size, &idx)
+       && VECTOR_CST_NELTS (@3).is_constant (&nelts)
+       && (known_eq (size, elem_size)
+          || (constant_multiple_p (size, elem_size, &nelts_op)
+              && pow2p_hwi (nelts_op))))
+   (with
+    {
+      bool ok = true;
+      /* One element.  */
+      if (known_eq (size, elem_size))
+        idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (@3, idx)) % (2 * nelts);
+      else
+        {
+         /* Clamp vec_perm_expr index.  */
+         unsigned start
+           = TREE_INT_CST_LOW (vector_cst_elt (@3, idx)) % (2 * nelts);
+         unsigned end
+           = (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + nelts_op - 1))
+              % (2 * nelts));
+         /* Be in the same vector.  */
+         if ((start < nelts) != (end < nelts))
+           ok = false;
+         else
+           for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
+             {
+               /* Continuous area.  */
+               if ((TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i))
+                    % (2 * nelts) - 1)
+                   != (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i - 1))
+                       % (2 * nelts)))
+                 {
+                   ok = false;
+                   break;
+                 }
+             }
+         /* Alignment not worse than before.  */
+         if (start % nelts_op)
+           ok = false;
+         idx = start;
+       }
+    }
+    (if (ok)
+     (if (idx < nelts)
+      (BIT_FIELD_REF @1 @rsize { bitsize_int (idx * elem_size); })
+      (BIT_FIELD_REF @2 @rsize { bitsize_int ((idx - nelts) * elem_size); })))))))
+
 /* Simplify a bit extraction from a bit insertion for the cases with
    the inserted element fully covering the extraction or the insertion
    not touching the extraction.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr90579.c b/gcc/testsuite/gcc.target/i386/pr90579.c
new file mode 100644
index 00000000000..ab48a44063c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90579.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mfpmath=sse" } */
+
+extern double r[6];
+extern double a[];
+
+double
+loop (int k, double x)
+{
+  int i;
+  double t=0;
+  for (i=0;i<6;i++)
+    r[i] = x * a[i + k];
+  for (i=0;i<6;i++)
+    t+=r[5-i];
+  return t;
+}
+
+/* Verify we end up with scalar loads from r for the final sum.  */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+40" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+32" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+24" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+16" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9474682152a..fafc4d6b77a 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
 typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
 
 static bool forward_propagate_addr_expr (tree, tree, bool);
+static void optimize_vector_load (gimple_stmt_iterator *);
 
 /* Set to true if we delete dead edges during the optimization.  */
 static bool cfg_changed;
@@ -2481,106 +2482,6 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
 }
 
 
-/* Combine an element access with a shuffle.  Returns true if there were
-   any changes made, else it returns false.  */
-
-static bool
-simplify_bitfield_ref (gimple_stmt_iterator *gsi)
-{
-  gimple *stmt = gsi_stmt (*gsi);
-  gimple *def_stmt;
-  tree op, op0, op1;
-  tree elem_type, type;
-  tree p, m, tem;
-  unsigned HOST_WIDE_INT nelts, idx;
-  poly_uint64 size, elem_size;
-  enum tree_code code;
-
-  op = gimple_assign_rhs1 (stmt);
-  gcc_checking_assert (TREE_CODE (op) == BIT_FIELD_REF);
-
-  op0 = TREE_OPERAND (op, 0);
-  if (TREE_CODE (op0) != SSA_NAME
-      || TREE_CODE (TREE_TYPE (op0)) != VECTOR_TYPE)
-    return false;
-
-  def_stmt = get_prop_source_stmt (op0, false, NULL);
-  if (!def_stmt || !can_propagate_from (def_stmt))
-    return false;
-
-  op1 = TREE_OPERAND (op, 1);
-  code = gimple_assign_rhs_code (def_stmt);
-  elem_type = TREE_TYPE (TREE_TYPE (op0));
-  type = TREE_TYPE (op);
-  /* Also handle vector type.
-     .i.e.
-     _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
-     _11 = BIT_FIELD_REF <_7, 64, 0>;
-
-     to
-
-     _11 = BIT_FIELD_REF <_1, 64, 64>.  */
-
-  size = tree_to_poly_uint64 (TYPE_SIZE (type));
-  if (maybe_ne (bit_field_size (op), size))
-    return false;
-
-  elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
-  if (code != VEC_PERM_EXPR
-      || !constant_multiple_p (bit_field_offset (op), elem_size, &idx))
-    return false;
-
-  m = gimple_assign_rhs3 (def_stmt);
-  if (TREE_CODE (m) != VECTOR_CST
-      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
-    return false;
-
-  /* One element.  */
-  if (known_eq (size, elem_size))
-    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx)) % (2 * nelts);
-  else
-    {
-      unsigned HOST_WIDE_INT nelts_op;
-      if (!constant_multiple_p (size, elem_size, &nelts_op)
-         || !pow2p_hwi (nelts_op))
-       return false;
-      /* Clamp vec_perm_expr index.  */
-      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx)) % (2 * nelts);
-      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1))
-                    % (2 * nelts);
-      /* Be in the same vector.  */
-      if ((start < nelts) != (end < nelts))
-       return false;
-      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
-       {
-         /* Continuous area.  */
-         if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) % (2 * nelts) - 1
-             != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1))
-                % (2 * nelts))
-           return false;
-       }
-      /* Alignment not worse than before.  */
-      if (start % nelts_op)
-       return false;
-      idx = start;
-    }
-
-  if (idx < nelts)
-    p = gimple_assign_rhs1 (def_stmt);
-  else
-    {
-      p = gimple_assign_rhs2 (def_stmt);
-      idx -= nelts;
-    }
-
-  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
-               p, op1, bitsize_int (idx * elem_size));
-  gimple_assign_set_rhs1 (stmt, tem);
-  fold_stmt (gsi);
-  update_stmt (gsi_stmt (*gsi));
-  return true;
-}
-
 /* Determine whether applying the 2 permutations (mask1 then mask2)
    gives back one of the input.  */
 
@@ -4677,8 +4578,6 @@ pass_forwprop::execute (function *fun)
                          cfg_changed = true;
                        changed = did_something != 0;
                      }
-                   else if (code == BIT_FIELD_REF)
-                     changed = simplify_bitfield_ref (&gsi);
                    else if (code == CONSTRUCTOR
                             && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
                      changed = simplify_vector_constructor (&gsi);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index eea0b89db69..07b19a22d42 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7086,6 +7086,11 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
       rhs = make_ssa_name (scalar_dest, stmt);
       gimple_assign_set_lhs (stmt, rhs);
       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+      /* Fold the vector extract, combining it with a previous reversal
+        like seen in PR90579.  */
+      auto gsi2 = gsi_for_stmt (stmt);
+      if (fold_stmt (&gsi2, follow_all_ssa_edges))
+       update_stmt (gsi_stmt (gsi2));
 
       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
       tree new_name = make_ssa_name (scalar_dest, stmt);
-- 
2.43.0
