Previously, only simple lane-reducing case is supported, in which one loop
reduction statement forms one pattern match:

  char *d0, *d1, *s0, *s1, *w;
  for (i) {
    sum += d0[i] * d1[i];      // sum = DOT_PROD(d0, d1, sum);
    sum += abs(s0[i] - s1[i]); // sum = SAD(s0, s1, sum);
    sum += w[i];               // sum = WIDEN_SUM(w, sum);
  }

This patch removes the limitation of the current lane-reducing matching
strategy, and extends the candidate scope to the whole loop reduction affine
closure.  Thus, we could apply as many lane-reducing operations as possible
in a reduction, which ends up with generalized pattern recognition as ("opX"
denotes an operation for a lane-reducing pattern):

 for (i)
   sum += cst0 * op0 + cst1 * op1 + ... + cstN * opN + h(i);

A lane-reducing operation contains two aspects: the main primitive operation
and the appendant result-accumulation.  The original design handles matching
of the compound semantics in a single pattern, but this approach is not
suitable for an operation that does not directly participate in the loop
reduction.  In this patch, we only focus on the basic aspect, and leave the
rest to be covered by another patch.  An example with dot-product:

    sum = DOT_PROD(d0, d1, sum);       // original
    sum = DOT_PROD(d0, d1, 0) + sum;   // now

Thanks,
Feng
---
gcc/
        * tree-vect-patterns.cc (vect_reassociating_reduction_p): Remove the
        function.
        (vect_recog_dot_prod_pattern): Relax check to allow any statement in
        reduction affine closure.
        (vect_recog_sad_pattern): Likewise.
        (vect_recog_widen_sum_pattern): Likewise.  Use dot-product if
        widen-sum is not supported.
        (vect_vect_recog_func_ptrs): Move lane-reducing patterns to the topmost.

gcc/testsuite/
        * gcc.dg/vect/vect-reduc-affine-1.c: New test.
        * gcc.dg/vect/vect-reduc-affine-2.c: New test.
        * gcc.dg/vect/vect-reduc-affine-slp-1.c: New test.
---
 .../gcc.dg/vect/vect-reduc-affine-1.c         | 112 ++++++
 .../gcc.dg/vect/vect-reduc-affine-2.c         |  81 +++++
 .../gcc.dg/vect/vect-reduc-affine-slp-1.c     |  74 ++++
 gcc/tree-vect-patterns.cc                     | 321 ++++++------------
 4 files changed, 372 insertions(+), 216 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c
new file mode 100644
index 00000000000..a5e99ce703b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c
@@ -0,0 +1,112 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#define FN(name, S1, S2)                               \
+S1 int __attribute__ ((noipa))                         \
+name (S1 int res,                                      \
+      S2 char *restrict a,                             \
+      S2 char *restrict b,                             \
+      S2 int *restrict c,                              \
+      S2 int cst1,                                     \
+      S2 int cst2,                                     \
+      int shift)                                       \
+{                                                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] + 16;                           \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] + cst1;                         \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] + c[i];                         \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] * 23;                           \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] << 6;                           \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] * cst2;                         \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += a[i] * b[i] << shift;                       \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += cst1 * 5 - a[i] * b[i];                     \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    res += ~(((a[i] * b[i] + 3) << shift) - c[i]);     \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  for (int i = 0; i < N; i++)                          \
+    {                                                  \
+      S2 int t = a[i] * b[i];                          \
+      res += (t * cst2) + ~((t - cst1) << 3);          \
+    }                                                  \
+                                                       \
+  asm volatile ("" ::: "memory");                      \
+  S1 int res1 = 1;                                     \
+  S1 int res2 = 2;                                     \
+  for (int i = 0; i < N; i++)                          \
+    {                                                  \
+      S2 int t = a[i] * b[i];                          \
+      res1 += (t * cst2) + 18;                         \
+      res2 += (t - cst1) << shift;                     \
+    }                                                  \
+  res += res1 ^ res2;                                  \
+  return res;                                          \
+}
+
+FN(f1_vec_s, signed, signed)
+FN(f1_vec_u, unsigned, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec_s, signed, signed)
+FN(f1_novec_u, unsigned, signed)
+#pragma GCC pop_options
+
+#define BASE ((int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  signed char a[N], b[N];
+  int c[N];
+
+  #pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      c[i] = i;
+    }
+
+  if (f1_vec_s (0x12345, a, b, c, -5, 17, 3) != f1_novec_s (0x12345, a, b, c, 
-5, 17, 3))
+    __builtin_abort ();
+
+  if (f1_vec_u (0x12345, a, b, c, -5, 17, 3) != f1_novec_u (0x12345, a, b, c, 
-5, 17, 3))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" 
} } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = 
DOT_PROD_EXPR" 20 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c
new file mode 100644
index 00000000000..a160bc72082
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c
@@ -0,0 +1,81 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+#define FN(name, S1, S2, S3, S4)                                               
\
+S1 int __attribute__ ((noipa))                                                 
\
+name (S1 int res,                                                              
\
+   S2 char *restrict a,                                                        
        \
+   S2 char *restrict b,                                                        
        \
+   S3 char *restrict c,                                                        
        \
+   S3 char *restrict d,                                                        
        \
+   S4 short *restrict e,                                                       
\
+   S4 short *restrict f,                                                       
\
+   S1 int *restrict g,                                                         
\
+   S1 int cst1)                                                                
        \
+{                                                                              
\
+  for (int i = 0; i < N; ++i)                                                  
\
+    {                                                                          
\
+      short diff = a[i] - b[i];                                                
        \
+      S2 short abs = diff < 0 ? -diff : diff;                                  
\
+      res += ((abs + i) << 3) - (c[i] + 1) * cst1 + d[i] * 3 + e[i]  - g[i];   
\
+    }                                                                          
\
+                                                                               
\
+  return res;                                                                  
\
+}
+
+FN(f1_vec, signed, unsigned, signed, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec, signed, unsigned, signed, signed)
+#pragma GCC pop_options
+
+#define BASE2 ((unsigned int) -1 < 0 ? -126 : 4)
+#define BASE3 ((signed int) -1 < 0 ? -126 : 4)
+#define BASE4 ((signed int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  unsigned char a[N], b[N];
+  signed char c[N], d[N];
+  signed short e[N], f[N];
+  signed int g[N];
+
+#pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = BASE4 + i * 6;
+      f[i] = BASE4 + OFFSET + i * 5;
+      g[i] = i;
+    }
+
+  if (f1_vec (0x12345, a, b, c, d, e, f, g, 17) != f1_novec (0x12345, a, b, c, 
d, e, f, g, 17))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" 
} } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
"vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
"vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
"vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c
new file mode 100644
index 00000000000..0e76536925e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c
@@ -0,0 +1,74 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 100
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+#define FN(name, S1, S2)                                       \
+S1 int __attribute__ ((noipa))                                 \
+name (S1 int res,                                              \
+      S2 char *restrict a,                                     \
+      S2 char *restrict b,                                     \
+      S2 short *restrict c,                                    \
+      S2 int *restrict d,                                      \
+      S1 int cst1,                                             \
+      S1 int cst2)                                             \
+{                                                              \
+  for (int i = 0; i < N / 2; ++i)                              \
+    {                                                          \
+      res += ~((a[2 * i + 0] * b[2 * i + 0] + 1) << 3)         \
+            - (c[2 * i + 0] + cst1) * cst2 + d[2 * i + 0];     \
+      res += ~((a[2 * i + 1] * b[2 * i + 1] + 1) << 3)         \
+            - (c[2 * i + 1] + cst1) * cst2 + d[2 * i + 1];     \
+    }                                                          \
+                                                               \
+  return res;                                                  \
+}
+
+FN(f1_vec, signed, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec, signed, signed)
+#pragma GCC pop_options
+
+#define BASE2 ((signed int) -1 < 0 ? -126 : 4)
+#define BASE3 ((signed int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  signed char a[N], b[N];
+  signed short c[N];
+  signed int d[N];
+
+#pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 6;
+      d[i] = i;
+    }
+
+  if (f1_vec (0x12345, a, b, c, d, -5, 17) != f1_novec (0x12345, a, b, c, d, 
-5, 17))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" 
} } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
"vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
"vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 02f6b942026..bb037af0b68 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1029,54 +1029,6 @@ vect_convert_output (vec_info *vinfo, stmt_vec_info 
stmt_info, tree type,
   return pattern_stmt;
 }
 
-/* Return true if STMT_VINFO describes a reduction for which reassociation
-   is allowed.  If STMT_INFO is part of a group, assume that it's part of
-   a reduction chain and optimistically assume that all statements
-   except the last allow reassociation.
-   Also require it to have code CODE and to be a reduction
-   in the outermost loop.  When returning true, store the operands in
-   *OP0_OUT and *OP1_OUT.  */
-
-static bool
-vect_reassociating_reduction_p (vec_info *vinfo,
-                               stmt_vec_info stmt_info, tree_code code,
-                               tree *op0_out, tree *op1_out)
-{
-  loop_vec_info loop_info = dyn_cast <loop_vec_info> (vinfo);
-  if (!loop_info)
-    return false;
-
-  /* As a candidate of lane-reducing pattern matching, the statement must
-     be inside affine closure of loop reduction.  */
-  if (!(stmt_info->reduc_pattern_status & rpatt_allow))
-    return false;
-
-  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
-  if (!assign || gimple_assign_rhs_code (assign) != code)
-    return false;
-
-  /* We don't allow changing the order of the computation in the inner-loop
-     when doing outer-loop vectorization.  */
-  class loop *loop = LOOP_VINFO_LOOP (loop_info);
-  if (loop && nested_in_vect_loop_p (loop, stmt_info))
-    return false;
-
-  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
-    {
-      if (needs_fold_left_reduction_p (TREE_TYPE (gimple_assign_lhs (assign)),
-                                      code))
-       return false;
-    }
-  else if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == NULL)
-    return false;
-
-  *op0_out = gimple_assign_rhs1 (assign);
-  *op1_out = gimple_assign_rhs2 (assign);
-  if (commutative_tree_code (code) && STMT_VINFO_REDUC_IDX (stmt_info) == 0)
-    std::swap (*op0_out, *op1_out);
-  return true;
-}
-
 /* match.pd function to match
    (cond (cmp@3 a b) (convert@1 c) (convert@2 d))
    with conditions:
@@ -1189,96 +1141,60 @@ vect_recog_cond_expr_convert_pattern (vec_info *vinfo,
      S3  x_T = (TYPE1) x_t;
      S4  y_T = (TYPE1) y_t;
      S5  prod = x_T * y_T;
-     [S6  prod = (TYPE2) prod;  #optional]
-     S7  sum_1 = prod + sum_0;
+     [S6+ value = affine_fn (prod, ...);  #optional]
+     S7  sum_1 = value + sum_0;
 
-   where 'TYPE1' is exactly double the size of type 'type1a' and 'type1b',
-   the sign of 'TYPE1' must be one of 'type1a' or 'type1b' but the sign of
-   'type1a' and 'type1b' can differ.
+   There exists a natural widening conversion from both 'type1a' and 'type1b'
+   to 'TYPE1'.  The function 'affine_fn' represents a linear transform in
+   concept of math, and may be composed of a series of statements.
 
    Input:
 
    * STMT_VINFO: The stmt from which the pattern search begins.  In the
-   example, when this function is called with S7, the pattern {S3,S4,S5,S6,S7}
-   will be detected.
+   example, when this function is called with S5, the pattern {S3,S4,S5} will
+   be detected if S5 is known to be in affine closure of reduction for 'sum'.
 
    Output:
 
-   * TYPE_OUT: The type of the output  of this pattern.
+   * TYPE_OUT: The type of the output of this pattern.
 
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
-        WIDEN_DOT_PRODUCT <x_t, y_t, sum_0>
+       DOT_PROD_EXPR <x_t, y_t, 0>
 
    Note: The dot-prod idiom is a widening reduction pattern that is
-         vectorized without preserving all the intermediate results. It
-         produces only N/2 (widened) results (by summing up pairs of
-         intermediate results) rather than all N results.  Therefore, we
-         cannot allow this pattern when we want to get all the results and in
-         the correct order (as is the case when this computation is in an
-         inner-loop nested in an outer-loop that us being vectorized).  */
+        vectorized without preserving all the intermediate results. It
+        produces less than N (widened) results (by summing up pairs of
+        intermediate results) rather than all N results.  Therefore, we
+        cannot allow this pattern when we want to get all the results and in
+        the correct order (as is the case when this computation is in an
+        inner-loop nested in an outer-loop that is being vectorized).  */
 
 static gimple *
 vect_recog_dot_prod_pattern (vec_info *vinfo,
                             stmt_vec_info stmt_vinfo, tree *type_out)
 {
-  tree oprnd0, oprnd1;
-  gimple *last_stmt = stmt_vinfo->stmt;
-  tree type, half_type;
-  gimple *pattern_stmt;
-  tree var;
-
-  /* Look for the following pattern
-          DX = (TYPE1) X;
-          DY = (TYPE1) Y;
-          DPROD = DX * DY;
-          DDPROD = (TYPE2) DPROD;
-          sum_1 = DDPROD + sum_0;
-     In which
-     - DX is double the size of X
-     - DY is double the size of Y
-     - DX, DY, DPROD all have the same type but the sign
-       between X, Y and DPROD can differ.
-     - sum is the same size of DPROD or bigger
-     - sum has been recognized as a reduction variable.
-
-     This is equivalent to:
-       DPROD = X w* Y;          #widen mult
-       sum_1 = DPROD w+ sum_0;  #widen summation
-     or
-       DPROD = X w* Y;          #widen mult
-       sum_1 = DPROD + sum_0;   #summation
-   */
-
-  /* Starting from LAST_STMT, follow the defs of its uses in search
-     of the above pattern.  */
-
-  if (!vect_reassociating_reduction_p (vinfo, stmt_vinfo, PLUS_EXPR,
-                                      &oprnd0, &oprnd1))
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_allow))
     return NULL;
 
-  type = TREE_TYPE (gimple_get_lhs (last_stmt));
-
+  gimple *last_stmt = stmt_vinfo->stmt;
+  tree value = gimple_get_lhs (last_stmt);
+  tree type = TREE_TYPE (value);
+  tree half_type;
   vect_unpromoted_value unprom_mult;
-  oprnd0 = vect_look_through_possible_promotion (vinfo, oprnd0, &unprom_mult);
 
-  /* So far so good.  Since last_stmt was detected as a (summation) reduction,
-     we know that oprnd1 is the reduction variable (defined by a loop-header
-     phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
-     Left to check that oprnd0 is defined by a (widen_)mult_expr  */
-  if (!oprnd0)
+  value = vect_look_through_possible_promotion (vinfo, value, &unprom_mult);
+  if (!value)
     return NULL;
 
-  stmt_vec_info mult_vinfo = vect_get_internal_def (vinfo, oprnd0);
+  stmt_vec_info mult_vinfo = vect_get_internal_def (vinfo, value);
   if (!mult_vinfo)
     return NULL;
 
-  /* FORNOW.  Can continue analyzing the def-use chain when this stmt in a phi
-     inside the loop (in case we are analyzing an outer-loop).  */
-  vect_unpromoted_value unprom0[2];
+  vect_unpromoted_value unprom[2];
   enum optab_subtype subtype = optab_vector;
   if (!vect_widened_op_tree (vinfo, mult_vinfo, MULT_EXPR, WIDEN_MULT_EXPR,
-                            false, 2, unprom0, &half_type, &subtype))
+                            false, 2, unprom, &half_type, &subtype))
     return NULL;
 
   /* If there are two widening operations, make sure they agree on the sign
@@ -1318,16 +1234,15 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
   /* Get the inputs in the appropriate types.  */
   tree mult_oprnd[2];
   vect_convert_inputs (vinfo, stmt_vinfo, 2, mult_oprnd, half_type,
-                      unprom0, half_vectype, subtype);
-
-  var = vect_recog_temp_ssa_var (type, NULL);
-  pattern_stmt = gimple_build_assign (var, DOT_PROD_EXPR,
-                                     mult_oprnd[0], mult_oprnd[1], oprnd1);
+                      unprom, half_vectype, subtype);
 
+  tree var = vect_recog_temp_ssa_var (type, NULL);
+  gimple *pattern_stmt = gimple_build_assign (var, DOT_PROD_EXPR,
+                                             mult_oprnd[0], mult_oprnd[1],
+                                             build_zero_cst (type));
   return pattern_stmt;
 }
 
-
 /* Function vect_recog_sad_pattern
 
    Try to find the following Sum of Absolute Difference (SAD) pattern:
@@ -1343,18 +1258,20 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
      S4  y_T = (TYPE1) y_t;
      S5  diff = x_T - y_T;
      S6  abs_diff = ABS_EXPR <diff>;
-     [S7  abs_diff = (TYPE2) abs_diff;  #optional]
-     S8  sum_1 = abs_diff + sum_0;
+     [S7+ value = affine_fn (abs_diff, ...);  #optional]
+     S8  sum_1 = value + sum_0;
 
    where 'TYPE1' is at least double the size of type 'type', and 'TYPE2' is the
-   same size of 'TYPE1' or bigger. This is a special case of a reduction
-   computation.
+   same size of 'TYPE1' or bigger.  The function 'affine_fn' represents a
+   linear transform in concept of math, and may be composed of a series of
+   statements.  This is a special case of a reduction computation.
 
    Input:
 
    * STMT_VINFO: The stmt from which the pattern search begins.  In the
-   example, when this function is called with S8, the pattern
-   {S3,S4,S5,S6,S7,S8} will be detected.
+   example, when this function is called with S6, the pattern {S3,S4,S5,S6}
+   will be detected if S6 is known to be in affine closure of reduction for
+   'sum'.
 
    Output:
 
@@ -1362,49 +1279,24 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
 
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
-        SAD_EXPR <x_t, y_t, sum_0>
+       SAD_EXPR <x_t, y_t, 0>
   */
 
 static gimple *
 vect_recog_sad_pattern (vec_info *vinfo,
                        stmt_vec_info stmt_vinfo, tree *type_out)
 {
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_allow))
+    return NULL;
+
   gimple *last_stmt = stmt_vinfo->stmt;
   tree half_type;
 
-  /* Look for the following pattern
-          DX = (TYPE1) X;
-          DY = (TYPE1) Y;
-          DDIFF = DX - DY;
-          DAD = ABS_EXPR <DDIFF>;
-          DDPROD = (TYPE2) DPROD;
-          sum_1 = DAD + sum_0;
-     In which
-     - DX is at least double the size of X
-     - DY is at least double the size of Y
-     - DX, DY, DDIFF, DAD all have the same type
-     - sum is the same size of DAD or bigger
-     - sum has been recognized as a reduction variable.
-
-     This is equivalent to:
-       DDIFF = X w- Y;          #widen sub
-       DAD = ABS_EXPR <DDIFF>;
-       sum_1 = DAD w+ sum_0;    #widen summation
-     or
-       DDIFF = X w- Y;          #widen sub
-       DAD = ABS_EXPR <DDIFF>;
-       sum_1 = DAD + sum_0;     #summation
-   */
-
   /* Starting from LAST_STMT, follow the defs of its uses in search
      of the above pattern.  */
 
-  tree plus_oprnd0, plus_oprnd1;
-  if (!vect_reassociating_reduction_p (vinfo, stmt_vinfo, PLUS_EXPR,
-                                      &plus_oprnd0, &plus_oprnd1))
-    return NULL;
-
-  tree sum_type = TREE_TYPE (gimple_get_lhs (last_stmt));
+  tree value = gimple_get_lhs (last_stmt);
+  tree type = TREE_TYPE (value);
 
   /* Any non-truncating sequence of conversions is OK here, since
      with a successful match, the result of the ABS(U) is known to fit
@@ -1412,23 +1304,15 @@ vect_recog_sad_pattern (vec_info *vinfo,
      negative of the minimum signed value due to the range of the widening
      MINUS_EXPR.)  */
   vect_unpromoted_value unprom_abs;
-  plus_oprnd0 = vect_look_through_possible_promotion (vinfo, plus_oprnd0,
-                                                     &unprom_abs);
-
-  /* So far so good.  Since last_stmt was detected as a (summation) reduction,
-     we know that plus_oprnd1 is the reduction variable (defined by a 
loop-header
-     phi), and plus_oprnd0 is an ssa-name defined by a stmt in the loop body.
-     Then check that plus_oprnd0 is defined by an abs_expr.  */
 
-  if (!plus_oprnd0)
+  value = vect_look_through_possible_promotion (vinfo, value, &unprom_abs);
+  if (!value)
     return NULL;
 
-  stmt_vec_info abs_stmt_vinfo = vect_get_internal_def (vinfo, plus_oprnd0);
+  stmt_vec_info abs_stmt_vinfo = vect_get_internal_def (vinfo, value);
   if (!abs_stmt_vinfo)
     return NULL;
 
-  /* FORNOW.  Can continue analyzing the def-use chain when this stmt in a phi
-     inside the loop (in case we are analyzing an outer-loop).  */
   gassign *abs_stmt = dyn_cast <gassign *> (abs_stmt_vinfo->stmt);
   vect_unpromoted_value unprom[2];
 
@@ -1467,22 +1351,22 @@ vect_recog_sad_pattern (vec_info *vinfo,
                                            unprom, NULL))
     return NULL;
 
-  vect_pattern_detected ("vect_recog_sad_pattern", last_stmt);
-
   tree half_vectype;
-  if (!vect_supportable_direct_optab_p (vinfo, sum_type, SAD_EXPR, half_type,
+  if (!vect_supportable_direct_optab_p (vinfo, type, SAD_EXPR, half_type,
                                        type_out, &half_vectype))
     return NULL;
 
+  vect_pattern_detected ("vect_recog_sad_pattern", last_stmt);
+
   /* Get the inputs to the SAD_EXPR in the appropriate types.  */
   tree sad_oprnd[2];
   vect_convert_inputs (vinfo, stmt_vinfo, 2, sad_oprnd, half_type,
                       unprom, half_vectype);
 
-  tree var = vect_recog_temp_ssa_var (sum_type, NULL);
+  tree var = vect_recog_temp_ssa_var (type, NULL);
   gimple *pattern_stmt = gimple_build_assign (var, SAD_EXPR, sad_oprnd[0],
-                                             sad_oprnd[1], plus_oprnd1);
-
+                                             sad_oprnd[1],
+                                             build_zero_cst (type));
   return pattern_stmt;
 }
 
@@ -2492,30 +2376,35 @@ vect_recog_pow_pattern (vec_info *vinfo,
      TYPE x_T, sum = init;
    loop:
      sum_0 = phi <init, sum_1>
-     S1  x_t = *p;
+     S1  x_t = ...;
      S2  x_T = (TYPE) x_t;
-     S3  sum_1 = x_T + sum_0;
+     [S3+ value = affine_fn (x_T, ...);  #optional]
+     S4  sum_1 = value + sum_0;
 
    where type 'TYPE' is at least double the size of type 'type', i.e - we're
-   summing elements of type 'type' into an accumulator of type 'TYPE'. This is
-   a special case of a reduction computation.
+   summing elements of type 'type' into an accumulator of type 'TYPE'.  The
+   function 'affine_fn' represents a linear transform in concept of math, and
+   may be composed by a series of statements.  This is a special case of a
+   reduction computation.
 
    Input:
 
    * STMT_VINFO: The stmt from which the pattern search begins. In the example,
-   when this function is called with S3, the pattern {S2,S3} will be detected.
+   when this function is called with S2, the pattern {S2} will be detected if
+   S2 is known to be in affine closure of reduction for 'sum'.
 
    Output:
 
    * TYPE_OUT: The type of the output of this pattern.
 
    * Return value: A new stmt that will be used to replace the sequence of
-   stmts that constitute the pattern. In this case it will be:
-        WIDEN_SUM <x_t, sum_0>
+   stmts that constitute the pattern.  In this case it will be
+   WIDEN_SUM_EXPR <x_t, 0> if the operation is supported by target, otherwise,
+   DOT_PROD_EXPR <x_t, 1, 0> if dot-product could be used.
 
    Note: The widening-sum idiom is a widening reduction pattern that is
         vectorized without preserving all the intermediate results. It
-         produces only N/2 (widened) results (by summing up pairs of
+        produces less than N (widened) results (by summing up pairs of
         intermediate results) rather than all N results.  Therefore, we
         cannot allow this pattern when we want to get all the results and in
         the correct order (as is the case when this computation is in an
@@ -2525,49 +2414,42 @@ static gimple *
 vect_recog_widen_sum_pattern (vec_info *vinfo,
                              stmt_vec_info stmt_vinfo, tree *type_out)
 {
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_allow))
+    return NULL;
+
   gimple *last_stmt = stmt_vinfo->stmt;
-  tree oprnd0, oprnd1;
-  tree type;
-  gimple *pattern_stmt;
+  tree value = gimple_get_lhs (last_stmt);
+  tree type = TREE_TYPE (value);
+  gimple *pattern_stmt = NULL;
+  vect_unpromoted_value unprom;
   tree var;
 
-  /* Look for the following pattern
-          DX = (TYPE) X;
-          sum_1 = DX + sum_0;
-     In which DX is at least double the size of X, and sum_1 has been
-     recognized as a reduction variable.
-   */
-
-  /* Starting from LAST_STMT, follow the defs of its uses in search
-     of the above pattern.  */
-
-  if (!vect_reassociating_reduction_p (vinfo, stmt_vinfo, PLUS_EXPR,
-                                      &oprnd0, &oprnd1)
-      || TREE_CODE (oprnd0) != SSA_NAME
-      || !vinfo->lookup_def (oprnd0))
+  /* Check that value is defined by a widening cast.  */
+  if (!vect_look_through_possible_promotion (vinfo, value, &unprom)
+      || TYPE_PRECISION (unprom.type) * 2 > TYPE_PRECISION (type))
     return NULL;
 
-  type = TREE_TYPE (gimple_get_lhs (last_stmt));
-
-  /* So far so good.  Since last_stmt was detected as a (summation) reduction,
-     we know that oprnd1 is the reduction variable (defined by a loop-header
-     phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
-     Left to check that oprnd0 is defined by a cast from type 'type' to type
-     'TYPE'.  */
-
-  vect_unpromoted_value unprom0;
-  if (!vect_look_through_possible_promotion (vinfo, oprnd0, &unprom0)
-      || TYPE_PRECISION (unprom0.type) * 2 > TYPE_PRECISION (type))
+  /* TODO: Support widening-sum on boolean value.  */
+  if (TREE_CODE (unprom.type) != INTEGER_TYPE)
     return NULL;
 
-  vect_pattern_detected ("vect_recog_widen_sum_pattern", last_stmt);
-
-  if (!vect_supportable_direct_optab_p (vinfo, type, WIDEN_SUM_EXPR,
-                                       unprom0.type, type_out))
-    return NULL;
+  if (vect_supportable_direct_optab_p (vinfo, type, WIDEN_SUM_EXPR,
+                                      unprom.type, type_out))
+    {
+      var = vect_recog_temp_ssa_var (type, NULL);
+      pattern_stmt = gimple_build_assign (var, WIDEN_SUM_EXPR, unprom.op,
+                                         build_zero_cst (type));
+    }
+  else if (vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR,
+                                           unprom.type, type_out))
+    {
+      var = vect_recog_temp_ssa_var (type, NULL);
+      pattern_stmt = gimple_build_assign (var, DOT_PROD_EXPR, unprom.op,
+                                         build_one_cst (unprom.type),
+                                         build_zero_cst (type));
+    }
 
-  var = vect_recog_temp_ssa_var (type, NULL);
-  pattern_stmt = gimple_build_assign (var, WIDEN_SUM_EXPR, unprom0.op, oprnd1);
+  vect_pattern_detected ("vect_recog_widen_sum_pattern", last_stmt);
 
   return pattern_stmt;
 }
@@ -7191,8 +7073,18 @@ struct vect_recog_func
 
 /* Note that ordering matters - the first pattern matching on a stmt is
    taken which means usually the more complex one needs to preceed the
-   less comples onex (widen_sum only after dot_prod or sad for example).  */
+   less complex ones (widen_sum only after dot_prod or sad for example).  */
 static vect_recog_func vect_vect_recog_func_ptrs[] = {
+
+  /* Lane-reducing patterns (dot_prod/sad/widen_sum) are not the sort of
+     local statement-based patterns, in that they require knowledge of
+     loop structure.  Naturally, it is anticipated that these patterns
+     would benefit loop vectorization much more than peephole-like
+     patterns.  So give lane-reducing patterns overriding priority.  */
+  { vect_recog_dot_prod_pattern, "dot_prod" },
+  { vect_recog_sad_pattern, "sad" },
+  { vect_recog_widen_sum_pattern, "widen_sum" },
+
   { vect_recog_bitfield_ref_pattern, "bitfield_ref" },
   { vect_recog_bit_insert_pattern, "bit_insert" },
   { vect_recog_abd_pattern, "abd" },
@@ -7204,9 +7096,6 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_mulhs_pattern, "mult_high" },
   { vect_recog_cast_forwprop_pattern, "cast_forwprop" },
   { vect_recog_widen_mult_pattern, "widen_mult" },
-  { vect_recog_dot_prod_pattern, "dot_prod" },
-  { vect_recog_sad_pattern, "sad" },
-  { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
   { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
   { vect_recog_ctz_ffs_pattern, "ctz_ffs" },
-- 
2.17.1
From 548026f343a3291a38cdf06575046be5d85fe33d Mon Sep 17 00:00:00 2001
From: Feng Xue <f...@os.amperecomputing.com>
Date: Fri, 14 Jun 2024 15:45:26 +0800
Subject: [PATCH 4/5] vect: Extend lane-reducing patterns to non-loop-reduction
 statement

Previously, only simple lane-reducing case is supported, in which one loop
reduction statement forms one pattern match:

  char *d0, *d1, *s0, *s1, *w;
  for (i) {
    sum += d0[i] * d1[i];      // sum = DOT_PROD(d0, d1, sum);
    sum += abs(s0[i] - s1[i]); // sum = SAD(s0, s1, sum);
    sum += w[i];               // sum = WIDEN_SUM(w, sum);
  }

This patch removes the limitation of the current lane-reducing matching
strategy, and extends the candidate scope to the whole loop reduction affine
closure.  Thus, we could apply as many lane-reducing operations as possible
to the reduction, which ends up with the generalized pattern recognition
below ("opX" denotes an operation for a lane-reducing pattern):

 for (i)
   sum += cst0 * op0 + cst1 * op1 + ... + cstN * opN + h(i);

A lane-reducing operation contains two aspects: the main primitive operation
and the appended result-accumulation.  The original design matches the
compound semantics in a single pattern, but that approach is not suitable for
an operation that does not directly participate in the loop reduction.  In
this patch, we only focus on the basic aspect, and leave another patch to
cover the rest.  An example with dot-product:

    sum = DOT_PROD(d0, d1, sum);       // original
    sum = DOT_PROD(d0, d1, 0) + sum;   // now

2024-06-14  Feng Xue  <f...@os.amperecomputing.com>

gcc/
	* tree-vect-patterns.cc (vect_reassociating_reduction_p): Remove the
	function.
	(vect_recog_dot_prod_pattern): Relax check to allow any statement in
	the reduction affine closure.
	(vect_recog_sad_pattern): Likewise.
	(vect_recog_widen_sum_pattern): Likewise.  And use dot-product if
	widen-sum is not supported.
	(vect_vect_recog_func_ptrs): Move lane-reducing patterns to the top.

gcc/testsuite/
	* gcc.dg/vect/vect-reduc-affine-1.c: New test.
	* gcc.dg/vect/vect-reduc-affine-2.c: New test.
	* gcc.dg/vect/vect-reduc-affine-slp-1.c: New test.
---
 .../gcc.dg/vect/vect-reduc-affine-1.c         | 112 ++++++
 .../gcc.dg/vect/vect-reduc-affine-2.c         |  81 +++++
 .../gcc.dg/vect/vect-reduc-affine-slp-1.c     |  74 ++++
 gcc/tree-vect-patterns.cc                     | 321 ++++++------------
 4 files changed, 372 insertions(+), 216 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c
new file mode 100644
index 00000000000..a5e99ce703b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-1.c
@@ -0,0 +1,112 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#define FN(name, S1, S2)				\
+S1 int __attribute__ ((noipa))				\
+name (S1 int res,					\
+      S2 char *restrict a,				\
+      S2 char *restrict b,				\
+      S2 int *restrict c,				\
+      S2 int cst1,					\
+      S2 int cst2,					\
+      int shift)					\
+{							\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] + 16;				\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] + cst1;				\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] + c[i];				\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] * 23;				\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] << 6;				\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] * cst2;				\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += a[i] * b[i] << shift;			\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += cst1 * 5 - a[i] * b[i];			\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    res += ~(((a[i] * b[i] + 3) << shift) - c[i]);	\
+							\
+  asm volatile ("" ::: "memory");			\
+  for (int i = 0; i < N; i++)				\
+    {							\
+      S2 int t = a[i] * b[i];				\
+      res += (t * cst2) + ~((t - cst1) << 3);		\
+    }							\
+							\
+  asm volatile ("" ::: "memory");			\
+  S1 int res1 = 1;					\
+  S1 int res2 = 2;					\
+  for (int i = 0; i < N; i++)				\
+    {							\
+      S2 int t = a[i] * b[i];				\
+      res1 += (t * cst2) + 18;				\
+      res2 += (t - cst1) << shift;			\
+    }							\
+  res += res1 ^ res2;					\
+  return res;						\
+}
+
+FN(f1_vec_s, signed, signed)
+FN(f1_vec_u, unsigned, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec_s, signed, signed)
+FN(f1_novec_u, unsigned, signed)
+#pragma GCC pop_options
+
+#define BASE ((int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  signed char a[N], b[N];
+  int c[N];
+
+  #pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 5;
+      b[i] = BASE + OFFSET + i * 4;
+      c[i] = i;
+    }
+
+  if (f1_vec_s (0x12345, a, b, c, -5, 17, 3) != f1_novec_s (0x12345, a, b, c, -5, 17, 3))
+    __builtin_abort ();
+
+  if (f1_vec_u (0x12345, a, b, c, -5, 17, 3) != f1_novec_u (0x12345, a, b, c, -5, 17, 3))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 20 "vect" { target vect_sdot_qi } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c
new file mode 100644
index 00000000000..a160bc72082
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-2.c
@@ -0,0 +1,81 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+#define FN(name, S1, S2, S3, S4)						\
+S1 int __attribute__ ((noipa))							\
+name (S1 int res,								\
+   S2 char *restrict a,								\
+   S2 char *restrict b,								\
+   S3 char *restrict c,								\
+   S3 char *restrict d,								\
+   S4 short *restrict e,							\
+   S4 short *restrict f,							\
+   S1 int *restrict g,								\
+   S1 int cst1)									\
+{										\
+  for (int i = 0; i < N; ++i)							\
+    {										\
+      short diff = a[i] - b[i];							\
+      S2 short abs = diff < 0 ? -diff : diff;					\
+      res += ((abs + i) << 3) - (c[i] + 1) * cst1 + d[i] * 3 + e[i]  - g[i];	\
+    }										\
+										\
+  return res;									\
+}
+
+FN(f1_vec, signed, unsigned, signed, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec, signed, unsigned, signed, signed)
+#pragma GCC pop_options
+
+#define BASE2 ((unsigned int) -1 < 0 ? -126 : 4)
+#define BASE3 ((signed int) -1 < 0 ? -126 : 4)
+#define BASE4 ((signed int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  unsigned char a[N], b[N];
+  signed char c[N], d[N];
+  signed short e[N], f[N];
+  signed int g[N];
+
+#pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 2;
+      d[i] = BASE3 + OFFSET + i * 3;
+      e[i] = BASE4 + i * 6;
+      f[i] = BASE4 + OFFSET + i * 5;
+      g[i] = i;
+    }
+
+  if (f1_vec (0x12345, a, b, c, d, e, f, g, 17) != f1_novec (0x12345, a, b, c, d, e, f, g, 17))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c
new file mode 100644
index 00000000000..0e76536925e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-affine-slp-1.c
@@ -0,0 +1,74 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon }  */
+
+#include "tree-vect.h"
+
+#define N 100
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+#define FN(name, S1, S2)					\
+S1 int __attribute__ ((noipa))					\
+name (S1 int res,						\
+      S2 char *restrict a,					\
+      S2 char *restrict b,					\
+      S2 short *restrict c,					\
+      S2 int *restrict d,					\
+      S1 int cst1,						\
+      S1 int cst2)						\
+{								\
+  for (int i = 0; i < N / 2; ++i)				\
+    {								\
+      res += ~((a[2 * i + 0] * b[2 * i + 0] + 1) << 3)		\
+	     - (c[2 * i + 0] + cst1) * cst2 + d[2 * i + 0];	\
+      res += ~((a[2 * i + 1] * b[2 * i + 1] + 1) << 3)		\
+	     - (c[2 * i + 1] + cst1) * cst2 + d[2 * i + 1];	\
+    }								\
+								\
+  return res;							\
+}
+
+FN(f1_vec, signed, signed)
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+FN(f1_novec, signed, signed)
+#pragma GCC pop_options
+
+#define BASE2 ((signed int) -1 < 0 ? -126 : 4)
+#define BASE3 ((signed int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+  check_vect ();
+
+  signed char a[N], b[N];
+  signed short c[N];
+  signed int d[N];
+
+#pragma GCC novector
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE2 + i * 5;
+      b[i] = BASE2 + OFFSET + i * 4;
+      c[i] = BASE3 + i * 6;
+      d[i] = i;
+    }
+
+  if (f1_vec (0x12345, a, b, c, d, -5, 17) != f1_novec (0x12345, a, b, c, d, -5, 17))
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 02f6b942026..bb037af0b68 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1029,54 +1029,6 @@ vect_convert_output (vec_info *vinfo, stmt_vec_info stmt_info, tree type,
   return pattern_stmt;
 }
 
-/* Return true if STMT_VINFO describes a reduction for which reassociation
-   is allowed.  If STMT_INFO is part of a group, assume that it's part of
-   a reduction chain and optimistically assume that all statements
-   except the last allow reassociation.
-   Also require it to have code CODE and to be a reduction
-   in the outermost loop.  When returning true, store the operands in
-   *OP0_OUT and *OP1_OUT.  */
-
-static bool
-vect_reassociating_reduction_p (vec_info *vinfo,
-				stmt_vec_info stmt_info, tree_code code,
-				tree *op0_out, tree *op1_out)
-{
-  loop_vec_info loop_info = dyn_cast <loop_vec_info> (vinfo);
-  if (!loop_info)
-    return false;
-
-  /* As a candidate of lane-reducing pattern matching, the statement must
-     be inside affine closure of loop reduction.  */
-  if (!(stmt_info->reduc_pattern_status & rpatt_allow))
-    return false;
-
-  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
-  if (!assign || gimple_assign_rhs_code (assign) != code)
-    return false;
-
-  /* We don't allow changing the order of the computation in the inner-loop
-     when doing outer-loop vectorization.  */
-  class loop *loop = LOOP_VINFO_LOOP (loop_info);
-  if (loop && nested_in_vect_loop_p (loop, stmt_info))
-    return false;
-
-  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
-    {
-      if (needs_fold_left_reduction_p (TREE_TYPE (gimple_assign_lhs (assign)),
-				       code))
-	return false;
-    }
-  else if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == NULL)
-    return false;
-
-  *op0_out = gimple_assign_rhs1 (assign);
-  *op1_out = gimple_assign_rhs2 (assign);
-  if (commutative_tree_code (code) && STMT_VINFO_REDUC_IDX (stmt_info) == 0)
-    std::swap (*op0_out, *op1_out);
-  return true;
-}
-
 /* match.pd function to match
    (cond (cmp@3 a b) (convert@1 c) (convert@2 d))
    with conditions:
@@ -1189,96 +1141,60 @@ vect_recog_cond_expr_convert_pattern (vec_info *vinfo,
      S3  x_T = (TYPE1) x_t;
      S4  y_T = (TYPE1) y_t;
      S5  prod = x_T * y_T;
-     [S6  prod = (TYPE2) prod;  #optional]
-     S7  sum_1 = prod + sum_0;
+     [S6+ value = affine_fn (prod, ...);  #optional]
+     S7  sum_1 = value + sum_0;
 
-   where 'TYPE1' is exactly double the size of type 'type1a' and 'type1b',
-   the sign of 'TYPE1' must be one of 'type1a' or 'type1b' but the sign of
-   'type1a' and 'type1b' can differ.
+   There exists a natural widening conversion from both 'type1a' and 'type1b'
+   to 'TYPE1'.  The function 'affine_fn' represents a linear transform in
+   the mathematical sense, and may be composed of a series of statements.
 
    Input:
 
    * STMT_VINFO: The stmt from which the pattern search begins.  In the
-   example, when this function is called with S7, the pattern {S3,S4,S5,S6,S7}
-   will be detected.
+   example, when this function is called with S5, the pattern {S3,S4,S5} will
+   be detected if S5 is in the affine closure of the reduction for 'sum'.
 
    Output:
 
-   * TYPE_OUT: The type of the output  of this pattern.
+   * TYPE_OUT: The type of the output of this pattern.
 
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
-        WIDEN_DOT_PRODUCT <x_t, y_t, sum_0>
+	DOT_PROD_EXPR <x_t, y_t, 0>
 
    Note: The dot-prod idiom is a widening reduction pattern that is
-         vectorized without preserving all the intermediate results. It
-         produces only N/2 (widened) results (by summing up pairs of
-         intermediate results) rather than all N results.  Therefore, we
-         cannot allow this pattern when we want to get all the results and in
-         the correct order (as is the case when this computation is in an
-         inner-loop nested in an outer-loop that us being vectorized).  */
+	 vectorized without preserving all the intermediate results.  It
+	 produces fewer than N (widened) results (by summing up pairs of
+	 intermediate results) rather than all N results.  Therefore, we
+	 cannot allow this pattern when we want to get all the results and in
+	 the correct order (as is the case when this computation is in an
+	 inner-loop nested in an outer-loop that is being vectorized).  */
 
 static gimple *
 vect_recog_dot_prod_pattern (vec_info *vinfo,
 			     stmt_vec_info stmt_vinfo, tree *type_out)
 {
-  tree oprnd0, oprnd1;
-  gimple *last_stmt = stmt_vinfo->stmt;
-  tree type, half_type;
-  gimple *pattern_stmt;
-  tree var;
-
-  /* Look for the following pattern
-          DX = (TYPE1) X;
-          DY = (TYPE1) Y;
-          DPROD = DX * DY;
-          DDPROD = (TYPE2) DPROD;
-          sum_1 = DDPROD + sum_0;
-     In which
-     - DX is double the size of X
-     - DY is double the size of Y
-     - DX, DY, DPROD all have the same type but the sign
-       between X, Y and DPROD can differ.
-     - sum is the same size of DPROD or bigger
-     - sum has been recognized as a reduction variable.
-
-     This is equivalent to:
-       DPROD = X w* Y;          #widen mult
-       sum_1 = DPROD w+ sum_0;  #widen summation
-     or
-       DPROD = X w* Y;          #widen mult
-       sum_1 = DPROD + sum_0;   #summation
-   */
-
-  /* Starting from LAST_STMT, follow the defs of its uses in search
-     of the above pattern.  */
-
-  if (!vect_reassociating_reduction_p (vinfo, stmt_vinfo, PLUS_EXPR,
-				       &oprnd0, &oprnd1))
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_allow))
     return NULL;
 
-  type = TREE_TYPE (gimple_get_lhs (last_stmt));
-
+  gimple *last_stmt = stmt_vinfo->stmt;
+  tree value = gimple_get_lhs (last_stmt);
+  tree type = TREE_TYPE (value);
+  tree half_type;
   vect_unpromoted_value unprom_mult;
-  oprnd0 = vect_look_through_possible_promotion (vinfo, oprnd0, &unprom_mult);
 
-  /* So far so good.  Since last_stmt was detected as a (summation) reduction,
-     we know that oprnd1 is the reduction variable (defined by a loop-header
-     phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
-     Left to check that oprnd0 is defined by a (widen_)mult_expr  */
-  if (!oprnd0)
+  value = vect_look_through_possible_promotion (vinfo, value, &unprom_mult);
+  if (!value)
     return NULL;
 
-  stmt_vec_info mult_vinfo = vect_get_internal_def (vinfo, oprnd0);
+  stmt_vec_info mult_vinfo = vect_get_internal_def (vinfo, value);
   if (!mult_vinfo)
     return NULL;
 
-  /* FORNOW.  Can continue analyzing the def-use chain when this stmt in a phi
-     inside the loop (in case we are analyzing an outer-loop).  */
-  vect_unpromoted_value unprom0[2];
+  vect_unpromoted_value unprom[2];
   enum optab_subtype subtype = optab_vector;
   if (!vect_widened_op_tree (vinfo, mult_vinfo, MULT_EXPR, WIDEN_MULT_EXPR,
-			     false, 2, unprom0, &half_type, &subtype))
+			     false, 2, unprom, &half_type, &subtype))
     return NULL;
 
   /* If there are two widening operations, make sure they agree on the sign
@@ -1318,16 +1234,15 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
   /* Get the inputs in the appropriate types.  */
   tree mult_oprnd[2];
   vect_convert_inputs (vinfo, stmt_vinfo, 2, mult_oprnd, half_type,
-		       unprom0, half_vectype, subtype);
-
-  var = vect_recog_temp_ssa_var (type, NULL);
-  pattern_stmt = gimple_build_assign (var, DOT_PROD_EXPR,
-				      mult_oprnd[0], mult_oprnd[1], oprnd1);
+		       unprom, half_vectype, subtype);
 
+  tree var = vect_recog_temp_ssa_var (type, NULL);
+  gimple *pattern_stmt = gimple_build_assign (var, DOT_PROD_EXPR,
+					      mult_oprnd[0], mult_oprnd[1],
+					      build_zero_cst (type));
   return pattern_stmt;
 }
 
-
 /* Function vect_recog_sad_pattern
 
    Try to find the following Sum of Absolute Difference (SAD) pattern:
@@ -1343,18 +1258,20 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
      S4  y_T = (TYPE1) y_t;
      S5  diff = x_T - y_T;
      S6  abs_diff = ABS_EXPR <diff>;
-     [S7  abs_diff = (TYPE2) abs_diff;  #optional]
-     S8  sum_1 = abs_diff + sum_0;
+     [S7+ value = affine_fn (abs_diff, ...);  #optional]
+     S8  sum_1 = value + sum_0;
 
    where 'TYPE1' is at least double the size of type 'type', and 'TYPE2' is the
-   same size of 'TYPE1' or bigger. This is a special case of a reduction
-   computation.
+   the same size as 'TYPE1' or bigger.  The function 'affine_fn' represents a
+   linear transform in the mathematical sense, and may be composed of a series
+   of statements.  This is a special case of a reduction computation.
 
    Input:
 
    * STMT_VINFO: The stmt from which the pattern search begins.  In the
-   example, when this function is called with S8, the pattern
-   {S3,S4,S5,S6,S7,S8} will be detected.
+   example, when this function is called with S6, the pattern {S3,S4,S5,S6}
+   will be detected if S6 is known to be in affine closure of reduction for
+   'sum'.
 
    Output:
 
@@ -1362,49 +1279,24 @@ vect_recog_dot_prod_pattern (vec_info *vinfo,
 
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
-        SAD_EXPR <x_t, y_t, sum_0>
+	SAD_EXPR <x_t, y_t, 0>
   */
 
 static gimple *
 vect_recog_sad_pattern (vec_info *vinfo,
 			stmt_vec_info stmt_vinfo, tree *type_out)
 {
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_allow))
+    return NULL;
+
   gimple *last_stmt = stmt_vinfo->stmt;
   tree half_type;
 
-  /* Look for the following pattern
-          DX = (TYPE1) X;
-          DY = (TYPE1) Y;
-          DDIFF = DX - DY;
-          DAD = ABS_EXPR <DDIFF>;
-          DDPROD = (TYPE2) DPROD;
-          sum_1 = DAD + sum_0;
-     In which
-     - DX is at least double the size of X
-     - DY is at least double the size of Y
-     - DX, DY, DDIFF, DAD all have the same type
-     - sum is the same size of DAD or bigger
-     - sum has been recognized as a reduction variable.
-
-     This is equivalent to:
-       DDIFF = X w- Y;          #widen sub
-       DAD = ABS_EXPR <DDIFF>;
-       sum_1 = DAD w+ sum_0;    #widen summation
-     or
-       DDIFF = X w- Y;          #widen sub
-       DAD = ABS_EXPR <DDIFF>;
-       sum_1 = DAD + sum_0;     #summation
-   */
-
   /* Starting from LAST_STMT, follow the defs of its uses in search
      of the above pattern.  */
 
-  tree plus_oprnd0, plus_oprnd1;
-  if (!vect_reassociating_reduction_p (vinfo, stmt_vinfo, PLUS_EXPR,
-				       &plus_oprnd0, &plus_oprnd1))
-    return NULL;
-
-  tree sum_type = TREE_TYPE (gimple_get_lhs (last_stmt));
+  tree value = gimple_get_lhs (last_stmt);
+  tree type = TREE_TYPE (value);
 
   /* Any non-truncating sequence of conversions is OK here, since
      with a successful match, the result of the ABS(U) is known to fit
@@ -1412,23 +1304,15 @@ vect_recog_sad_pattern (vec_info *vinfo,
      negative of the minimum signed value due to the range of the widening
      MINUS_EXPR.)  */
   vect_unpromoted_value unprom_abs;
-  plus_oprnd0 = vect_look_through_possible_promotion (vinfo, plus_oprnd0,
-						      &unprom_abs);
-
-  /* So far so good.  Since last_stmt was detected as a (summation) reduction,
-     we know that plus_oprnd1 is the reduction variable (defined by a loop-header
-     phi), and plus_oprnd0 is an ssa-name defined by a stmt in the loop body.
-     Then check that plus_oprnd0 is defined by an abs_expr.  */
 
-  if (!plus_oprnd0)
+  value = vect_look_through_possible_promotion (vinfo, value, &unprom_abs);
+  if (!value)
     return NULL;
 
-  stmt_vec_info abs_stmt_vinfo = vect_get_internal_def (vinfo, plus_oprnd0);
+  stmt_vec_info abs_stmt_vinfo = vect_get_internal_def (vinfo, value);
   if (!abs_stmt_vinfo)
     return NULL;
 
-  /* FORNOW.  Can continue analyzing the def-use chain when this stmt in a phi
-     inside the loop (in case we are analyzing an outer-loop).  */
   gassign *abs_stmt = dyn_cast <gassign *> (abs_stmt_vinfo->stmt);
   vect_unpromoted_value unprom[2];
 
@@ -1467,22 +1351,22 @@ vect_recog_sad_pattern (vec_info *vinfo,
 					    unprom, NULL))
     return NULL;
 
-  vect_pattern_detected ("vect_recog_sad_pattern", last_stmt);
-
   tree half_vectype;
-  if (!vect_supportable_direct_optab_p (vinfo, sum_type, SAD_EXPR, half_type,
+  if (!vect_supportable_direct_optab_p (vinfo, type, SAD_EXPR, half_type,
 					type_out, &half_vectype))
     return NULL;
 
+  vect_pattern_detected ("vect_recog_sad_pattern", last_stmt);
+
   /* Get the inputs to the SAD_EXPR in the appropriate types.  */
   tree sad_oprnd[2];
   vect_convert_inputs (vinfo, stmt_vinfo, 2, sad_oprnd, half_type,
 		       unprom, half_vectype);
 
-  tree var = vect_recog_temp_ssa_var (sum_type, NULL);
+  tree var = vect_recog_temp_ssa_var (type, NULL);
   gimple *pattern_stmt = gimple_build_assign (var, SAD_EXPR, sad_oprnd[0],
-					      sad_oprnd[1], plus_oprnd1);
-
+					      sad_oprnd[1],
+					      build_zero_cst (type));
   return pattern_stmt;
 }
 
@@ -2492,30 +2376,35 @@ vect_recog_pow_pattern (vec_info *vinfo,
      TYPE x_T, sum = init;
    loop:
      sum_0 = phi <init, sum_1>
-     S1  x_t = *p;
+     S1  x_t = ...;
      S2  x_T = (TYPE) x_t;
-     S3  sum_1 = x_T + sum_0;
+     [S3+ value = affine_fn (x_T, ...);  #optional]
+     S4  sum_1 = value + sum_0;
 
    where type 'TYPE' is at least double the size of type 'type', i.e - we're
-   summing elements of type 'type' into an accumulator of type 'TYPE'. This is
-   a special case of a reduction computation.
+   summing elements of type 'type' into an accumulator of type 'TYPE'.  The
+   function 'affine_fn' represents a linear transform in the mathematical
+   sense, and may be composed of a series of statements.  This is a special
+   case of a reduction computation.
 
    Input:
 
    * STMT_VINFO: The stmt from which the pattern search begins. In the example,
-   when this function is called with S3, the pattern {S2,S3} will be detected.
+   when this function is called with S2, the pattern {S2} will be detected if
+   S2 is known to be in the affine closure of the reduction for 'sum'.
 
    Output:
 
    * TYPE_OUT: The type of the output of this pattern.
 
    * Return value: A new stmt that will be used to replace the sequence of
-   stmts that constitute the pattern. In this case it will be:
-        WIDEN_SUM <x_t, sum_0>
+   stmts that constitute the pattern.  In this case it will be
+   WIDEN_SUM_EXPR <x_t, 0> if the operation is supported by target, otherwise,
+   DOT_PROD_EXPR <x_t, 1, 0> if dot-product could be used.
 
    Note: The widening-sum idiom is a widening reduction pattern that is
 	 vectorized without preserving all the intermediate results. It
-         produces only N/2 (widened) results (by summing up pairs of
+	 produces fewer than N (widened) results (by summing up pairs of
 	 intermediate results) rather than all N results.  Therefore, we
 	 cannot allow this pattern when we want to get all the results and in
 	 the correct order (as is the case when this computation is in an
@@ -2525,49 +2414,42 @@ static gimple *
 vect_recog_widen_sum_pattern (vec_info *vinfo,
 			      stmt_vec_info stmt_vinfo, tree *type_out)
 {
+  if (!(stmt_vinfo->reduc_pattern_status & rpatt_allow))
+    return NULL;
+
   gimple *last_stmt = stmt_vinfo->stmt;
-  tree oprnd0, oprnd1;
-  tree type;
-  gimple *pattern_stmt;
+  tree value = gimple_get_lhs (last_stmt);
+  tree type = TREE_TYPE (value);
+  gimple *pattern_stmt = NULL;
+  vect_unpromoted_value unprom;
   tree var;
 
-  /* Look for the following pattern
-          DX = (TYPE) X;
-          sum_1 = DX + sum_0;
-     In which DX is at least double the size of X, and sum_1 has been
-     recognized as a reduction variable.
-   */
-
-  /* Starting from LAST_STMT, follow the defs of its uses in search
-     of the above pattern.  */
-
-  if (!vect_reassociating_reduction_p (vinfo, stmt_vinfo, PLUS_EXPR,
-				       &oprnd0, &oprnd1)
-      || TREE_CODE (oprnd0) != SSA_NAME
-      || !vinfo->lookup_def (oprnd0))
+  /* Check that value is defined by a widening cast.  */
+  if (!vect_look_through_possible_promotion (vinfo, value, &unprom)
+      || TYPE_PRECISION (unprom.type) * 2 > TYPE_PRECISION (type))
     return NULL;
 
-  type = TREE_TYPE (gimple_get_lhs (last_stmt));
-
-  /* So far so good.  Since last_stmt was detected as a (summation) reduction,
-     we know that oprnd1 is the reduction variable (defined by a loop-header
-     phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
-     Left to check that oprnd0 is defined by a cast from type 'type' to type
-     'TYPE'.  */
-
-  vect_unpromoted_value unprom0;
-  if (!vect_look_through_possible_promotion (vinfo, oprnd0, &unprom0)
-      || TYPE_PRECISION (unprom0.type) * 2 > TYPE_PRECISION (type))
+  /* TODO: Support widening-sum on boolean value.  */
+  if (TREE_CODE (unprom.type) != INTEGER_TYPE)
     return NULL;
 
-  vect_pattern_detected ("vect_recog_widen_sum_pattern", last_stmt);
-
-  if (!vect_supportable_direct_optab_p (vinfo, type, WIDEN_SUM_EXPR,
-					unprom0.type, type_out))
-    return NULL;
+  if (vect_supportable_direct_optab_p (vinfo, type, WIDEN_SUM_EXPR,
+				       unprom.type, type_out))
+    {
+      var = vect_recog_temp_ssa_var (type, NULL);
+      pattern_stmt = gimple_build_assign (var, WIDEN_SUM_EXPR, unprom.op,
+					  build_zero_cst (type));
+    }
+  else if (vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR,
+					    unprom.type, type_out))
+    {
+      var = vect_recog_temp_ssa_var (type, NULL);
+      pattern_stmt = gimple_build_assign (var, DOT_PROD_EXPR, unprom.op,
+					  build_one_cst (unprom.type),
+					  build_zero_cst (type));
+    }
 
-  var = vect_recog_temp_ssa_var (type, NULL);
-  pattern_stmt = gimple_build_assign (var, WIDEN_SUM_EXPR, unprom0.op, oprnd1);
+  vect_pattern_detected ("vect_recog_widen_sum_pattern", last_stmt);
 
   return pattern_stmt;
 }
@@ -7191,8 +7073,18 @@ struct vect_recog_func
 
 /* Note that ordering matters - the first pattern matching on a stmt is
    taken which means usually the more complex one needs to preceed the
-   less comples onex (widen_sum only after dot_prod or sad for example).  */
+   less complex ones (widen_sum only after dot_prod or sad for example).  */
 static vect_recog_func vect_vect_recog_func_ptrs[] = {
+
+  /* Lane-reducing patterns (dot_prod/sad/widen_sum) are not the sort of
+     local statement-based patterns, in that they require knowledge of
+     loop structure.  Naturally, it is anticipated that these patterns
+     would benefit loop vectorization much more than peephole-like
+     patterns.  So give lane-reducing patterns overriding priority.  */
+  { vect_recog_dot_prod_pattern, "dot_prod" },
+  { vect_recog_sad_pattern, "sad" },
+  { vect_recog_widen_sum_pattern, "widen_sum" },
+
   { vect_recog_bitfield_ref_pattern, "bitfield_ref" },
   { vect_recog_bit_insert_pattern, "bit_insert" },
   { vect_recog_abd_pattern, "abd" },
@@ -7204,9 +7096,6 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_mulhs_pattern, "mult_high" },
   { vect_recog_cast_forwprop_pattern, "cast_forwprop" },
   { vect_recog_widen_mult_pattern, "widen_mult" },
-  { vect_recog_dot_prod_pattern, "dot_prod" },
-  { vect_recog_sad_pattern, "sad" },
-  { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
   { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
   { vect_recog_ctz_ffs_pattern, "ctz_ffs" },
-- 
2.17.1

Reply via email to