Re: [PATCH 3/4] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

Richard Biener Mon, 15 Jul 2024 07:04:37 -0700

On Sat, Jul 13, 2024 at 5:48 PM Feng Xue OS <f...@os.amperecomputing.com> wrote:
>
> For a lane-reducing operation (dot-prod/widen-sum/sad) in a loop reduction, the
> current vectorizer can only handle the pattern if the reduction chain does not
> contain any other operation, whether that other operation is normal or
> lane-reducing.
>
> This patch removes some constraints in reduction analysis to allow multiple
> arbitrary lane-reducing operations with mixed input vectypes in a loop
> reduction chain. For example:
>
>    int sum = 1;
>    for (i)
>      {
>        sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
>        sum += w[i];               // widen-sum <vector(16) char>
>        sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
>      }
>
> With a 128-bit vector size, the vectorization factor is 16. The reduction
> statements would be transformed as:
>
>    vector<4> int sum_v0 = { 0, 0, 0, 1 };
>    vector<4> int sum_v1 = { 0, 0, 0, 0 };
>    vector<4> int sum_v2 = { 0, 0, 0, 0 };
>    vector<4> int sum_v3 = { 0, 0, 0, 0 };
>
>    for (i / 16)
>      {
>        sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
>        sum_v1 = sum_v1;  // copy
>        sum_v2 = sum_v2;  // copy
>        sum_v3 = sum_v3;  // copy
>
>        sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
>        sum_v1 = sum_v1;  // copy
>        sum_v2 = sum_v2;  // copy
>        sum_v3 = sum_v3;  // copy
>
>        sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
>        sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
>        sum_v2 = sum_v2;  // copy
>        sum_v3 = sum_v3;  // copy
>      }
>
>     sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3;   // = sum_v0 + sum_v1


OK.

Thanks,
Richard.

> Thanks,
> Feng
> ---
> gcc/
>         PR tree-optimization/114440
>         * tree-vectorizer.h (vectorizable_lane_reducing): New function
>         declaration.
>         * tree-vect-stmts.cc (vect_analyze_stmt): Call new function
>         vectorizable_lane_reducing to analyze lane-reducing operation.
>         * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
>         code related to emulated_mixed_dot_prod.
>         (vectorizable_lane_reducing): New function.
>         (vectorizable_reduction): Allow multiple lane-reducing operations in
>         loop reduction. Move some original lane-reducing related code to
>         vectorizable_lane_reducing.
>         (vect_transform_reduction): Adjust comments with updated example.
>
> gcc/testsuite/
>         PR tree-optimization/114440
>         * gcc.dg/vect/vect-reduc-chain-1.c
>         * gcc.dg/vect/vect-reduc-chain-2.c
>         * gcc.dg/vect/vect-reduc-chain-3.c
>         * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
>         * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
>         * gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
>         * gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
>         * gcc.dg/vect/vect-reduc-dot-slp-1.c
> ---
>  .../gcc.dg/vect/vect-reduc-chain-1.c          |  64 +++++
>  .../gcc.dg/vect/vect-reduc-chain-2.c          |  79 ++++++
>  .../gcc.dg/vect/vect-reduc-chain-3.c          |  68 +++++
>  .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  95 +++++++
>  .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  67 +++++
>  .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c  |  79 ++++++
>  .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c  |  63 +++++
>  .../gcc.dg/vect/vect-reduc-dot-slp-1.c        |  60 +++++
>  gcc/tree-vect-loop.cc                         | 240 +++++++++++++-----
>  gcc/tree-vect-stmts.cc                        |   2 +
>  gcc/tree-vectorizer.h                         |   2 +
>  11 files changed, 750 insertions(+), 69 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
> new file mode 100644
> index 00000000000..80b0089ea0f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
> @@ -0,0 +1,64 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *restrict a,
> +   SIGNEDNESS_2 char *restrict b,
> +   SIGNEDNESS_2 char *restrict c,
> +   SIGNEDNESS_2 char *restrict d,
> +   SIGNEDNESS_1 int *restrict e)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      res += a[i] * b[i];
> +      res += c[i] * d[i];
> +      res += e[i];
> +    }
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[N], b[N];
> +  SIGNEDNESS_2 char c[N], d[N];
> +  SIGNEDNESS_1 int e[N];
> +  int expected = 0x12345;
> +
> +  #pragma GCC novector
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +      c[i] = BASE + i * 2;
> +      d[i] = BASE + OFFSET + i * 3;
> +      e[i] = i;
> +      expected += a[i] * b[i];
> +      expected += c[i] * d[i];
> +      expected += e[i];
> +    }
> +
> +  if (f (0x12345, a, b, c, d, e) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = 
> DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
> new file mode 100644
> index 00000000000..5bc2686fc9d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
> @@ -0,0 +1,79 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 unsigned
> +#define SIGNEDNESS_3 signed
> +#define SIGNEDNESS_4 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +fn (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *restrict a,
> +   SIGNEDNESS_2 char *restrict b,
> +   SIGNEDNESS_3 char *restrict c,
> +   SIGNEDNESS_3 char *restrict d,
> +   SIGNEDNESS_4 short *restrict e,
> +   SIGNEDNESS_4 short *restrict f,
> +   SIGNEDNESS_1 int *restrict g)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      res += a[i] * b[i];
> +      res += i + 1;
> +      res += c[i] * d[i];
> +      res += e[i] * f[i];
> +      res += g[i];
> +    }
> +  return res;
> +}
> +
> +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
> +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[N], b[N];
> +  SIGNEDNESS_3 char c[N], d[N];
> +  SIGNEDNESS_4 short e[N], f[N];
> +  SIGNEDNESS_1 int g[N];
> +  int expected = 0x12345;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE2 + i * 5;
> +      b[i] = BASE2 + OFFSET + i * 4;
> +      c[i] = BASE3 + i * 2;
> +      d[i] = BASE3 + OFFSET + i * 3;
> +      e[i] = BASE4 + i * 6;
> +      f[i] = BASE4 + OFFSET + i * 5;
> +      g[i] = i;
> +      expected += a[i] * b[i];
> +      expected += i + 1;
> +      expected += c[i] * d[i];
> +      expected += e[i] * f[i];
> +      expected += g[i];
> +    }
> +
> +  if (fn (0x12345, a, b, c, d, e, f, g) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
> "vect" { target { vect_sdot_qi } } } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
> "vect" { target { vect_udot_qi } } } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
> "vect" { target { vect_sdot_hi } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
> new file mode 100644
> index 00000000000..6a733fbac53
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
> @@ -0,0 +1,68 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include "tree-vect.h"
> +
> +#define N 50
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 unsigned
> +#define SIGNEDNESS_3 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *restrict a,
> +   SIGNEDNESS_2 char *restrict b,
> +   SIGNEDNESS_3 short *restrict c,
> +   SIGNEDNESS_3 short *restrict d,
> +   SIGNEDNESS_1 int *restrict e)
> +{
> +  for (int i = 0; i < N; ++i)
> +    {
> +      short diff = a[i] - b[i];
> +      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
> +      res += abs;
> +      res += c[i] * d[i];
> +      res += e[i];
> +    }
> +  return res;
> +}
> +
> +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[N], b[N];
> +  SIGNEDNESS_3 short c[N], d[N];
> +  SIGNEDNESS_1 int e[N];
> +  int expected = 0x12345;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE2 + i * 5;
> +      b[i] = BASE2 - i * 4;
> +      c[i] = BASE3 + i * 2;
> +      d[i] = BASE3 + OFFSET + i * 3;
> +      e[i] = i;
> +      short diff = a[i] - b[i];
> +      SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
> +      expected += abs;
> +      expected += c[i] * d[i];
> +      expected += e[i];
> +    }
> +
> +  if (f (0x12345, a, b, c, d, e) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" 
> "vect" { target vect_udot_qi } } } */
> +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" 
> "vect" { target vect_sdot_hi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
> new file mode 100644
> index 00000000000..72a370ab3c0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
> @@ -0,0 +1,95 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *a,
> +   SIGNEDNESS_2 char *b,
> +   int step, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      res += a[0] * b[0];
> +      res += a[1] * b[1];
> +      res += a[2] * b[2];
> +      res += a[3] * b[3];
> +      res += a[4] * b[4];
> +      res += a[5] * b[5];
> +      res += a[6] * b[6];
> +      res += a[7] * b[7];
> +      res += a[8] * b[8];
> +      res += a[9] * b[9];
> +      res += a[10] * b[10];
> +      res += a[11] * b[11];
> +      res += a[12] * b[12];
> +      res += a[13] * b[13];
> +      res += a[14] * b[14];
> +      res += a[15] * b[15];
> +
> +      a += step;
> +      b += step;
> +    }
> +
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[100], b[100];
> +  int expected = 0x12345;
> +  int step = 16;
> +  int n = 2;
> +  int t = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +    }
> +
> +#pragma GCC novector
> +  for (int i = 0; i < n; i++)
> +    {
> +      expected += a[t + 0] * b[t + 0];
> +      expected += a[t + 1] * b[t + 1];
> +      expected += a[t + 2] * b[t + 2];
> +      expected += a[t + 3] * b[t + 3];
> +      expected += a[t + 4] * b[t + 4];
> +      expected += a[t + 5] * b[t + 5];
> +      expected += a[t + 6] * b[t + 6];
> +      expected += a[t + 7] * b[t + 7];
> +      expected += a[t + 8] * b[t + 8];
> +      expected += a[t + 9] * b[t + 9];
> +      expected += a[t + 10] * b[t + 10];
> +      expected += a[t + 11] * b[t + 11];
> +      expected += a[t + 12] * b[t + 12];
> +      expected += a[t + 13] * b[t + 13];
> +      expected += a[t + 14] * b[t + 14];
> +      expected += a[t + 15] * b[t + 15];
> +      t += step;
> +    }
> +
> +  if (f (0x12345, a, b, step, n) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = 
> DOT_PROD_EXPR" 16 "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
> new file mode 100644
> index 00000000000..aab86ee2f1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
> @@ -0,0 +1,67 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 char *a,
> +   SIGNEDNESS_2 char *b,
> +   int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      res += a[5 * i + 0] * b[5 * i + 0];
> +      res += a[5 * i + 1] * b[5 * i + 1];
> +      res += a[5 * i + 2] * b[5 * i + 2];
> +      res += a[5 * i + 3] * b[5 * i + 3];
> +      res += a[5 * i + 4] * b[5 * i + 4];
> +    }
> +
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 char a[100], b[100];
> +  int expected = 0x12345;
> +  int n = 18;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +    }
> +
> +#pragma GCC novector
> +  for (int i = 0; i < n; i++)
> +    {
> +      expected += a[5 * i + 0] * b[5 * i + 0];
> +      expected += a[5 * i + 1] * b[5 * i + 1];
> +      expected += a[5 * i + 2] * b[5 * i + 2];
> +      expected += a[5 * i + 3] * b[5 * i + 3];
> +      expected += a[5 * i + 4] * b[5 * i + 4];
> +    }
> +
> +  if (f (0x12345, a, b, n) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = 
> DOT_PROD_EXPR" 5 "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
> new file mode 100644
> index 00000000000..9f1d2136ab6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
> @@ -0,0 +1,79 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 short *a,
> +   SIGNEDNESS_2 short *b,
> +   int step, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      res += a[0] * b[0];
> +      res += a[1] * b[1];
> +      res += a[2] * b[2];
> +      res += a[3] * b[3];
> +      res += a[4] * b[4];
> +      res += a[5] * b[5];
> +      res += a[6] * b[6];
> +      res += a[7] * b[7];
> +
> +      a += step;
> +      b += step;
> +    }
> +
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 short a[100], b[100];
> +  int expected = 0x12345;
> +  int step = 8;
> +  int n = 2;
> +  int t = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +    }
> +
> +#pragma GCC novector
> +  for (int i = 0; i < n; i++)
> +    {
> +      expected += a[t + 0] * b[t + 0];
> +      expected += a[t + 1] * b[t + 1];
> +      expected += a[t + 2] * b[t + 2];
> +      expected += a[t + 3] * b[t + 3];
> +      expected += a[t + 4] * b[t + 4];
> +      expected += a[t + 5] * b[t + 5];
> +      expected += a[t + 6] * b[t + 6];
> +      expected += a[t + 7] * b[t + 7];
> +      t += step;
> +    }
> +
> +  if (f (0x12345, a, b, step, n) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = 
> DOT_PROD_EXPR" 8 "vect"  { target vect_sdot_hi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
> new file mode 100644
> index 00000000000..f4dcebdfa10
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
> @@ -0,0 +1,63 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res,
> +   SIGNEDNESS_2 short *a,
> +   SIGNEDNESS_2 short *b,
> +   int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      res += a[3 * i + 0] * b[3 * i + 0];
> +      res += a[3 * i + 1] * b[3 * i + 1];
> +      res += a[3 * i + 2] * b[3 * i + 2];
> +    }
> +
> +  return res;
> +}
> +
> +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
> +#define OFFSET 20
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  SIGNEDNESS_2 short a[100], b[100];
> +  int expected = 0x12345;
> +  int n = 18;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
> +    {
> +      a[i] = BASE + i * 5;
> +      b[i] = BASE + OFFSET + i * 4;
> +    }
> +
> +#pragma GCC novector
> +  for (int i = 0; i < n; i++)
> +    {
> +      expected += a[3 * i + 0] * b[3 * i + 0];
> +      expected += a[3 * i + 1] * b[3 * i + 1];
> +      expected += a[3 * i + 2] * b[3 * i + 2];
> +    }
> +
> +  if (f (0x12345, a, b, n) != expected)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = 
> DOT_PROD_EXPR" 3 "vect"  { target vect_sdot_hi } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c 
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
> new file mode 100644
> index 00000000000..84c82b023d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
> @@ -0,0 +1,60 @@
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-do compile } */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0 
> -fdump-tree-optimized" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { 
> aarch64*-*-* || arm*-*-* } } } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +
> +#include "tree-vect.h"
> +
> +#ifndef SIGNEDNESS_1
> +#define SIGNEDNESS_1 signed
> +#define SIGNEDNESS_2 signed
> +#endif
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res0,
> +   SIGNEDNESS_1 int res1,
> +   SIGNEDNESS_1 int res2,
> +   SIGNEDNESS_1 int res3,
> +   SIGNEDNESS_1 int res4,
> +   SIGNEDNESS_1 int res5,
> +   SIGNEDNESS_1 int res6,
> +   SIGNEDNESS_1 int res7,
> +   SIGNEDNESS_1 int res8,
> +   SIGNEDNESS_1 int res9,
> +   SIGNEDNESS_1 int resA,
> +   SIGNEDNESS_1 int resB,
> +   SIGNEDNESS_1 int resC,
> +   SIGNEDNESS_1 int resD,
> +   SIGNEDNESS_1 int resE,
> +   SIGNEDNESS_1 int resF,
> +   SIGNEDNESS_2 char *a,
> +   SIGNEDNESS_2 char *b)
> +{
> +  for (int i = 0; i < 64; i += 16)
> +    {
> +      res0 += a[i + 0x00] * b[i + 0x00];
> +      res1 += a[i + 0x01] * b[i + 0x01];
> +      res2 += a[i + 0x02] * b[i + 0x02];
> +      res3 += a[i + 0x03] * b[i + 0x03];
> +      res4 += a[i + 0x04] * b[i + 0x04];
> +      res5 += a[i + 0x05] * b[i + 0x05];
> +      res6 += a[i + 0x06] * b[i + 0x06];
> +      res7 += a[i + 0x07] * b[i + 0x07];
> +      res8 += a[i + 0x08] * b[i + 0x08];
> +      res9 += a[i + 0x09] * b[i + 0x09];
> +      resA += a[i + 0x0A] * b[i + 0x0A];
> +      resB += a[i + 0x0B] * b[i + 0x0B];
> +      resC += a[i + 0x0C] * b[i + 0x0C];
> +      resD += a[i + 0x0D] * b[i + 0x0D];
> +      resE += a[i + 0x0E] * b[i + 0x0E];
> +      resF += a[i + 0x0F] * b[i + 0x0F];
> +    }
> +
> +  return res0 ^ res1 ^ res2 ^ res3 ^ res4 ^ res5 ^ res6 ^ res7 ^
> +         res8 ^ res9 ^ resA ^ resB ^ resC ^ resD ^ resE ^ resF;
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump-not "DOT_PROD_EXPR" "optimized" } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 5ac83e76975..e72d692ffa3 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5328,8 +5328,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
>      gcc_unreachable ();
>
> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> -
>    if (reduction_type == EXTRACT_LAST_REDUCTION)
>      /* No extra instructions are needed in the prologue.  The loop body
>         operations are costed in vectorizable_condition.  */
> @@ -5364,12 +5362,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
>            initial result of the data reduction, initial value of the index
>            reduction.  */
>         prologue_stmts = 4;
> -      else if (emulated_mixed_dot_prod)
> -       /* We need the initial reduction value and two invariants:
> -          one that contains the minimum signed value and one that
> -          contains half of its negative.  */
> -       prologue_stmts = 3;
>        else
> +       /* We need the initial reduction value.  */
>         prologue_stmts = 1;
>        prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
>                                          scalar_to_vec, stmt_info, 0,
> @@ -7478,6 +7472,143 @@ vect_reduction_update_partial_vector_usage 
> (loop_vec_info loop_vinfo,
>      }
>  }
>
> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
> +   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
> +   Now there are three such kinds of operations: dot-prod/widen-sum/sad
> +   (sum-of-absolute-differences).
> +
> +   For a lane-reducing operation, the loop reduction path that it lies in,
> +   may contain normal operation, or other lane-reducing operation of 
> different
> +   input type size, an example as:
> +
> +     int sum = 0;
> +     for (i)
> +       {
> +         ...
> +         sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
> +         sum += w[i];                // widen-sum <vector(16) char>
> +         sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
> +         sum += n[i];                // normal <vector(4) int>
> +         ...
> +       }
> +
> +   Vectorization factor is essentially determined by operation whose input
> +   vectype has the most lanes ("vector(16) char" in the example), while we
> +   need to choose input vectype with the least lanes ("vector(4) int" in the
> +   example) for the reduction PHI statement.  */
> +
> +bool
> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info 
> stmt_info,
> +                           slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> +{
> +  gimple *stmt = stmt_info->stmt;
> +
> +  if (!lane_reducing_stmt_p (stmt))
> +    return false;
> +
> +  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
> +
> +  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
> +    return false;
> +
> +  /* Do not try to vectorize bit-precision reductions.  */
> +  if (!type_has_mode_precision_p (type))
> +    return false;
> +
> +  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
> +    {
> +      stmt_vec_info def_stmt_info;
> +      slp_tree slp_op;
> +      tree op;
> +      tree vectype;
> +      enum vect_def_type dt;
> +
> +      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
> +                              &slp_op, &dt, &vectype, &def_stmt_info))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "use not simple.\n");
> +         return false;
> +       }
> +
> +      if (!vectype)
> +       {
> +         vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
> +                                                slp_op);
> +         if (!vectype)
> +           return false;
> +       }
> +
> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "incompatible vector types for invariants\n");
> +         return false;
> +       }
> +
> +      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> +       continue;
> +
> +      /* There should be at most one cycle def in the stmt.  */
> +      if (VECTORIZABLE_CYCLE_DEF (dt))
> +       return false;
> +    }
> +
> +  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt 
> (stmt_info));
> +
> +  /* TODO: Support lane-reducing operation that does not directly participate
> +     in loop reduction.  */
> +  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
> +    return false;
> +
> +  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
> +     recognized.  */
> +  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
> +  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
> +
> +  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> +
> +  gcc_assert (vectype_in);
> +
> +  /* Compute number of effective vector statements for costing.  */
> +  unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
> +                                                      vectype_in);
> +  gcc_assert (ncopies_for_cost >= 1);
> +
> +  if (vect_is_emulated_mixed_dot_prod (stmt_info))
> +    {
> +      /* We need extra two invariants: one that contains the minimum signed
> +        value and one that contains half of its negative.  */
> +      int prologue_stmts = 2;
> +      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
> +                                       scalar_to_vec, stmt_info, 0,
> +                                       vect_prologue);
> +      if (dump_enabled_p ())
> +       dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
> +                    "extra prologue_cost = %d .\n", cost);
> +
> +      /* Three dot-products and a subtraction.  */
> +      ncopies_for_cost *= 4;
> +    }
> +
> +  record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
> +                   0, vect_body);
> +
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +    {
> +      enum tree_code code = gimple_assign_rhs_code (stmt);
> +      vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
> +                                                 slp_node, code, type,
> +                                                 vectype_in);
> +    }
> +
> +  /* Transform via vect_transform_reduction.  */
> +  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
> +  return true;
> +}
> +
>  /* Function vectorizable_reduction.
>
>     Check if STMT_INFO performs a reduction operation that can be vectorized.
> @@ -7811,18 +7942,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    if (!type_has_mode_precision_p (op.type))
>      return false;
>
> -  /* For lane-reducing ops we're reducing the number of reduction PHIs
> -     which means the only use of that may be in the lane-reducing operation. 
>  */
> -  if (lane_reducing
> -      && reduc_chain_length != 1
> -      && !only_slp_reduc_chain)
> -    {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "lane-reducing reduction with extra stmts.\n");
> -      return false;
> -    }
> -
>    /* Lane-reducing ops also never can be used in a SLP reduction group
>       since we'll mix lanes belonging to different reductions.  But it's
>       OK to use them in a reduction chain or when the reduction group
> @@ -8362,14 +8481,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        && loop_vinfo->suggested_unroll_factor == 1)
>      single_defuse_cycle = true;
>
> -  if (single_defuse_cycle || lane_reducing)
> +  if (single_defuse_cycle && !lane_reducing)
>      {
>        gcc_assert (op.code != COND_EXPR);
>
> -      /* 4. Supportable by target?  */
> -      bool ok = true;
> -
> -      /* 4.1. check support for the operation in the loop
> +      /* 4. check support for the operation in the loop
>
>          This isn't necessary for the lane reduction codes, since they
>          can only be produced by pattern matching, and it's up to the
> @@ -8378,14 +8494,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>          mixed-sign dot-products can be implemented using signed
>          dot-products.  */
>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> -      if (!lane_reducing
> -         && !directly_supported_p (op.code, vectype_in, optab_vector))
> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
>          {
>            if (dump_enabled_p ())
>              dump_printf (MSG_NOTE, "op not supported by target.\n");
>           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
>               || !vect_can_vectorize_without_simd_p (op.code))
> -           ok = false;
> +           single_defuse_cycle = false;
>           else
>             if (dump_enabled_p ())
>               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> @@ -8398,16 +8513,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>             dump_printf (MSG_NOTE, "using word mode not possible.\n");
>           return false;
>         }
> -
> -      /* lane-reducing operations have to go through vect_transform_reduction.
> -         For the other cases try without the single cycle optimization.  */
> -      if (!ok)
> -       {
> -         if (lane_reducing)
> -           return false;
> -         else
> -           single_defuse_cycle = false;
> -       }
>      }
>    if (dump_enabled_p () && single_defuse_cycle)
>      dump_printf_loc (MSG_NOTE, vect_location,
> @@ -8415,22 +8520,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>                      "multiple vectors to one in the loop body\n");
>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
>
> -  /* If the reduction stmt is one of the patterns that have lane
> -     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
> -  if ((ncopies > 1 && ! single_defuse_cycle)
> -      && lane_reducing)
> -    {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "multi def-use cycle not possible for lane-reducing "
> -                        "reduction operation\n");
> -      return false;
> -    }
> +  /* For lane-reducing operation, the below processing related to single
> +     defuse-cycle will be done in its own vectorizable function.  One more
> +     thing to note is that the operation must not be involved in fold-left
> +     reduction.  */
> +  single_defuse_cycle &= !lane_reducing;
>
>    if (slp_node
> -      && !(!single_defuse_cycle
> -          && !lane_reducing
> -          && reduction_type != FOLD_LEFT_REDUCTION))
> +      && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
>      for (i = 0; i < (int) op.num_ops; i++)
>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
>         {
> @@ -8443,28 +8540,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
>                              reduction_type, ncopies, cost_vec);
>    /* Cost the reduction op inside the loop if transformed via
> -     vect_transform_reduction.  Otherwise this is costed by the
> -     separate vectorizable_* routines.  */
> -  if (single_defuse_cycle || lane_reducing)
> -    {
> -      int factor = 1;
> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> -       /* Three dot-products and a subtraction.  */
> -       factor = 4;
> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> -                       stmt_info, 0, vect_body);
> -    }
> +     vect_transform_reduction for non-lane-reducing operation.  Otherwise
> +     this is costed by the separate vectorizable_* routines.  */
> +  if (single_defuse_cycle)
> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
>
>    if (dump_enabled_p ()
>        && reduction_type == FOLD_LEFT_REDUCTION)
>      dump_printf_loc (MSG_NOTE, vect_location,
>                      "using an in-order (fold-left) reduction.\n");
>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> -     reductions go through their own vectorizable_* routines.  */
> -  if (!single_defuse_cycle
> -      && !lane_reducing
> -      && reduction_type != FOLD_LEFT_REDUCTION)
> +
> +  /* All but single defuse-cycle optimized and fold-left reductions go
> +     through their own vectorizable_* routines.  */
> +  if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
>      {
>        stmt_vec_info tem
>         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> @@ -8742,13 +8831,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>          And vector reduction PHIs are always generated to the full extent, no
>          matter lane-reducing op exists or not.  If some copies or PHIs are
>          actually superfluous, they would be cleaned up by passes after
> -        vectorization.  An example for single-lane slp is given as below.
> +        vectorization.  An example for single-lane slp, lane-reducing ops
> +        with mixed input vectypes in a reduction chain, is given as below.
>          Similarly, this handling is applicable for multiple-lane slp as well.
>
>            int sum = 1;
>            for (i)
>              {
>                sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> +              sum += w[i];               // widen-sum <vector(16) char>
> +              sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
>              }
>
>          The vector size is 128-bit,vectorization factor is 16.  Reduction
> @@ -8765,9 +8857,19 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>                sum_v1 = sum_v1;  // copy
>                sum_v2 = sum_v2;  // copy
>                sum_v3 = sum_v3;  // copy
> +
> +              sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
> +              sum_v1 = sum_v1;  // copy
> +              sum_v2 = sum_v2;  // copy
> +              sum_v3 = sum_v3;  // copy
> +
> +              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> +              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> +              sum_v2 = sum_v2;  // copy
> +              sum_v3 = sum_v3;  // copy
>              }
>
> -          sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3;   // = sum_v0
> +          sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3;   // = sum_v0 + sum_v1
>         */
>        unsigned effec_ncopies = vec_oprnds[0].length ();
>        unsigned total_ncopies = vec_oprnds[reduc_index].length ();
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index fdcda0d2aba..135580d25d7 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -13286,6 +13286,8 @@ vect_analyze_stmt (vec_info *vinfo,
>                                       NULL, NULL, node, cost_vec)
>           || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
>           || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> +         || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> +                                        stmt_info, node, cost_vec)
>           || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
>                                      node, node_instance, cost_vec)
>           || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 09923b9b440..62121f63f18 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2486,6 +2486,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
>                                          slp_tree, slp_instance, int,
>                                          bool, stmt_vector_for_cost *);
> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> +                                       slp_tree, stmt_vector_for_cost *);
>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
>                                     slp_tree, slp_instance,
>                                     stmt_vector_for_cost *);
> --
> 2.17.1

Re: [PATCH 3/4] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

Reply via email to