On Sat, Jul 13, 2024 at 5:48 PM Feng Xue OS <f...@os.amperecomputing.com> wrote: > > For lane-reducing operations (dot-prod/widen-sum/sad) in loop reduction, current > vectorizer could only handle the pattern if the reduction chain does not > contain other operation, no matter the other is normal or lane-reducing. > > This patch removes some constraints in reduction analysis to allow multiple > arbitrary lane-reducing operations with mixed input vectypes in a loop > reduction chain. For example: > > int sum = 1; > for (i) > { > sum += d0[i] * d1[i]; // dot-prod <vector(16) char> > sum += w[i]; // widen-sum <vector(16) char> > sum += abs(s0[i] - s1[i]); // sad <vector(8) short> > } > > The vector size is 128-bit, vectorization factor is 16. Reduction statements > would be transformed as: > > vector<4> int sum_v0 = { 0, 0, 0, 1 }; > vector<4> int sum_v1 = { 0, 0, 0, 0 }; > vector<4> int sum_v2 = { 0, 0, 0, 0 }; > vector<4> int sum_v3 = { 0, 0, 0, 0 }; > > for (i / 16) > { > sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0); > sum_v1 = sum_v1; // copy > sum_v2 = sum_v2; // copy > sum_v3 = sum_v3; // copy > > sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); > sum_v1 = sum_v1; // copy > sum_v2 = sum_v2; // copy > sum_v3 = sum_v3; // copy > > sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); > sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); > sum_v2 = sum_v2; // copy > sum_v3 = sum_v3; // copy > } > > sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1
OK. Thanks, Richard. > Thanks, > Feng > --- > gcc/ > PR tree-optimization/114440 > * tree-vectorizer.h (vectorizable_lane_reducing): New function > declaration. > * tree-vect-stmts.cc (vect_analyze_stmt): Call new function > vectorizable_lane_reducing to analyze lane-reducing operation. > * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost > computation > code related to emulated_mixed_dot_prod. > (vectorizable_lane_reducing): New function. > (vectorizable_reduction): Allow multiple lane-reducing operations in > loop reduction. Move some original lane-reducing related code to > vectorizable_lane_reducing. > (vect_transform_reduction): Adjust comments with updated example. > > gcc/testsuite/ > PR tree-optimization/114440 > * gcc.dg/vect/vect-reduc-chain-1.c > * gcc.dg/vect/vect-reduc-chain-2.c > * gcc.dg/vect/vect-reduc-chain-3.c > * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c > * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c > * gcc.dg/vect/vect-reduc-chain-dot-slp-3.c > * gcc.dg/vect/vect-reduc-chain-dot-slp-4.c > * gcc.dg/vect/vect-reduc-dot-slp-1.c > --- > .../gcc.dg/vect/vect-reduc-chain-1.c | 64 +++++ > .../gcc.dg/vect/vect-reduc-chain-2.c | 79 ++++++ > .../gcc.dg/vect/vect-reduc-chain-3.c | 68 +++++ > .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c | 95 +++++++ > .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c | 67 +++++ > .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c | 79 ++++++ > .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c | 63 +++++ > .../gcc.dg/vect/vect-reduc-dot-slp-1.c | 60 +++++ > gcc/tree-vect-loop.cc | 240 +++++++++++++----- > gcc/tree-vect-stmts.cc | 2 + > gcc/tree-vectorizer.h | 2 + > 11 files changed, 750 insertions(+), 69 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c > create mode 100644 
gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c > new file mode 100644 > index 00000000000..80b0089ea0f > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c > @@ -0,0 +1,64 @@ > +/* Disabling epilogues until we find a better way to deal with scans. */ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#define N 50 > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 char *restrict a, > + SIGNEDNESS_2 char *restrict b, > + SIGNEDNESS_2 char *restrict c, > + SIGNEDNESS_2 char *restrict d, > + SIGNEDNESS_1 int *restrict e) > +{ > + for (int i = 0; i < N; ++i) > + { > + res += a[i] * b[i]; > + res += c[i] * d[i]; > + res += e[i]; > + } > + return res; > +} > + > +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-126 : 4) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 char a[N], b[N]; > + SIGNEDNESS_2 char c[N], d[N]; > + SIGNEDNESS_1 int e[N]; > + int expected = 0x12345; > + > + #pragma GCC novector > + for (int i = 0; i < N; ++i) > + { > + a[i] = BASE + i * 5; > + b[i] = BASE + OFFSET + i * 4; > + c[i] = BASE + i * 2; > + d[i] = BASE + OFFSET + i * 3; > + e[i] = i; > + expected += a[i] * b[i]; > + expected += c[i] * d[i]; > + expected += e[i]; > + } > + > + if (f (0x12345, a, b, c, d, e) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = > DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c > new file mode 100644 > index 00000000000..5bc2686fc9d > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c > @@ -0,0 +1,79 @@ > +/* Disabling epilogues until we find a better way to deal with scans. 
*/ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#define N 50 > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 unsigned > +#define SIGNEDNESS_3 signed > +#define SIGNEDNESS_4 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +fn (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 char *restrict a, > + SIGNEDNESS_2 char *restrict b, > + SIGNEDNESS_3 char *restrict c, > + SIGNEDNESS_3 char *restrict d, > + SIGNEDNESS_4 short *restrict e, > + SIGNEDNESS_4 short *restrict f, > + SIGNEDNESS_1 int *restrict g) > +{ > + for (int i = 0; i < N; ++i) > + { > + res += a[i] * b[i]; > + res += i + 1; > + res += c[i] * d[i]; > + res += e[i] * f[i]; > + res += g[i]; > + } > + return res; > +} > + > +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) > +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4) > +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? 
-1026 : 373) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 char a[N], b[N]; > + SIGNEDNESS_3 char c[N], d[N]; > + SIGNEDNESS_4 short e[N], f[N]; > + SIGNEDNESS_1 int g[N]; > + int expected = 0x12345; > + > +#pragma GCC novector > + for (int i = 0; i < N; ++i) > + { > + a[i] = BASE2 + i * 5; > + b[i] = BASE2 + OFFSET + i * 4; > + c[i] = BASE3 + i * 2; > + d[i] = BASE3 + OFFSET + i * 3; > + e[i] = BASE4 + i * 6; > + f[i] = BASE4 + OFFSET + i * 5; > + g[i] = i; > + expected += a[i] * b[i]; > + expected += i + 1; > + expected += c[i] * d[i]; > + expected += e[i] * f[i]; > + expected += g[i]; > + } > + > + if (fn (0x12345, a, b, c, d, e, f, g) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" > "vect" { target { vect_sdot_qi } } } } */ > +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" > "vect" { target { vect_udot_qi } } } } */ > +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" > "vect" { target { vect_sdot_hi } } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c > new file mode 100644 > index 00000000000..6a733fbac53 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c > @@ -0,0 +1,68 @@ > +/* Disabling epilogues until we find a better way to deal with scans. 
*/ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > + > +#include "tree-vect.h" > + > +#define N 50 > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 unsigned > +#define SIGNEDNESS_3 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 char *restrict a, > + SIGNEDNESS_2 char *restrict b, > + SIGNEDNESS_3 short *restrict c, > + SIGNEDNESS_3 short *restrict d, > + SIGNEDNESS_1 int *restrict e) > +{ > + for (int i = 0; i < N; ++i) > + { > + short diff = a[i] - b[i]; > + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; > + res += abs; > + res += c[i] * d[i]; > + res += e[i]; > + } > + return res; > +} > + > +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) > +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 char a[N], b[N]; > + SIGNEDNESS_3 short c[N], d[N]; > + SIGNEDNESS_1 int e[N]; > + int expected = 0x12345; > + > +#pragma GCC novector > + for (int i = 0; i < N; ++i) > + { > + a[i] = BASE2 + i * 5; > + b[i] = BASE2 - i * 4; > + c[i] = BASE3 + i * 2; > + d[i] = BASE3 + OFFSET + i * 3; > + e[i] = i; > + short diff = a[i] - b[i]; > + SIGNEDNESS_2 short abs = diff < 0 ? 
-diff : diff; > + expected += abs; > + expected += c[i] * d[i]; > + expected += e[i]; > + } > + > + if (f (0x12345, a, b, c, d, e) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" > "vect" { target vect_udot_qi } } } */ > +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" > "vect" { target vect_sdot_hi } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c > new file mode 100644 > index 00000000000..72a370ab3c0 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c > @@ -0,0 +1,95 @@ > +/* Disabling epilogues until we find a better way to deal with scans. */ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 char *a, > + SIGNEDNESS_2 char *b, > + int step, int n) > +{ > + for (int i = 0; i < n; i++) > + { > + res += a[0] * b[0]; > + res += a[1] * b[1]; > + res += a[2] * b[2]; > + res += a[3] * b[3]; > + res += a[4] * b[4]; > + res += a[5] * b[5]; > + res += a[6] * b[6]; > + res += a[7] * b[7]; > + res += a[8] * b[8]; > + res += a[9] * b[9]; > + res += a[10] * b[10]; > + res += a[11] * b[11]; > + res += a[12] * b[12]; > + res += a[13] * b[13]; > + res += a[14] * b[14]; > + res += a[15] * b[15]; > + > + a += step; > + b += step; > + } > + > + return res; > +} > + > +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-126 : 4) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 char a[100], b[100]; > + int expected = 0x12345; > + int step = 16; > + int n = 2; > + int t = 0; > + > +#pragma GCC novector > + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) > + { > + a[i] = BASE + i * 5; > + b[i] = BASE + OFFSET + i * 4; > + } > + > +#pragma GCC novector > + for (int i = 0; i < n; i++) > + { > + expected += a[t + 0] * b[t + 0]; > + expected += a[t + 1] * b[t + 1]; > + expected += a[t + 2] * b[t + 2]; > + expected += a[t + 3] * b[t + 3]; > + expected += a[t + 4] * b[t + 4]; > + expected += a[t + 5] * b[t + 5]; > + expected += a[t + 6] * b[t + 6]; > + expected += a[t + 7] * b[t + 7]; > + expected += a[t + 8] * b[t + 8]; > + expected += a[t + 9] * b[t + 9]; > + expected += a[t + 10] * b[t + 10]; > + expected += a[t + 11] * b[t + 11]; > + expected += a[t + 12] * b[t + 12]; > + expected += a[t + 13] * b[t + 13]; > + expected += a[t + 14] * b[t + 14]; > + expected += a[t + 15] * b[t + 15]; > + t += step; > + } > + > + if (f (0x12345, a, b, step, n) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = > DOT_PROD_EXPR" 16 "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c > new file mode 100644 > index 00000000000..aab86ee2f1c > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c > @@ -0,0 +1,67 @@ > +/* Disabling epilogues until we find a better way to deal with scans. 
*/ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 char *a, > + SIGNEDNESS_2 char *b, > + int n) > +{ > + for (int i = 0; i < n; i++) > + { > + res += a[5 * i + 0] * b[5 * i + 0]; > + res += a[5 * i + 1] * b[5 * i + 1]; > + res += a[5 * i + 2] * b[5 * i + 2]; > + res += a[5 * i + 3] * b[5 * i + 3]; > + res += a[5 * i + 4] * b[5 * i + 4]; > + } > + > + return res; > +} > + > +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 char a[100], b[100]; > + int expected = 0x12345; > + int n = 18; > + > +#pragma GCC novector > + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) > + { > + a[i] = BASE + i * 5; > + b[i] = BASE + OFFSET + i * 4; > + } > + > +#pragma GCC novector > + for (int i = 0; i < n; i++) > + { > + expected += a[5 * i + 0] * b[5 * i + 0]; > + expected += a[5 * i + 1] * b[5 * i + 1]; > + expected += a[5 * i + 2] * b[5 * i + 2]; > + expected += a[5 * i + 3] * b[5 * i + 3]; > + expected += a[5 * i + 4] * b[5 * i + 4]; > + } > + > + if (f (0x12345, a, b, n) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = > DOT_PROD_EXPR" 5 "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c > new file mode 100644 > 
index 00000000000..9f1d2136ab6 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c > @@ -0,0 +1,79 @@ > +/* Disabling epilogues until we find a better way to deal with scans. */ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 short *a, > + SIGNEDNESS_2 short *b, > + int step, int n) > +{ > + for (int i = 0; i < n; i++) > + { > + res += a[0] * b[0]; > + res += a[1] * b[1]; > + res += a[2] * b[2]; > + res += a[3] * b[3]; > + res += a[4] * b[4]; > + res += a[5] * b[5]; > + res += a[6] * b[6]; > + res += a[7] * b[7]; > + > + a += step; > + b += step; > + } > + > + return res; > +} > + > +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? 
-1026 : 373) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 short a[100], b[100]; > + int expected = 0x12345; > + int step = 8; > + int n = 2; > + int t = 0; > + > +#pragma GCC novector > + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) > + { > + a[i] = BASE + i * 5; > + b[i] = BASE + OFFSET + i * 4; > + } > + > +#pragma GCC novector > + for (int i = 0; i < n; i++) > + { > + expected += a[t + 0] * b[t + 0]; > + expected += a[t + 1] * b[t + 1]; > + expected += a[t + 2] * b[t + 2]; > + expected += a[t + 3] * b[t + 3]; > + expected += a[t + 4] * b[t + 4]; > + expected += a[t + 5] * b[t + 5]; > + expected += a[t + 6] * b[t + 6]; > + expected += a[t + 7] * b[t + 7]; > + t += step; > + } > + > + if (f (0x12345, a, b, step, n) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = > DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c > new file mode 100644 > index 00000000000..f4dcebdfa10 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c > @@ -0,0 +1,63 @@ > +/* Disabling epilogues until we find a better way to deal with scans. 
*/ > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, > + SIGNEDNESS_2 short *a, > + SIGNEDNESS_2 short *b, > + int n) > +{ > + for (int i = 0; i < n; i++) > + { > + res += a[3 * i + 0] * b[3 * i + 0]; > + res += a[3 * i + 1] * b[3 * i + 1]; > + res += a[3 * i + 2] * b[3 * i + 2]; > + } > + > + return res; > +} > + > +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) > +#define OFFSET 20 > + > +int > +main (void) > +{ > + check_vect (); > + > + SIGNEDNESS_2 short a[100], b[100]; > + int expected = 0x12345; > + int n = 18; > + > +#pragma GCC novector > + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) > + { > + a[i] = BASE + i * 5; > + b[i] = BASE + OFFSET + i * 4; > + } > + > +#pragma GCC novector > + for (int i = 0; i < n; i++) > + { > + expected += a[3 * i + 0] * b[3 * i + 0]; > + expected += a[3 * i + 1] * b[3 * i + 1]; > + expected += a[3 * i + 2] * b[3 * i + 2]; > + } > + > + if (f (0x12345, a, b, n) != expected) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = > DOT_PROD_EXPR" 3 "vect" { target vect_sdot_hi } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c > new file mode 100644 > index 00000000000..84c82b023d4 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c > @@ -0,0 +1,60 @@ > +/* Disabling epilogues 
until we find a better way to deal with scans. */ > +/* { dg-do compile } */ > +/* { dg-additional-options "--param vect-epilogues-nomask=0 > -fdump-tree-optimized" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { > aarch64*-*-* || arm*-*-* } } } */ > +/* { dg-add-options arm_v8_2a_dotprod_neon } */ > + > +#include "tree-vect.h" > + > +#ifndef SIGNEDNESS_1 > +#define SIGNEDNESS_1 signed > +#define SIGNEDNESS_2 signed > +#endif > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res0, > + SIGNEDNESS_1 int res1, > + SIGNEDNESS_1 int res2, > + SIGNEDNESS_1 int res3, > + SIGNEDNESS_1 int res4, > + SIGNEDNESS_1 int res5, > + SIGNEDNESS_1 int res6, > + SIGNEDNESS_1 int res7, > + SIGNEDNESS_1 int res8, > + SIGNEDNESS_1 int res9, > + SIGNEDNESS_1 int resA, > + SIGNEDNESS_1 int resB, > + SIGNEDNESS_1 int resC, > + SIGNEDNESS_1 int resD, > + SIGNEDNESS_1 int resE, > + SIGNEDNESS_1 int resF, > + SIGNEDNESS_2 char *a, > + SIGNEDNESS_2 char *b) > +{ > + for (int i = 0; i < 64; i += 16) > + { > + res0 += a[i + 0x00] * b[i + 0x00]; > + res1 += a[i + 0x01] * b[i + 0x01]; > + res2 += a[i + 0x02] * b[i + 0x02]; > + res3 += a[i + 0x03] * b[i + 0x03]; > + res4 += a[i + 0x04] * b[i + 0x04]; > + res5 += a[i + 0x05] * b[i + 0x05]; > + res6 += a[i + 0x06] * b[i + 0x06]; > + res7 += a[i + 0x07] * b[i + 0x07]; > + res8 += a[i + 0x08] * b[i + 0x08]; > + res9 += a[i + 0x09] * b[i + 0x09]; > + resA += a[i + 0x0A] * b[i + 0x0A]; > + resB += a[i + 0x0B] * b[i + 0x0B]; > + resC += a[i + 0x0C] * b[i + 0x0C]; > + resD += a[i + 0x0D] * b[i + 0x0D]; > + resE += a[i + 0x0E] * b[i + 0x0E]; > + resF += a[i + 0x0F] * b[i + 0x0F]; > + } > + > + return res0 ^ res1 ^ res2 ^ res3 ^ res4 ^ res5 ^ res6 ^ res7 ^ > + res8 ^ res9 ^ resA ^ resB ^ resC ^ resD ^ resE ^ resF; > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" > "vect" } } */ > +/* { dg-final { scan-tree-dump-not 
"DOT_PROD_EXPR" "optimized" } } */ > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 5ac83e76975..e72d692ffa3 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -5328,8 +5328,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, > if (!gimple_extract_op (orig_stmt_info->stmt, &op)) > gcc_unreachable (); > > - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); > - > if (reduction_type == EXTRACT_LAST_REDUCTION) > /* No extra instructions are needed in the prologue. The loop body > operations are costed in vectorizable_condition. */ > @@ -5364,12 +5362,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, > initial result of the data reduction, initial value of the index > reduction. */ > prologue_stmts = 4; > - else if (emulated_mixed_dot_prod) > - /* We need the initial reduction value and two invariants: > - one that contains the minimum signed value and one that > - contains half of its negative. */ > - prologue_stmts = 3; > else > + /* We need the initial reduction value. */ > prologue_stmts = 1; > prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, > scalar_to_vec, stmt_info, 0, > @@ -7478,6 +7472,143 @@ vect_reduction_update_partial_vector_usage > (loop_vec_info loop_vinfo, > } > } > > +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in > + the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC. > + Now there are three such kinds of operations: dot-prod/widen-sum/sad > + (sum-of-absolute-differences). > + > + For a lane-reducing operation, the loop reduction path that it lies in, > + may contain normal operation, or other lane-reducing operation of > different > + input type size, an example as: > + > + int sum = 0; > + for (i) > + { > + ... 
> + sum += d0[i] * d1[i]; // dot-prod <vector(16) char> > + sum += w[i]; // widen-sum <vector(16) char> > + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> > + sum += n[i]; // normal <vector(4) int> > + ... > + } > + > + Vectorization factor is essentially determined by operation whose input > + vectype has the most lanes ("vector(16) char" in the example), while we > + need to choose input vectype with the least lanes ("vector(4) int" in the > + example) for the reduction PHI statement. */ > + > +bool > +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info > stmt_info, > + slp_tree slp_node, stmt_vector_for_cost *cost_vec) > +{ > + gimple *stmt = stmt_info->stmt; > + > + if (!lane_reducing_stmt_p (stmt)) > + return false; > + > + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); > + > + if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type)) > + return false; > + > + /* Do not try to vectorize bit-precision reductions. */ > + if (!type_has_mode_precision_p (type)) > + return false; > + > + for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++) > + { > + stmt_vec_info def_stmt_info; > + slp_tree slp_op; > + tree op; > + tree vectype; > + enum vect_def_type dt; > + > + if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op, > + &slp_op, &dt, &vectype, &def_stmt_info)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "use not simple.\n"); > + return false; > + } > + > + if (!vectype) > + { > + vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op), > + slp_op); > + if (!vectype) > + return false; > + } > + > + if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "incompatible vector types for invariants\n"); > + return false; > + } > + > + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) > + continue; > + > + /* There should be at most one cycle def in the stmt. 
*/ > + if (VECTORIZABLE_CYCLE_DEF (dt)) > + return false; > + } > + > + stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt > (stmt_info)); > + > + /* TODO: Support lane-reducing operation that does not directly participate > + in loop reduction. */ > + if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0) > + return false; > + > + /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not > + recoginized. */ > + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def); > + gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION); > + > + tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); > + > + gcc_assert (vectype_in); > + > + /* Compute number of effective vector statements for costing. */ > + unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node, > + vectype_in); > + gcc_assert (ncopies_for_cost >= 1); > + > + if (vect_is_emulated_mixed_dot_prod (stmt_info)) > + { > + /* We need extra two invariants: one that contains the minimum signed > + value and one that contains half of its negative. */ > + int prologue_stmts = 2; > + unsigned cost = record_stmt_cost (cost_vec, prologue_stmts, > + scalar_to_vec, stmt_info, 0, > + vect_prologue); > + if (dump_enabled_p ()) > + dump_printf (MSG_NOTE, "vectorizable_lane_reducing: " > + "extra prologue_cost = %d .\n", cost); > + > + /* Three dot-products and a subtraction. */ > + ncopies_for_cost *= 4; > + } > + > + record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info, > + 0, vect_body); > + > + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > + { > + enum tree_code code = gimple_assign_rhs_code (stmt); > + vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info, > + slp_node, code, type, > + vectype_in); > + } > + > + /* Transform via vect_transform_reduction. */ > + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; > + return true; > +} > + > /* Function vectorizable_reduction. 
> > Check if STMT_INFO performs a reduction operation that can be vectorized. > @@ -7811,18 +7942,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > if (!type_has_mode_precision_p (op.type)) > return false; > > - /* For lane-reducing ops we're reducing the number of reduction PHIs > - which means the only use of that may be in the lane-reducing operation. > */ > - if (lane_reducing > - && reduc_chain_length != 1 > - && !only_slp_reduc_chain) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "lane-reducing reduction with extra stmts.\n"); > - return false; > - } > - > /* Lane-reducing ops also never can be used in a SLP reduction group > since we'll mix lanes belonging to different reductions. But it's > OK to use them in a reduction chain or when the reduction group > @@ -8362,14 +8481,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > && loop_vinfo->suggested_unroll_factor == 1) > single_defuse_cycle = true; > > - if (single_defuse_cycle || lane_reducing) > + if (single_defuse_cycle && !lane_reducing) > { > gcc_assert (op.code != COND_EXPR); > > - /* 4. Supportable by target? */ > - bool ok = true; > - > - /* 4.1. check support for the operation in the loop > + /* 4. check support for the operation in the loop > > This isn't necessary for the lane reduction codes, since they > can only be produced by pattern matching, and it's up to the > @@ -8378,14 +8494,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > mixed-sign dot-products can be implemented using signed > dot-products. 
*/ > machine_mode vec_mode = TYPE_MODE (vectype_in); > - if (!lane_reducing > - && !directly_supported_p (op.code, vectype_in, optab_vector)) > + if (!directly_supported_p (op.code, vectype_in, optab_vector)) > { > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "op not supported by target.\n"); > if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) > || !vect_can_vectorize_without_simd_p (op.code)) > - ok = false; > + single_defuse_cycle = false; > else > if (dump_enabled_p ()) > dump_printf (MSG_NOTE, "proceeding using word mode.\n"); > @@ -8398,16 +8513,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > dump_printf (MSG_NOTE, "using word mode not possible.\n"); > return false; > } > - > - /* lane-reducing operations have to go through > vect_transform_reduction. > - For the other cases try without the single cycle optimization. */ > - if (!ok) > - { > - if (lane_reducing) > - return false; > - else > - single_defuse_cycle = false; > - } > } > if (dump_enabled_p () && single_defuse_cycle) > dump_printf_loc (MSG_NOTE, vect_location, > @@ -8415,22 +8520,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > "multiple vectors to one in the loop body\n"); > STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; > > - /* If the reduction stmt is one of the patterns that have lane > - reduction embedded we cannot handle the case of ! single_defuse_cycle. > */ > - if ((ncopies > 1 && ! single_defuse_cycle) > - && lane_reducing) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "multi def-use cycle not possible for lane-reducing " > - "reduction operation\n"); > - return false; > - } > + /* For lane-reducing operation, the below processing related to single > + defuse-cycle will be done in its own vectorizable function. One more > + thing to note is that the operation must not be involved in fold-left > + reduction. 
*/ > + single_defuse_cycle &= !lane_reducing; > > if (slp_node > - && !(!single_defuse_cycle > - && !lane_reducing > - && reduction_type != FOLD_LEFT_REDUCTION)) > + && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)) > for (i = 0; i < (int) op.num_ops; i++) > if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) > { > @@ -8443,28 +8540,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, > reduction_type, ncopies, cost_vec); > /* Cost the reduction op inside the loop if transformed via > - vect_transform_reduction. Otherwise this is costed by the > - separate vectorizable_* routines. */ > - if (single_defuse_cycle || lane_reducing) > - { > - int factor = 1; > - if (vect_is_emulated_mixed_dot_prod (stmt_info)) > - /* Three dot-products and a subtraction. */ > - factor = 4; > - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, > - stmt_info, 0, vect_body); > - } > + vect_transform_reduction for non-lane-reducing operation. Otherwise > + this is costed by the separate vectorizable_* routines. */ > + if (single_defuse_cycle) > + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, > vect_body); > > if (dump_enabled_p () > && reduction_type == FOLD_LEFT_REDUCTION) > dump_printf_loc (MSG_NOTE, vect_location, > "using an in-order (fold-left) reduction.\n"); > STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; > - /* All but single defuse-cycle optimized, lane-reducing and fold-left > - reductions go through their own vectorizable_* routines. */ > - if (!single_defuse_cycle > - && !lane_reducing > - && reduction_type != FOLD_LEFT_REDUCTION) > + > + /* All but single defuse-cycle optimized and fold-left reductions go > + through their own vectorizable_* routines. 
*/ > + if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) > { > stmt_vec_info tem > = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); > @@ -8742,13 +8831,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo, > And vector reduction PHIs are always generated to the full extent, no > matter lane-reducing op exists or not. If some copies or PHIs are > actually superfluous, they would be cleaned up by passes after > - vectorization. An example for single-lane slp is given as below. > + vectorization. An example for single-lane slp, lane-reducing ops > + with mixed input vectypes in a reduction chain, is given as below. > Similarly, this handling is applicable for multiple-lane slp as well. > > int sum = 1; > for (i) > { > sum += d0[i] * d1[i]; // dot-prod <vector(16) char> > + sum += w[i]; // widen-sum <vector(16) char> > + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> > } > > The vector size is 128-bit,vectorization factor is 16. Reduction > @@ -8765,9 +8857,19 @@ vect_transform_reduction (loop_vec_info loop_vinfo, > sum_v1 = sum_v1; // copy > sum_v2 = sum_v2; // copy > sum_v3 = sum_v3; // copy > + > + sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); > + sum_v1 = sum_v1; // copy > + sum_v2 = sum_v2; // copy > + sum_v3 = sum_v3; // copy > + > + sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); > + sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); > + sum_v2 = sum_v2; // copy > + sum_v3 = sum_v3; // copy > } > > - sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 > + sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 > */ > unsigned effec_ncopies = vec_oprnds[0].length (); > unsigned total_ncopies = vec_oprnds[reduc_index].length (); > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index fdcda0d2aba..135580d25d7 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -13286,6 +13286,8 @@ vect_analyze_stmt (vec_info *vinfo, > NULL, NULL, node, cost_vec) > || 
vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) > || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) > + || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo), > + stmt_info, node, cost_vec) > || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info, > node, node_instance, cost_vec) > || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info, > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 09923b9b440..62121f63f18 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2486,6 +2486,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop > *, vec_info_shared *, > extern bool vectorizable_live_operation (vec_info *, stmt_vec_info, > slp_tree, slp_instance, int, > bool, stmt_vector_for_cost *); > +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info, > + slp_tree, stmt_vector_for_cost *); > extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info, > slp_tree, slp_instance, > stmt_vector_for_cost *); > -- > 2.17.1