https://gcc.gnu.org/g:178cc419512f7e358f88dfe2336625aa99cd7438
commit r15-2096-g178cc419512f7e358f88dfe2336625aa99cd7438 Author: Feng Xue <f...@os.amperecomputing.com> Date: Wed May 29 17:22:36 2024 +0800 vect: Support multiple lane-reducing operations for loop reduction [PR114440] For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, current vectorizer could only handle the pattern if the reduction chain does not contain other operation, no matter the other is normal or lane-reducing. This patches removes some constraints in reduction analysis to allow multiple arbitrary lane-reducing operations with mixed input vectypes in a loop reduction chain. For example: int sum = 1; for (i) { sum += d0[i] * d1[i]; // dot-prod <vector(16) char> sum += w[i]; // widen-sum <vector(16) char> sum += abs(s0[i] - s1[i]); // sad <vector(8) short> } The vector size is 128-bit vectorization factor is 16. Reduction statements would be transformed as: vector<4> int sum_v0 = { 0, 0, 0, 1 }; vector<4> int sum_v1 = { 0, 0, 0, 0 }; vector<4> int sum_v2 = { 0, 0, 0, 0 }; vector<4> int sum_v3 = { 0, 0, 0, 0 }; for (i / 16) { sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0); sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy } sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 2024-03-22 Feng Xue <f...@os.amperecomputing.com> gcc/ PR tree-optimization/114440 * tree-vectorizer.h (vectorizable_lane_reducing): New function declaration. * tree-vect-stmts.cc (vect_analyze_stmt): Call new function vectorizable_lane_reducing to analyze lane-reducing operation. * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation code related to emulated_mixed_dot_prod. (vectorizable_lane_reducing): New function. (vectorizable_reduction): Allow multiple lane-reducing operations in loop reduction. Move some original lane-reducing related code to vectorizable_lane_reducing. (vect_transform_reduction): Adjust comments with updated example. gcc/testsuite/ PR tree-optimization/114440 * gcc.dg/vect/vect-reduc-chain-1.c * gcc.dg/vect/vect-reduc-chain-2.c * gcc.dg/vect/vect-reduc-chain-3.c * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c * gcc.dg/vect/vect-reduc-chain-dot-slp-3.c * gcc.dg/vect/vect-reduc-chain-dot-slp-4.c * gcc.dg/vect/vect-reduc-dot-slp-1.c Diff: --- gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c | 64 ++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c | 79 +++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c | 68 ++++++ .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c | 95 ++++++++ .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c | 67 ++++++ .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c | 79 +++++++ .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c | 63 ++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c | 60 +++++ gcc/tree-vect-loop.cc | 241 +++++++++++++++------ gcc/tree-vect-stmts.cc | 2 + gcc/tree-vectorizer.h | 2 + 11 files changed, 750 insertions(+), 70 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c new file mode 100644 index 000000000000..80b0089ea0fa --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c @@ -0,0 +1,64 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_2 char *restrict c, + SIGNEDNESS_2 char *restrict d, + SIGNEDNESS_1 int *restrict e) +{ + for (int i = 0; i < N; ++i) + { + res += a[i] * b[i]; + res += c[i] * d[i]; + res += e[i]; + } + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_2 char c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + + #pragma GCC novector + for (int i = 0; i < N; ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + c[i] = BASE + i * 2; + d[i] = BASE + OFFSET + i * 3; + e[i] = i; + expected += a[i] * b[i]; + expected += c[i] * d[i]; + expected += e[i]; + } + + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c new file mode 100644 index 000000000000..5bc2686fc9d6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c @@ -0,0 +1,79 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +fn (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 char *restrict c, + SIGNEDNESS_3 char *restrict d, + SIGNEDNESS_4 short *restrict e, + SIGNEDNESS_4 short *restrict f, + SIGNEDNESS_1 int *restrict g) +{ + for (int i = 0; i < N; ++i) + { + res += a[i] * b[i]; + res += i + 1; + res += c[i] * d[i]; + res += e[i] * f[i]; + res += g[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4) +#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 char c[N], d[N]; + SIGNEDNESS_4 short e[N], f[N]; + SIGNEDNESS_1 int g[N]; + int expected = 0x12345; + +#pragma GCC novector + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 + OFFSET + i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = BASE4 + i * 6; + f[i] = BASE4 + OFFSET + i * 5; + g[i] = i; + expected += a[i] * b[i]; + expected += i + 1; + expected += c[i] * d[i]; + expected += e[i] * f[i]; + expected += g[i]; + } + + if (fn (0x12345, a, b, c, d, e, f, g) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c new file mode 100644 index 000000000000..6a733fbac534 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c @@ -0,0 +1,68 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +#define N 50 + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 unsigned +#define SIGNEDNESS_3 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *restrict a, + SIGNEDNESS_2 char *restrict b, + SIGNEDNESS_3 short *restrict c, + SIGNEDNESS_3 short *restrict d, + SIGNEDNESS_1 int *restrict e) +{ + for (int i = 0; i < N; ++i) + { + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + res += abs; + res += c[i] * d[i]; + res += e[i]; + } + return res; +} + +#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[N], b[N]; + SIGNEDNESS_3 short c[N], d[N]; + SIGNEDNESS_1 int e[N]; + int expected = 0x12345; + +#pragma GCC novector + for (int i = 0; i < N; ++i) + { + a[i] = BASE2 + i * 5; + b[i] = BASE2 - i * 4; + c[i] = BASE3 + i * 2; + d[i] = BASE3 + OFFSET + i * 3; + e[i] = i; + short diff = a[i] - b[i]; + SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; + expected += abs; + expected += c[i] * d[i]; + expected += e[i]; + } + + if (f (0x12345, a, b, c, d, e) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */ +/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c new file mode 100644 index 000000000000..72a370ab3c01 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c @@ -0,0 +1,95 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + res += a[8] * b[8]; + res += a[9] * b[9]; + res += a[10] * b[10]; + res += a[11] * b[11]; + res += a[12] * b[12]; + res += a[13] * b[13]; + res += a[14] * b[14]; + res += a[15] * b[15]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int step = 16; + int n = 2; + int t = 0; + +#pragma GCC novector + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + } + +#pragma GCC novector + for (int i = 0; i < n; i++) + { + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + expected += a[t + 8] * b[t + 8]; + expected += a[t + 9] * b[t + 9]; + expected += a[t + 10] * b[t + 10]; + expected += a[t + 11] * b[t + 11]; + expected += a[t + 12] * b[t + 12]; + expected += a[t + 13] * b[t + 13]; + expected += a[t + 14] * b[t + 14]; + expected += a[t + 15] * b[t + 15]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c new file mode 100644 index 000000000000..aab86ee2f1cd --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c @@ -0,0 +1,67 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b, + int n) +{ + for (int i = 0; i < n; i++) + { + res += a[5 * i + 0] * b[5 * i + 0]; + res += a[5 * i + 1] * b[5 * i + 1]; + res += a[5 * i + 2] * b[5 * i + 2]; + res += a[5 * i + 3] * b[5 * i + 3]; + res += a[5 * i + 4] * b[5 * i + 4]; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 char a[100], b[100]; + int expected = 0x12345; + int n = 18; + +#pragma GCC novector + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + } + +#pragma GCC novector + for (int i = 0; i < n; i++) + { + expected += a[5 * i + 0] * b[5 * i + 0]; + expected += a[5 * i + 1] * b[5 * i + 1]; + expected += a[5 * i + 2] * b[5 * i + 2]; + expected += a[5 * i + 3] * b[5 * i + 3]; + expected += a[5 * i + 4] * b[5 * i + 4]; + } + + if (f (0x12345, a, b, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 5 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c new file mode 100644 index 000000000000..9f1d2136ab6e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c @@ -0,0 +1,79 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int step, int n) +{ + for (int i = 0; i < n; i++) + { + res += a[0] * b[0]; + res += a[1] * b[1]; + res += a[2] * b[2]; + res += a[3] * b[3]; + res += a[4] * b[4]; + res += a[5] * b[5]; + res += a[6] * b[6]; + res += a[7] * b[7]; + + a += step; + b += step; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int step = 8; + int n = 2; + int t = 0; + +#pragma GCC novector + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + } + +#pragma GCC novector + for (int i = 0; i < n; i++) + { + expected += a[t + 0] * b[t + 0]; + expected += a[t + 1] * b[t + 1]; + expected += a[t + 2] * b[t + 2]; + expected += a[t + 3] * b[t + 3]; + expected += a[t + 4] * b[t + 4]; + expected += a[t + 5] * b[t + 5]; + expected += a[t + 6] * b[t + 6]; + expected += a[t + 7] * b[t + 7]; + t += step; + } + + if (f (0x12345, a, b, step, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c new file mode 100644 index 000000000000..f4dcebdfa100 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c @@ -0,0 +1,63 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, + SIGNEDNESS_2 short *a, + SIGNEDNESS_2 short *b, + int n) +{ + for (int i = 0; i < n; i++) + { + res += a[3 * i + 0] * b[3 * i + 0]; + res += a[3 * i + 1] * b[3 * i + 1]; + res += a[3 * i + 2] * b[3 * i + 2]; + } + + return res; +} + +#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373) +#define OFFSET 20 + +int +main (void) +{ + check_vect (); + + SIGNEDNESS_2 short a[100], b[100]; + int expected = 0x12345; + int n = 18; + +#pragma GCC novector + for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i) + { + a[i] = BASE + i * 5; + b[i] = BASE + OFFSET + i * 4; + } + +#pragma GCC novector + for (int i = 0; i < n; i++) + { + expected += a[3 * i + 0] * b[3 * i + 0]; + expected += a[3 * i + 1] * b[3 * i + 1]; + expected += a[3 * i + 2] * b[3 * i + 2]; + } + + if (f (0x12345, a, b, n) != expected) + __builtin_abort (); +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 3 "vect" { target vect_sdot_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c new file mode 100644 index 000000000000..84c82b023d40 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c @@ -0,0 +1,60 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-do compile } */ +/* { dg-additional-options "--param vect-epilogues-nomask=0 -fdump-tree-optimized" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ + +#include "tree-vect.h" + +#ifndef SIGNEDNESS_1 +#define SIGNEDNESS_1 signed +#define SIGNEDNESS_2 signed +#endif + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res0, + SIGNEDNESS_1 int res1, + SIGNEDNESS_1 int res2, + SIGNEDNESS_1 int res3, + SIGNEDNESS_1 int res4, + SIGNEDNESS_1 int res5, + SIGNEDNESS_1 int res6, + SIGNEDNESS_1 int res7, + SIGNEDNESS_1 int res8, + SIGNEDNESS_1 int res9, + SIGNEDNESS_1 int resA, + SIGNEDNESS_1 int resB, + SIGNEDNESS_1 int resC, + SIGNEDNESS_1 int resD, + SIGNEDNESS_1 int resE, + SIGNEDNESS_1 int resF, + SIGNEDNESS_2 char *a, + SIGNEDNESS_2 char *b) +{ + for (int i = 0; i < 64; i += 16) + { + res0 += a[i + 0x00] * b[i + 0x00]; + res1 += a[i + 0x01] * b[i + 0x01]; + res2 += a[i + 0x02] * b[i + 0x02]; + res3 += a[i + 0x03] * b[i + 0x03]; + res4 += a[i + 0x04] * b[i + 0x04]; + res5 += a[i + 0x05] * b[i + 0x05]; + res6 += a[i + 0x06] * b[i + 0x06]; + res7 += a[i + 0x07] * b[i + 0x07]; + res8 += a[i + 0x08] * b[i + 0x08]; + res9 += a[i + 0x09] * b[i + 0x09]; + resA += a[i + 0x0A] * b[i + 0x0A]; + resB += a[i + 0x0B] * b[i + 0x0B]; + resC += a[i + 0x0C] * b[i + 0x0C]; + resD += a[i + 0x0D] * b[i + 0x0D]; + resE += a[i + 0x0E] * b[i + 0x0E]; + resF += a[i + 0x0F] * b[i + 0x0F]; + } + + return res0 ^ res1 ^ res2 ^ res3 ^ res4 ^ res5 ^ res6 ^ res7 ^ + res8 ^ res9 ^ resA ^ resB ^ resC ^ resD ^ resE ^ resF; +} + +/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump-not "DOT_PROD_EXPR" "optimized" } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 9c5c30535713..1c3dbf4bc71b 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5328,8 +5328,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); - if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. */ @@ -5364,12 +5362,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, initial result of the data reduction, initial value of the index reduction. */ prologue_stmts = 4; - else if (emulated_mixed_dot_prod) - /* We need the initial reduction value and two invariants: - one that contains the minimum signed value and one that - contains half of its negative. */ - prologue_stmts = 3; else + /* We need the initial reduction value. */ prologue_stmts = 1; prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, scalar_to_vec, stmt_info, 0, @@ -7482,6 +7476,142 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo, } } +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in + the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC, + and the analysis is for slp if SLP_NODE is not NULL. + + For a lane-reducing operation, the loop reduction path that it lies in, + may contain normal operation, or other lane-reducing operation of different + input type size, an example as: + + int sum = 0; + for (i) + { + ... + sum += d0[i] * d1[i]; // dot-prod <vector(16) char> + sum += w[i]; // widen-sum <vector(16) char> + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> + sum += n[i]; // normal <vector(4) int> + ... + } + + Vectorization factor is essentially determined by operation whose input + vectype has the most lanes ("vector(16) char" in the example), while we + need to choose input vectype with the least lanes ("vector(4) int" in the + example) to determine effective number of vector reduction PHIs. */ + +bool +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, + slp_tree slp_node, stmt_vector_for_cost *cost_vec) +{ + gimple *stmt = stmt_info->stmt; + + if (!lane_reducing_stmt_p (stmt)) + return false; + + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); + + if (!INTEGRAL_TYPE_P (type)) + return false; + + /* Do not try to vectorize bit-precision reductions. */ + if (!type_has_mode_precision_p (type)) + return false; + + stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); + + /* TODO: Support lane-reducing operation that does not directly participate + in loop reduction. */ + if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0) + return false; + + /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not + recoginized. */ + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def); + gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION); + + for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++) + { + stmt_vec_info def_stmt_info; + slp_tree slp_op; + tree op; + tree vectype; + enum vect_def_type dt; + + if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op, + &slp_op, &dt, &vectype, &def_stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "use not simple.\n"); + return false; + } + + if (!vectype) + { + vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op), + slp_op); + if (!vectype) + return false; + } + + if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } + + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) + continue; + + /* There should be at most one cycle def in the stmt. */ + if (VECTORIZABLE_CYCLE_DEF (dt)) + return false; + } + + tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); + + gcc_assert (vectype_in); + + /* Compute number of effective vector statements for costing. */ + unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node, + vectype_in); + gcc_assert (ncopies_for_cost >= 1); + + if (vect_is_emulated_mixed_dot_prod (stmt_info)) + { + /* We need extra two invariants: one that contains the minimum signed + value and one that contains half of its negative. */ + int prologue_stmts = 2; + unsigned cost = record_stmt_cost (cost_vec, prologue_stmts, + scalar_to_vec, stmt_info, 0, + vect_prologue); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "vectorizable_lane_reducing: " + "extra prologue_cost = %d .\n", cost); + + /* Three dot-products and a subtraction. */ + ncopies_for_cost *= 4; + } + + record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info, + 0, vect_body); + + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + enum tree_code code = gimple_assign_rhs_code (stmt); + vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info, + slp_node, code, type, + vectype_in); + } + + /* Transform via vect_transform_reduction. */ + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; +} + /* Function vectorizable_reduction. Check if STMT_INFO performs a reduction operation that can be vectorized. @@ -7721,7 +7851,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } /* For lane-reducing operation vectorizable analysis needs the - reduction PHI information */ + reduction PHI information. */ STMT_VINFO_REDUC_DEF (def) = phi_info; /* Each lane-reducing operation has its own input vectype, while @@ -7815,18 +7945,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (!type_has_mode_precision_p (op.type)) return false; - /* For lane-reducing ops we're reducing the number of reduction PHIs - which means the only use of that may be in the lane-reducing operation. */ - if (lane_reducing - && reduc_chain_length != 1 - && !only_slp_reduc_chain) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "lane-reducing reduction with extra stmts.\n"); - return false; - } - /* Lane-reducing ops also never can be used in a SLP reduction group since we'll mix lanes belonging to different reductions. But it's OK to use them in a reduction chain or when the reduction group @@ -8366,14 +8484,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; - if (single_defuse_cycle || lane_reducing) + if (single_defuse_cycle && !lane_reducing) { gcc_assert (op.code != COND_EXPR); - /* 4. Supportable by target? */ - bool ok = true; - - /* 4.1. check support for the operation in the loop + /* 4. check support for the operation in the loop This isn't necessary for the lane reduction codes, since they can only be produced by pattern matching, and it's up to the @@ -8382,14 +8497,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, mixed-sign dot-products can be implemented using signed dot-products. */ machine_mode vec_mode = TYPE_MODE (vectype_in); - if (!lane_reducing - && !directly_supported_p (op.code, vectype_in, optab_vector)) + if (!directly_supported_p (op.code, vectype_in, optab_vector)) { if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) || !vect_can_vectorize_without_simd_p (op.code)) - ok = false; + single_defuse_cycle = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); @@ -8402,16 +8516,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, "using word mode not possible.\n"); return false; } - - /* lane-reducing operations have to go through vect_transform_reduction. - For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reducing) - return false; - else - single_defuse_cycle = false; - } } if (dump_enabled_p () && single_defuse_cycle) dump_printf_loc (MSG_NOTE, vect_location, @@ -8419,22 +8523,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "multiple vectors to one in the loop body\n"); STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! single_defuse_cycle) - && lane_reducing) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } + /* For lane-reducing operation, the below processing related to single + defuse-cycle will be done in its own vectorizable function. One more + thing to note is that the operation must not be involved in fold-left + reduction. */ + single_defuse_cycle &= !lane_reducing; if (slp_node - && !(!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION)) + && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8447,28 +8543,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reducing) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (stmt_info)) - /* Three dot-products and a subtraction. */ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. */ - if (!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. */ + if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8746,13 +8834,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo, And vector reduction PHIs are always generated to the full extent, no matter lane-reducing op exists or not. If some copies or PHIs are actually superfluous, they would be cleaned up by passes after - vectorization. An example for single-lane slp is given as below. + vectorization. An example for single-lane slp, lane-reducing ops + with mixed input vectypes in a reduction chain, is given as below. Similarly, this handling is applicable for multiple-lane slp as well. int sum = 1; for (i) { sum += d0[i] * d1[i]; // dot-prod <vector(16) char> + sum += w[i]; // widen-sum <vector(16) char> + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> } The vector size is 128-bit,vectorization factor is 16. Reduction @@ -8769,9 +8860,19 @@ vect_transform_reduction (loop_vec_info loop_vinfo, sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy + + sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); + sum_v1 = sum_v1; // copy + sum_v2 = sum_v2; // copy + sum_v3 = sum_v3; // copy + + sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); + sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); + sum_v2 = sum_v2; // copy + sum_v3 = sum_v3; // copy } - sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 */ unsigned effec_ncopies = vec_oprnds[0].length (); unsigned total_ncopies = vec_oprnds[reduc_index].length (); diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 8530a98e6d69..d717704f57cc 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo, NULL, NULL, node, cost_vec) || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec) || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo), + stmt_info, node, cost_vec) || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info, node, node_instance, cost_vec) || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 1e2121abaffc..d8be89c37891 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2485,6 +2485,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *, extern bool vectorizable_live_operation (vec_info *, stmt_vec_info, slp_tree, slp_instance, int, bool, stmt_vector_for_cost *); +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info, + slp_tree, stmt_vector_for_cost *); extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info, slp_tree, slp_instance, stmt_vector_for_cost *);