From: Victor Do Nascimento <vicdo...@e125768.arm.com> Given the novel treatment of the dot product optab as a conversion, we are now able to target different relationships between output modes and input modes.
This is made clearer by way of example. Previously, on AArch64, the following loop was vectorizable: uint32_t udot4(int n, uint8_t* data) { uint32_t sum = 0; for (int i=0; i<n; i+=1) sum += data[i] * data[i]; return sum; } while the following was not: uint32_t udot2(int n, uint16_t* data) { uint32_t sum = 0; for (int i=0; i<n; i+=1) sum += data[i] * data[i]; return sum; } Under the new treatment of the dot product optab, they are both now vectorizable. This adds the relevant target-agnostic check to ensure this behaviour in the autovectorizer, gated behind the new check_effective_target `vect_dotprod_twoway' as well as a runtime check targeting aarch64. gcc/testsuite/ChangeLog: * lib/target-supports.exp (check_effective_target_vect_dotprod_twoway): New. * gcc.dg/vect/vect-dotprod-twoway.c: New test. * gcc.target/aarch64/vect-dotprod-twoway.c: Likewise. --- .../gcc.dg/vect/vect-dotprod-twoway.c | 39 +++++++++++ .../gcc.target/aarch64/vect-dotprod-twoway.c | 65 +++++++++++++++++++ gcc/testsuite/lib/target-supports.exp | 8 +++ 3 files changed, 112 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c new file mode 100644 index 00000000000..ff6a2559dee --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_dotprod_twoway } */ +/* Ensure both the two-way and four-way dot products are autovectorized.
*/ +#include <stdint.h> + +uint32_t udot4(int n, uint8_t* data) { + uint32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +int32_t sdot4(int n, int8_t* data) { + int32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +uint32_t udot2(int n, uint16_t* data) { + uint32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +int32_t sdot2(int n, int16_t* data) { + int32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c new file mode 100644 index 00000000000..bac1e1846da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c @@ -0,0 +1,65 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vect_dotprod_twoway } */ +/* { dg-options "-march=armv8-a+sme2 -static -O3 -ftree-vectorize -fdump-tree-vect-details -save-temps" } */ +/* Ensure runtime correctness in the autovectorized two-way dot product operations. */ + +#include <stdint.h> +#include <stdlib.h> + +uint32_t +udot2 (int n, uint16_t* data) __arm_streaming +{ + uint32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +int32_t +sdot2 (int n, int16_t* data) __arm_streaming +{ + int32_t sum = 0; + for (int i=0; i<n; i+=1) { + sum += data[i] * data[i]; + } + return sum; +} + +int +main () +{ + + uint16_t u_input_nil[] = { [0 ... 3] = 0 }; + uint16_t u_input_min[] = { [0 ... 3] = 1 }; + uint16_t u_input_max[] = { [0 ... 
3] = 32767}; + + uint32_t u_nil_dotprod = udot2 (4, u_input_nil); + uint32_t u_min_dotprod = udot2 (4, u_input_min); + uint32_t u_max_dotprod = udot2 (4, u_input_max); + + if (u_nil_dotprod != 0 + || u_min_dotprod != 4 + || u_max_dotprod != 4294705156) + abort (); + + int16_t s_input_nil[] = { [0 ... 3] = 0 }; + int16_t s_input_min[] = { [0 ... 3] = -23170 }; + int16_t s_input_max[] = { [0 ... 3] = 23170 }; + + int32_t s_nil_dotprod = sdot2 (4, s_input_nil); + int32_t s_min_dotprod = sdot2 (4, s_input_min); + int32_t s_max_dotprod = sdot2 (4, s_input_max); + + if (s_nil_dotprod != 0 + || s_min_dotprod != 2147395600 + || s_max_dotprod != 2147395600) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 46 "vect" } } */ +/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, z\[0-9\]+.h" } } */ +/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, z\[0-9\]+.h" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 11ba77ca404..41618d399a3 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -4258,6 +4258,14 @@ proc check_effective_target_vect_int { } { }}] } +# Return 1 if the target supports two-way dot products, or 0 otherwise. + +proc check_effective_target_vect_dotprod_twoway { } { + return [check_cached_effective_target_indexed aarch64_sme2 { + expr { [check_effective_target_aarch64_sme2] + }}] +} + # Return 1 if the target supports vectorization of early breaks, # 0 otherwise. # -- 2.34.1