On Thu, Aug 15, 2024 at 10:46 AM Victor Do Nascimento <victor.donascime...@arm.com> wrote: > > From: Victor Do Nascimento <vicdo...@e125768.arm.com> > > Given the novel treatment of the dot product optab as a conversion, we > are now able to target different relationships between output modes and > input modes. > > This is made clearer by way of example. Previously, on AArch64, the > following loop was vectorizable: > > uint32_t udot4(int n, uint8_t* data) { > uint32_t sum = 0; > for (int i=0; i<n; i+=1) > sum += data[i] * data[i]; > return sum; > } > > while the following was not: > > uint32_t udot2(int n, uint16_t* data) { > uint32_t sum = 0; > for (int i=0; i<n; i+=1) > sum += data[i] * data[i]; > return sum; > } > > Under the new treatment of the dot product optab, they are both now > vectorizable. > > This adds the relevant target-agnostic check to ensure this behaviour > in the autovectorizer, gated behind the new check_effective_target > `vect_dotprod_twoway' as well as a runtime check targeting aarch64.
I think vect_dotprod_twoway is not clear - does aarch64 now support all of qi->hi, hi->si, si->di variants while formerly only "fourway" qi->si and hi->di? If it's just hi->si that's now supported please use vect_dotprod_hisi to be specific as to what is required/supported. Richard. > gcc/testsuite/ChangeLog: > > * lib/target-supports.exp > (check_effective_target_vect_dotprod_twoway): > New. > * gcc.dg/vect/vect-dotprod-twoway.c: Likewise. > * gcc.target/aarch64/vect-dotprod-twoway.c: Likewise. > --- > .../gcc.dg/vect/vect-dotprod-twoway.c | 39 +++++++++++ > .../gcc.target/aarch64/vect-dotprod-twoway.c | 65 +++++++++++++++++++ > gcc/testsuite/lib/target-supports.exp | 8 +++ > 3 files changed, 112 insertions(+) > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > new file mode 100644 > index 00000000000..ff6a2559dee > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > @@ -0,0 +1,39 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_dotprod_twoway } */ > +/* Ensure both the two-way and four-way dot products are autovectorized. 
*/ > +#include <stdint.h> > + > +uint32_t udot4(int n, uint8_t* data) { > + uint32_t sum = 0; > + for (int i=0; i<n; i+=1) { > + sum += data[i] * data[i]; > + } > + return sum; > +} > + > +int32_t sdot4(int n, int8_t* data) { > + int32_t sum = 0; > + for (int i=0; i<n; i+=1) { > + sum += data[i] * data[i]; > + } > + return sum; > +} > + > +uint32_t udot2(int n, uint16_t* data) { > + uint32_t sum = 0; > + for (int i=0; i<n; i+=1) { > + sum += data[i] * data[i]; > + } > + return sum; > +} > + > +int32_t sdot2(int n, int16_t* data) { > + int32_t sum = 0; > + for (int i=0; i<n; i+=1) { > + sum += data[i] * data[i]; > + } > + return sum; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" > 4 "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > new file mode 100644 > index 00000000000..bac1e1846da > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > @@ -0,0 +1,65 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target vect_dotprod_twoway } */ > +/* { dg-options "-march=armv8-a+sme2 -static -O3 -ftree-vectorize > -fdump-tree-vect-details -save-temps" } */ > +/* Ensure runtime correctness in the autovectorized two-way dot product > operations. */ > + > +#include <stdint.h> > +#include <stdlib.h> > + > +uint32_t > +udot2 (int n, uint16_t* data) __arm_streaming > +{ > + uint32_t sum = 0; > + for (int i=0; i<n; i+=1) { > + sum += data[i] * data[i]; > + } > + return sum; > +} > + > +int32_t > +sdot2 (int n, int16_t* data) __arm_streaming > +{ > + int32_t sum = 0; > + for (int i=0; i<n; i+=1) { > + sum += data[i] * data[i]; > + } > + return sum; > +} > + > +int > +main () > +{ > + > + uint16_t u_input_nil[] = { [0 ... 3] = 0 }; > + uint16_t u_input_min[] = { [0 ... 3] = 1 }; > + uint16_t u_input_max[] = { [0 ... 
3] = 32767}; > + > + uint32_t u_nil_dotprod = udot2 (4, u_input_nil); > + uint32_t u_min_dotprod = udot2 (4, u_input_min); > + uint32_t u_max_dotprod = udot2 (4, u_input_max); > + > + if (u_nil_dotprod != 0 > + || u_min_dotprod != 4 > + || u_max_dotprod != 4294705156) > + abort (); > + > + int16_t s_input_nil[] = { [0 ... 3] = 0 }; > + int16_t s_input_min[] = { [0 ... 3] = -23170 }; > + int16_t s_input_max[] = { [0 ... 3] = 23170 }; > + > + int32_t s_nil_dotprod = sdot2 (4, s_input_nil); > + int32_t s_min_dotprod = sdot2 (4, s_input_min); > + int32_t s_max_dotprod = sdot2 (4, s_input_max); > + > + if (s_nil_dotprod != 0 > + || s_min_dotprod != 2147395600 > + || s_max_dotprod != 2147395600) > + abort (); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" > 46 "vect" } } */ > +/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, > z\[0-9\]+.h" } } */ > +/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, > z\[0-9\]+.h" } } */ > diff --git a/gcc/testsuite/lib/target-supports.exp > b/gcc/testsuite/lib/target-supports.exp > index 11ba77ca404..41618d399a3 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -4258,6 +4258,14 @@ proc check_effective_target_vect_int { } { > }}] > } > > +# Return 1 if the target supports two-way dot products, or 0 otherwise. > + > +proc check_effective_target_vect_dotprod_twoway { } { > + return [check_cached_effective_target_indexed aarch64_sme2 { > + expr { [check_effective_target_aarch64_sme2] > + }}] > +} > + > # Return 1 if the target supports vectorization of early breaks, > # 0 otherwise. > # > -- > 2.34.1 >