> -----Original Message----- > From: Richard Biener <richard.guent...@gmail.com> > Sent: Tuesday, August 20, 2024 1:54 PM > To: Tamar Christina <tamar.christ...@arm.com> > Cc: Victor Do Nascimento <victor.donascime...@arm.com>; gcc- > patc...@gcc.gnu.org; claz...@gmail.com; hongtao....@intel.com; > s...@gcc.gnu.org; bernds_...@t-online.de; al...@redhat.com; Victor Do > Nascimento <vicdo...@e125768.arm.com> > Subject: Re: [PATCH V3 10/10] autovectorizer: Test autovectorization of > different > dot-prod modes. > > On Tue, Aug 20, 2024 at 12:04 PM Tamar Christina > <tamar.christ...@arm.com> wrote: > > > > > -----Original Message----- > > > From: Richard Biener <richard.guent...@gmail.com> > > > Sent: Tuesday, August 20, 2024 10:37 AM > > > To: Victor Do Nascimento <victor.donascime...@arm.com> > > > Cc: gcc-patches@gcc.gnu.org; Tamar Christina <tamar.christ...@arm.com>; > > > claz...@gmail.com; hongtao....@intel.com; s...@gcc.gnu.org; bernds_cb1@t- > > > online.de; al...@redhat.com; Victor Do Nascimento > > > <vicdo...@e125768.arm.com> > > > Subject: Re: [PATCH V3 10/10] autovectorizer: Test autovectorization of > different > > > dot-prod modes. > > > > > > On Thu, Aug 15, 2024 at 10:46 AM Victor Do Nascimento > > > <victor.donascime...@arm.com> wrote: > > > > > > > > From: Victor Do Nascimento <vicdo...@e125768.arm.com> > > > > > > > > Given the novel treatment of the dot product optab as a conversion, we > > > > are now able to target different relationships between output modes and > > > > input modes. > > > > > > > > This is made clearer by way of example.
Previously, on AArch64, the > > > > following loop was vectorizable: > > > > > > > > uint32_t udot4(int n, uint8_t* data) { > > > > uint32_t sum = 0; > > > > for (int i=0; i<n; i+=1) > > > > sum += data[i] * data[i]; > > > > return sum; > > > > } > > > > > > > > while the following was not: > > > > > > > > uint32_t udot2(int n, uint16_t* data) { > > > > uint32_t sum = 0; > > > > for (int i=0; i<n; i+=1) > > > > sum += data[i] * data[i]; > > > > return sum; > > > > } > > > > > > > > Under the new treatment of the dot product optab, they are both now > > > > vectorizable. > > > > > > > > This adds the relevant target-agnostic check to ensure this behaviour > > > > in the autovectorizer, gated behind the new check_effective_target > > > > `vect_dotprod_twoway' as well as a runtime check targeting aarch64. > > > > > > I think vect_dotprod_twoway is not clear - does aarch64 now support > > > all of qi->hi, hi->si, si->di variants while formerly only "fourway" > > > qi->si and hi->di? > > > > > > If it's just hi->si that's now supported please use vect_dotprod_hisi > > > to be specific as to what is required/supported. > > > > At the moment we support only H->S, but other variants are being considered. > > So perhaps vect_dotprod_single_step or something to make it clearer what it > > does but still be general? > > But to be useful in the testsuite it has to be specific, not general.
Oh, I see, you were talking specifically about the testsuite function and not the optab itself. Yes, agreed. Sorry, I misunderstood! Tamar > > > Thanks, > > Tamar > > > > > > > > Richard. > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > * lib/target-supports.exp > (check_effective_target_vect_dotprod_twoway): > > > > New. > > > > * gcc.dg/vect/vect-dotprod-twoway.c: Likewise. > > > > * gcc.target/aarch64/vect-dotprod-twoway.c: Likewise. > > > > --- > > > > .../gcc.dg/vect/vect-dotprod-twoway.c | 39 +++++++++++ > > > > .../gcc.target/aarch64/vect-dotprod-twoway.c | 65 > +++++++++++++++++++ > > > > gcc/testsuite/lib/target-supports.exp | 8 +++ > > > > 3 files changed, 112 insertions(+) > > > > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > > > > create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-dotprod- > twoway.c > > > > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > > > b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > > > > new file mode 100644 > > > > index 00000000000..ff6a2559dee > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > > > > @@ -0,0 +1,39 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-require-effective-target vect_dotprod_twoway } */ > > > > +/* Ensure both the two-way and four-way dot products are > > > > autovectorized.
> */ > > > > +#include <stdint.h> > > > > + > > > > +uint32_t udot4(int n, uint8_t* data) { > > > > + uint32_t sum = 0; > > > > + for (int i=0; i<n; i+=1) { > > > > + sum += data[i] * data[i]; > > > > + } > > > > + return sum; > > > > +} > > > > + > > > > +int32_t sdot4(int n, int8_t* data) { > > > > + int32_t sum = 0; > > > > + for (int i=0; i<n; i+=1) { > > > > + sum += data[i] * data[i]; > > > > + } > > > > + return sum; > > > > +} > > > > + > > > > +uint32_t udot2(int n, uint16_t* data) { > > > > + uint32_t sum = 0; > > > > + for (int i=0; i<n; i+=1) { > > > > + sum += data[i] * data[i]; > > > > + } > > > > + return sum; > > > > +} > > > > + > > > > +int32_t sdot2(int n, int16_t* data) { > > > > + int32_t sum = 0; > > > > + for (int i=0; i<n; i+=1) { > > > > + sum += data[i] * data[i]; > > > > + } > > > > + return sum; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } > > > > */ > > > > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: > detected" > > > 4 "vect" } } */ > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > > > b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > > > > new file mode 100644 > > > > index 00000000000..bac1e1846da > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c > > > > @@ -0,0 +1,65 @@ > > > > +/* { dg-do run } */ > > > > +/* { dg-require-effective-target vect_dotprod_twoway } */ > > > > +/* { dg-options "-march=armv8-a+sme2 -static -O3 -ftree-vectorize > > > > -fdump- > > > tree-vect-details -save-temps" } */ > > > > +/* Ensure runtime correctness in the autovectorized two-way dot product > > > operations. 
*/ > > > > + > > > > +#include <stdint.h> > > > > +#include <stdlib.h> > > > > + > > > > +uint32_t > > > > +udot2 (int n, uint16_t* data) __arm_streaming > > > > +{ > > > > + uint32_t sum = 0; > > > > + for (int i=0; i<n; i+=1) { > > > > + sum += data[i] * data[i]; > > > > + } > > > > + return sum; > > > > +} > > > > + > > > > +int32_t > > > > +sdot2 (int n, int16_t* data) __arm_streaming > > > > +{ > > > > + int32_t sum = 0; > > > > + for (int i=0; i<n; i+=1) { > > > > + sum += data[i] * data[i]; > > > > + } > > > > + return sum; > > > > +} > > > > + > > > > +int > > > > +main () > > > > +{ > > > > + > > > > + uint16_t u_input_nil[] = { [0 ... 3] = 0 }; > > > > + uint16_t u_input_min[] = { [0 ... 3] = 1 }; > > > > + uint16_t u_input_max[] = { [0 ... 3] = 32767}; > > > > + > > > > + uint32_t u_nil_dotprod = udot2 (4, u_input_nil); > > > > + uint32_t u_min_dotprod = udot2 (4, u_input_min); > > > > + uint32_t u_max_dotprod = udot2 (4, u_input_max); > > > > + > > > > + if (u_nil_dotprod != 0 > > > > + || u_min_dotprod != 4 > > > > + || u_max_dotprod != 4294705156) > > > > + abort (); > > > > + > > > > + int16_t s_input_nil[] = { [0 ... 3] = 0 }; > > > > + int16_t s_input_min[] = { [0 ... 3] = -23170 }; > > > > + int16_t s_input_max[] = { [0 ... 
3] = 23170 }; > > > > + > > > > + int32_t s_nil_dotprod = sdot2 (4, s_input_nil); > > > > + int32_t s_min_dotprod = sdot2 (4, s_input_min); > > > > + int32_t s_max_dotprod = sdot2 (4, s_input_max); > > > > + > > > > + if (s_nil_dotprod != 0 > > > > + || s_min_dotprod != 2147395600 > > > > + || s_max_dotprod != 2147395600) > > > > + abort (); > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } > > > > */ > > > > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: > detected" > > > 46 "vect" } } */ > > > > +/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, > > > > z\[0- > > > 9\]+.h" } } */ > > > > +/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, > > > > z\[0- > 9\]+.h" > > > } } */ > > > > diff --git a/gcc/testsuite/lib/target-supports.exp > > > > b/gcc/testsuite/lib/target- > > > supports.exp > > > > index 11ba77ca404..41618d399a3 100644 > > > > --- a/gcc/testsuite/lib/target-supports.exp > > > > +++ b/gcc/testsuite/lib/target-supports.exp > > > > @@ -4258,6 +4258,14 @@ proc check_effective_target_vect_int { } { > > > > }}] > > > > } > > > > > > > > +# Return 1 if the target supports two-way dot products, or 0 otherwise. > > > > + > > > > +proc check_effective_target_vect_dotprod_twoway { } { > > > > + return [check_cached_effective_target_indexed aarch64_sme2 { > > > > + expr { [check_effective_target_aarch64_sme2] > > > > + }}] > > > > +} > > > > + > > > > # Return 1 if the target supports vectorization of early breaks, > > > > # 0 otherwise. > > > > # > > > > -- > > > > 2.34.1 > > > >