On Thu, 26 Sep 2024, Victor Do Nascimento wrote: > Hello, > > Gentle reminder for this simple renaming update in response to the feedback > from the last iteration. 🙂
OK. Thanks, Richard. > Thanks, > Victor > > > On 9/5/24 12:05, Victor Do Nascimento wrote: > > Changes from previous revision: > > > > Rename new `check_effective_target' and tests to make their intent > > clearer. > > > > * lib/target-supports.exp: For new `check_effective_target', > > s/vect_dotprod_twoway/vect_dotprod_hisi/. > > * One test is renamed to `vect-dotprod-conv-optab.c' to emphasize > > the aim of checking that the new dotprod convert optab allows > > autovectorization of a given datatype to distinct target > > data-types. > > * The aarch64 runtime-correctness check has had the mode supported > > for its two-way dot-product added to the test name, resulting in > > the new `vect-dotprod-twoway-hisi.c' name. > > > > ------ > > > > Given the novel treatment of the dot product optab as a conversion, we > > are now able to target different relationships between output modes and > > input modes. > > > > This is made clearer by way of example. Previously, on AArch64, the > > following loop was vectorizable: > > > > uint32_t udot4(int n, uint8_t* data) { > > uint32_t sum = 0; > > for (int i=0; i<n; i+=1) > > sum += data[i] * data[i]; > > return sum; > > } > > > > while the following was not: > > > > uint32_t udot2(int n, uint16_t* data) { > > uint32_t sum = 0; > > for (int i=0; i<n; i+=1) > > sum += data[i] * data[i]; > > return sum; > > } > > > > Under the new treatment of the dot product optab, they are both now > > vectorizable. > > > > This adds the relevant target-agnostic check to ensure this behavior > > in the autovectorizer, gated behind the new check_effective_target > > `vect_dotprod_hisi' as well as a runtime check targeting aarch64. > > > > gcc/testsuite/ChangeLog: > > > > * lib/target-supports.exp (check_effective_target_vect_dotprod_hisi): > > New. > > * gcc.dg/vect/vect-dotprod-conv-optab.c: Likewise. > > * gcc.target/aarch64/vect-dotprod-twoway-hisi.c: Likewise. 
> > --- > > .../gcc.dg/vect/vect-dotprod-conv-optab.c | 41 ++++++++++++ > > .../aarch64/vect-dotprod-twoway-hisi.c | 66 +++++++++++++++++++ > > gcc/testsuite/lib/target-supports.exp | 9 +++ > > 3 files changed, 116 insertions(+) > > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c > > create mode 100644 > > gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c > > b/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c > > new file mode 100644 > > index 00000000000..63e6c95480d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c > > @@ -0,0 +1,41 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target vect_dotprod_hisi } */ > > +/* Ensure that, given the same input datatype, both the two-way and > > four-way > > + dot products are autovectorized, with the correct operation then > > selected > > + based on the distinct output types. */ > > +#include <stdint.h> > > + > > +uint32_t udot4(int n, uint8_t* data) { > > + uint32_t sum = 0; > > + for (int i=0; i<n; i+=1) { > > + sum += data[i] * data[i]; > > + } > > + return sum; > > +} > > + > > +int32_t sdot4(int n, int8_t* data) { > > + int32_t sum = 0; > > + for (int i=0; i<n; i+=1) { > > + sum += data[i] * data[i]; > > + } > > + return sum; > > +} > > + > > +uint32_t udot2(int n, uint16_t* data) { > > + uint32_t sum = 0; > > + for (int i=0; i<n; i+=1) { > > + sum += data[i] * data[i]; > > + } > > + return sum; > > +} > > + > > +int32_t sdot2(int n, int16_t* data) { > > + int32_t sum = 0; > > + for (int i=0; i<n; i+=1) { > > + sum += data[i] * data[i]; > > + } > > + return sum; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */ > > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: > > detected" 4 "vect" } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c > > 
b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c > > new file mode 100644 > > index 00000000000..0490faa2c94 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c > > @@ -0,0 +1,66 @@ > > +/* { dg-do run } */ > > +/* { dg-require-effective-target vect_dotprod_hisi } */ > > +/* { dg-options "-static -O3 -ftree-vectorize -fdump-tree-vect-details > > -save-temps" } */ > > +/* Ensure runtime correctness in the autovectorized two-way dot product > > operations. */ > > + > > +#include <stdint.h> > > +#include <stdlib.h> > > +#pragma GCC target "+sme2" > > + > > +uint32_t > > +udot2 (int n, uint16_t* data) __arm_streaming > > +{ > > + uint32_t sum = 0; > > + for (int i=0; i<n; i+=1) { > > + sum += data[i] * data[i]; > > + } > > + return sum; > > +} > > + > > +int32_t > > +sdot2 (int n, int16_t* data) __arm_streaming > > +{ > > + int32_t sum = 0; > > + for (int i=0; i<n; i+=1) { > > + sum += data[i] * data[i]; > > + } > > + return sum; > > +} > > + > > +int > > +main () > > +{ > > + > > + uint16_t u_input_nil[] = { [0 ... 3] = 0 }; > > + uint16_t u_input_min[] = { [0 ... 3] = 1 }; > > + uint16_t u_input_max[] = { [0 ... 3] = 32767}; > > + > > + uint32_t u_nil_dotprod = udot2 (4, u_input_nil); > > + uint32_t u_min_dotprod = udot2 (4, u_input_min); > > + uint32_t u_max_dotprod = udot2 (4, u_input_max); > > + > > + if (u_nil_dotprod != 0 > > + || u_min_dotprod != 4 > > + || u_max_dotprod != 4294705156) > > + abort (); > > + > > + int16_t s_input_nil[] = { [0 ... 3] = 0 }; > > + int16_t s_input_min[] = { [0 ... 3] = -23170 }; > > + int16_t s_input_max[] = { [0 ... 
3] = 23170 }; > > + > > + int32_t s_nil_dotprod = sdot2 (4, s_input_nil); > > + int32_t s_min_dotprod = sdot2 (4, s_input_min); > > + int32_t s_max_dotprod = sdot2 (4, s_input_max); > > + > > + if (s_nil_dotprod != 0 > > + || s_min_dotprod != 2147395600 > > + || s_max_dotprod != 2147395600) > > + abort (); > > + > > + return 0; > > +} > > + > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: > > detected" 46 "vect" } } */ > > +/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, > > z\[0-9\]+.h" } } */ > > +/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, > > z\[0-9\]+.h" } } */ > > diff --git a/gcc/testsuite/lib/target-supports.exp > > b/gcc/testsuite/lib/target-supports.exp > > index 11ba77ca404..ebbc2fb8015 100644 > > --- a/gcc/testsuite/lib/target-supports.exp > > +++ b/gcc/testsuite/lib/target-supports.exp > > @@ -4258,6 +4258,15 @@ proc check_effective_target_vect_int { } { > > }}] > > } > > > > +# Return 1 if the target supports two-way dot products on inputs of hi mode > > +# producing si outputs, 0 otherwise. > > + > > +proc check_effective_target_vect_dotprod_hisi { } { > > + return [check_cached_effective_target_indexed aarch64_sme2 { > > + expr { [check_effective_target_aarch64_sme2] > > + }}] > > +} > > + > > # Return 1 if the target supports vectorization of early breaks, > > # 0 otherwise. > > # > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)