Re: [PING][PATCH V4 10/10] autovectorizer: Test autovectorization of different dot-prod modes.

Richard Biener Thu, 26 Sep 2024 23:50:13 -0700

On Thu, 26 Sep 2024, Victor Do Nascimento wrote:

> Hello,
> 
> Gentle reminder for this simple renaming update in response to the feedback
> from the last iteration. 🙂


OK.

Thanks,
Richard.

> Thanks,
> Victor
> 
> 
> On 9/5/24 12:05, Victor Do Nascimento wrote:
> > Changes from previous revision:
> > 
> > Rename new `check_effective_target' and tests to make their intent
> > clearer.
> > 
> >    * lib/target-supports.exp: For new `check_effective_target',
> >      s/vect_dotprod_twoway/vect_dotprod_hisi/.
> >    * One test is renamed to `vect-dotprod-conv-optab.c' to emphasize
> >      its aim of checking that the new dotprod convert optab allows
> >      autovectorization of a given datatype to distinct target
> >      data-types.
> >    * The aarch64 runtime-correctness check has had the mode supported
> >      for its two-way dot-product added to the test name, resulting in
> >      the new `vect-dotprod-twoway-hisi.c' name.
> > 
> > ------
> > 
> > Given the novel treatment of the dot product optab as a conversion, we
> > are now able to target different relationships between output modes and
> > input modes.
> > 
> > This is made clearer by way of example. Previously, on AArch64, the
> > following loop was vectorizable:
> > 
> > uint32_t udot4(int n, uint8_t* data) {
> >    uint32_t sum = 0;
> >    for (int i=0; i<n; i+=1)
> >      sum += data[i] * data[i];
> >    return sum;
> > }
> > 
> > while the following was not:
> > 
> > uint32_t udot2(int n, uint16_t* data) {
> >    uint32_t sum = 0;
> >    for (int i=0; i<n; i+=1)
> >      sum += data[i] * data[i];
> >    return sum;
> > }
> > 
> > Under the new treatment of the dot product optab, they are both now
> > vectorizable.
> > 
> > This adds the relevant target-agnostic check to ensure this behavior
> > in the autovectorizer, gated behind the new check_effective_target
> > `vect_dotprod_hisi' as well as a runtime check targeting aarch64.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >  * lib/target-supports.exp (check_effective_target_vect_dotprod_hisi):
> >  New.
> >  * gcc.dg/vect/vect-dotprod-conv-optab.c: Likewise.
> >  * gcc.target/aarch64/vect-dotprod-twoway-hisi.c: Likewise.
> > ---
> >   .../gcc.dg/vect/vect-dotprod-conv-optab.c     | 41 ++++++++++++
> >   .../aarch64/vect-dotprod-twoway-hisi.c        | 66 +++++++++++++++++++
> >   gcc/testsuite/lib/target-supports.exp         |  9 +++
> >   3 files changed, 116 insertions(+)
> >   create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
> >   create mode 100644
> >   gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c
> > 
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
> > b/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
> > new file mode 100644
> > index 00000000000..63e6c95480d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target vect_dotprod_hisi } */
> > +/* Ensure that, given the same input datatype, both the two-way and four-way
> > +   dot products are autovectorized, with the correct operation then selected
> > +   based on the distinct output types.  */
> > +#include <stdint.h>
> > +
> > +uint32_t udot4(int n, uint8_t* data) {
> > +  uint32_t sum = 0;
> > +  for (int i=0; i<n; i+=1) {
> > +    sum += data[i] * data[i];
> > +  }
> > +  return sum;
> > +}
> > +
> > +int32_t sdot4(int n, int8_t* data) {
> > +  int32_t sum = 0;
> > +  for (int i=0; i<n; i+=1) {
> > +    sum += data[i] * data[i];
> > +  }
> > +  return sum;
> > +}
> > +
> > +uint32_t udot2(int n, uint16_t* data) {
> > +  uint32_t sum = 0;
> > +  for (int i=0; i<n; i+=1) {
> > +    sum += data[i] * data[i];
> > +  }
> > +  return sum;
> > +}
> > +
> > +int32_t sdot2(int n, int16_t* data) {
> > +  int32_t sum = 0;
> > +  for (int i=0; i<n; i+=1) {
> > +    sum += data[i] * data[i];
> > +  }
> > +  return sum;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 4 "vect" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c
> > b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c
> > new file mode 100644
> > index 00000000000..0490faa2c94
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c
> > @@ -0,0 +1,66 @@
> > +/* { dg-do run } */
> > +/* { dg-require-effective-target vect_dotprod_hisi } */
> > +/* { dg-options "-static -O3 -ftree-vectorize -fdump-tree-vect-details -save-temps" } */
> > +/* Ensure runtime correctness in the autovectorized two-way dot product operations.  */
> > +
> > +#include <stdint.h>
> > +#include <stdlib.h>
> > +#pragma GCC target "+sme2"
> > +
> > +uint32_t
> > +udot2 (int n, uint16_t* data)  __arm_streaming
> > +{
> > +  uint32_t sum = 0;
> > +  for (int i=0; i<n; i+=1) {
> > +    sum += data[i] * data[i];
> > +  }
> > +  return sum;
> > +}
> > +
> > +int32_t
> > +sdot2 (int n, int16_t* data)  __arm_streaming
> > +{
> > +  int32_t sum = 0;
> > +  for (int i=0; i<n; i+=1) {
> > +    sum += data[i] * data[i];
> > +  }
> > +  return sum;
> > +}
> > +
> > +int
> > +main ()
> > +{
> > +
> > +  uint16_t u_input_nil[] = { [0 ... 3] = 0 };
> > +  uint16_t u_input_min[] = { [0 ... 3] = 1 };
> > +  uint16_t u_input_max[] = { [0 ... 3] = 32767};
> > +
> > +  uint32_t u_nil_dotprod = udot2 (4, u_input_nil);
> > +  uint32_t u_min_dotprod = udot2 (4, u_input_min);
> > +  uint32_t u_max_dotprod = udot2 (4, u_input_max);
> > +
> > +  if (u_nil_dotprod != 0
> > +      || u_min_dotprod != 4
> > +      || u_max_dotprod != 4294705156)
> > +    abort ();
> > +
> > +  int16_t s_input_nil[] = { [0 ... 3] = 0 };
> > +  int16_t s_input_min[] = { [0 ... 3] = -23170 };
> > +  int16_t s_input_max[] = { [0 ... 3] =  23170 };
> > +
> > +  int32_t s_nil_dotprod = sdot2 (4, s_input_nil);
> > +  int32_t s_min_dotprod = sdot2 (4, s_input_min);
> > +  int32_t s_max_dotprod = sdot2 (4, s_input_max);
> > +
> > +  if (s_nil_dotprod != 0
> > +      || s_min_dotprod != 2147395600
> > +      || s_max_dotprod != 2147395600)
> > +      abort ();
> > +
> > +  return 0;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 46 "vect" } } */
> > +/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, z\[0-9\]+.h" } } */
> > +/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, z\[0-9\]+.h" } } */
> > diff --git a/gcc/testsuite/lib/target-supports.exp
> > b/gcc/testsuite/lib/target-supports.exp
> > index 11ba77ca404..ebbc2fb8015 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -4258,6 +4258,15 @@ proc check_effective_target_vect_int { } {
> >     }}]
> >   }
> >   
> > +# Return 1 if the target supports two-way dot products on inputs of hi mode
> > +# producing si outputs, 0 otherwise.
> > +
> > +proc check_effective_target_vect_dotprod_hisi { } {
> > +    return [check_cached_effective_target_indexed aarch64_sme2 {
> > +   expr { [check_effective_target_aarch64_sme2]
> > +    }}]
> > +}
> > +
> >   # Return 1 if the target supports vectorization of early breaks,
> >   # 0 otherwise.
> >   #
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Re: [PING][PATCH V4 10/10] autovectorizer: Test autovectorization of different dot-prod modes.

Reply via email to