On Thu, Aug 15, 2024 at 10:46 AM Victor Do Nascimento
<victor.donascime...@arm.com> wrote:
>
> From: Victor Do Nascimento <vicdo...@e125768.arm.com>
>
> Given the novel treatment of the dot product optab as a conversion, we
> are now able to target different relationships between output modes and
> input modes.
>
> This is made clearer by way of example. Previously, on AArch64, the
> following loop was vectorizable:
>
> uint32_t udot4(int n, uint8_t* data) {
>   uint32_t sum = 0;
>   for (int i=0; i<n; i+=1)
>     sum += data[i] * data[i];
>   return sum;
> }
>
> while the following was not:
>
> uint32_t udot2(int n, uint16_t* data) {
>   uint32_t sum = 0;
>   for (int i=0; i<n; i+=1)
>     sum += data[i] * data[i];
>   return sum;
> }
>
> Under the new treatment of the dot product optab, they are both now
> vectorizable.
>
> This adds the relevant target-agnostic check to ensure this behaviour
> in the autovectorizer, gated behind the new check_effective_target
> `vect_dotprod_twoway', as well as a runtime check targeting aarch64.

I think vect_dotprod_twoway is not clear - does aarch64 now support
all of the qi->hi, hi->si and si->di variants, while formerly it only
supported the "fourway" qi->si and hi->di?

If it's just hi->si that's now supported, please use vect_dotprod_hisi
to be specific as to what is required/supported.

Richard.

> gcc/testsuite/ChangeLog:
>
>         * lib/target-supports.exp 
> (check_effective_target_vect_dotprod_twoway):
>         New.
>         * gcc.dg/vect/vect-dotprod-twoway.c: Likewise.
>         * gcc.target/aarch64/vect-dotprod-twoway.c: Likewise.
> ---
>  .../gcc.dg/vect/vect-dotprod-twoway.c         | 39 +++++++++++
>  .../gcc.target/aarch64/vect-dotprod-twoway.c  | 65 +++++++++++++++++++
>  gcc/testsuite/lib/target-supports.exp         |  8 +++
>  3 files changed, 112 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c 
> b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c
> new file mode 100644
> index 00000000000..ff6a2559dee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_dotprod_twoway } */
> +/* Ensure both the two-way and four-way dot products are autovectorized.  */
> +#include <stdint.h>
> +
> +uint32_t udot4(int n, uint8_t* data) {
> +  uint32_t sum = 0;
> +  for (int i=0; i<n; i+=1) {
> +    sum += data[i] * data[i];
> +  }
> +  return sum;
> +}
> +
> +int32_t sdot4(int n, int8_t* data) {
> +  int32_t sum = 0;
> +  for (int i=0; i<n; i+=1) {
> +    sum += data[i] * data[i];
> +  }
> +  return sum;
> +}
> +
> +uint32_t udot2(int n, uint16_t* data) {
> +  uint32_t sum = 0;
> +  for (int i=0; i<n; i+=1) {
> +    sum += data[i] * data[i];
> +  }
> +  return sum;
> +}
> +
> +int32_t sdot2(int n, int16_t* data) {
> +  int32_t sum = 0;
> +  for (int i=0; i<n; i+=1) {
> +    sum += data[i] * data[i];
> +  }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 
> 4 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c
> new file mode 100644
> index 00000000000..bac1e1846da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway.c
> @@ -0,0 +1,65 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_dotprod_twoway } */
> +/* { dg-options "-march=armv8-a+sme2 -static -O3 -ftree-vectorize 
> -fdump-tree-vect-details -save-temps" } */
> +/* Ensure runtime correctness in the autovectorized two-way dot product 
> operations.  */
> +
> +#include <stdint.h>
> +#include <stdlib.h>
> +
> +uint32_t
> +udot2 (int n, uint16_t* data)  __arm_streaming
> +{
> +  uint32_t sum = 0;
> +  for (int i=0; i<n; i+=1) {
> +    sum += data[i] * data[i];
> +  }
> +  return sum;
> +}
> +
> +int32_t
> +sdot2 (int n, int16_t* data)  __arm_streaming
> +{
> +  int32_t sum = 0;
> +  for (int i=0; i<n; i+=1) {
> +    sum += data[i] * data[i];
> +  }
> +  return sum;
> +}
> +
> +int
> +main ()
> +{
> +
> +  uint16_t u_input_nil[] = { [0 ... 3] = 0 };
> +  uint16_t u_input_min[] = { [0 ... 3] = 1 };
> +  uint16_t u_input_max[] = { [0 ... 3] = 32767};
> +
> +  uint32_t u_nil_dotprod = udot2 (4, u_input_nil);
> +  uint32_t u_min_dotprod = udot2 (4, u_input_min);
> +  uint32_t u_max_dotprod = udot2 (4, u_input_max);
> +
> +  if (u_nil_dotprod != 0
> +      || u_min_dotprod != 4
> +      || u_max_dotprod != 4294705156)
> +    abort ();
> +
> +  int16_t s_input_nil[] = { [0 ... 3] = 0 };
> +  int16_t s_input_min[] = { [0 ... 3] = -23170 };
> +  int16_t s_input_max[] = { [0 ... 3] =  23170 };
> +
> +  int32_t s_nil_dotprod = sdot2 (4, s_input_nil);
> +  int32_t s_min_dotprod = sdot2 (4, s_input_min);
> +  int32_t s_max_dotprod = sdot2 (4, s_input_max);
> +
> +  if (s_nil_dotprod != 0
> +      || s_min_dotprod != 2147395600
> +      || s_max_dotprod != 2147395600)
> +      abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 
> 46 "vect" } } */
> +/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, 
> z\[0-9\]+.h" } } */
> +/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, 
> z\[0-9\]+.h" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp 
> b/gcc/testsuite/lib/target-supports.exp
> index 11ba77ca404..41618d399a3 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -4258,6 +4258,14 @@ proc check_effective_target_vect_int { } {
>         }}]
>  }
>
> +# Return 1 if the target supports two-way dot products, or 0 otherwise.
> +
> +proc check_effective_target_vect_dotprod_twoway { } {
> +    return [check_cached_effective_target_indexed aarch64_sme2 {
> +       expr { [check_effective_target_aarch64_sme2]
> +    }}]
> +}
> +
>  # Return 1 if the target supports vectorization of early breaks,
>  # 0 otherwise.
>  #
> --
> 2.34.1
>

Reply via email to