[PING][PATCH V4 10/10] autovectorizer: Test autovectorization of different dot-prod modes.

Victor Do Nascimento Thu, 26 Sep 2024 13:48:13 -0700

Hello,

Gentle reminder for this simple renaming update in response to the feedback from the last iteration. 🙂


Thanks,
Victor


On 9/5/24 12:05, Victor Do Nascimento wrote:

Changes from previous revision:

Rename new `check_effective_target' and tests to make their intent
clearer.

   * lib/target-supports.exp: For new `check_effective_target',
     s/vect_dotprod_twoway/vect_dotprod_hisi/.
   * One test is renamed to `vect-dotprod-conv-optab.c' to emphasize
     aim of checking the new dotprod convert optab allows
     autovectorization of a given datatype to distinct target
     data-types.
   * The aarch64 runtime-correctness check has had the mode supported
     for its two-way dot-product added to the test name, resulting in
     the new `vect-dotprod-twoway-hisi.c' name.

------

Given the novel treatment of the dot product optab as a conversion, we
are now able to target different relationships between output modes and
input modes.

This is made clearer by way of example. Previously, on AArch64, the
following loop was vectorizable:

uint32_t udot4(int n, uint8_t* data) {
   uint32_t sum = 0;
   for (int i=0; i<n; i+=1)
     sum += data[i] * data[i];
   return sum;
}

while the following was not:

uint32_t udot2(int n, uint16_t* data) {
   uint32_t sum = 0;
   for (int i=0; i<n; i+=1)
     sum += data[i] * data[i];
   return sum;
}

Under the new treatment of the dot product optab, they are both now
vectorizable.

This adds the relevant target-agnostic check to ensure this behavior
in the autovectorizer, gated behind the new check_effective_target
`vect_dotprod_hisi', as well as a runtime check targeting aarch64.

gcc/testsuite/ChangeLog:

        * lib/target-supports.exp (check_effective_target_vect_dotprod_hisi):
        New.
        * gcc.dg/vect/vect-dotprod-conv-optab.c: Likewise.
        * gcc.target/aarch64/vect-dotprod-twoway-hisi.c: Likewise.
---
  .../gcc.dg/vect/vect-dotprod-conv-optab.c     | 41 ++++++++++++
  .../aarch64/vect-dotprod-twoway-hisi.c        | 66 +++++++++++++++++++
  gcc/testsuite/lib/target-supports.exp         |  9 +++
  3 files changed, 116 insertions(+)
  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
  create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c b/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
new file mode 100644
index 00000000000..63e6c95480d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-dotprod-conv-optab.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_dotprod_hisi } */
+/* Ensure that, given the same input datatype, both the two-way and four-way
+   dot products are autovectorized, with the correct operation then selected
+   based on the distinct output types.  */
+#include <stdint.h>
+
+uint32_t udot4(int n, uint8_t* data) {
+  uint32_t sum = 0;
+  for (int i=0; i<n; i+=1) {
+    sum += data[i] * data[i];
+  }
+  return sum;
+}
+
+int32_t sdot4(int n, int8_t* data) {
+  int32_t sum = 0;
+  for (int i=0; i<n; i+=1) {
+    sum += data[i] * data[i];
+  }
+  return sum;
+}
+
+uint32_t udot2(int n, uint16_t* data) {
+  uint32_t sum = 0;
+  for (int i=0; i<n; i+=1) {
+    sum += data[i] * data[i];
+  }
+  return sum;
+}
+
+int32_t sdot2(int n, int16_t* data) {
+  int32_t sum = 0;
+  for (int i=0; i<n; i+=1) {
+    sum += data[i] * data[i];
+  }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c
new file mode 100644
index 00000000000..0490faa2c94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-dotprod-twoway-hisi.c
@@ -0,0 +1,66 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_dotprod_hisi } */
+/* { dg-options "-static -O3 -ftree-vectorize -fdump-tree-vect-details -save-temps" } */
+/* Ensure runtime correctness in the autovectorized two-way dot product operations.  */
+
+#include <stdint.h>
+#include <stdlib.h>
+#pragma GCC target "+sme2"
+
+uint32_t
+udot2 (int n, uint16_t* data)  __arm_streaming
+{
+  uint32_t sum = 0;
+  for (int i=0; i<n; i+=1) {
+    sum += data[i] * data[i];
+  }
+  return sum;
+}
+
+int32_t
+sdot2 (int n, int16_t* data)  __arm_streaming
+{
+  int32_t sum = 0;
+  for (int i=0; i<n; i+=1) {
+    sum += data[i] * data[i];
+  }
+  return sum;
+}
+
+int
+main ()
+{
+
+  uint16_t u_input_nil[] = { [0 ... 3] = 0 };
+  uint16_t u_input_min[] = { [0 ... 3] = 1 };
+  uint16_t u_input_max[] = { [0 ... 3] = 32767};
+
+  uint32_t u_nil_dotprod = udot2 (4, u_input_nil);
+  uint32_t u_min_dotprod = udot2 (4, u_input_min);
+  uint32_t u_max_dotprod = udot2 (4, u_input_max);
+
+  if (u_nil_dotprod != 0
+      || u_min_dotprod != 4
+      || u_max_dotprod != 4294705156)
+    abort ();
+
+  int16_t s_input_nil[] = { [0 ... 3] = 0 };
+  int16_t s_input_min[] = { [0 ... 3] = -23170 };
+  int16_t s_input_max[] = { [0 ... 3] =  23170 };
+
+  int32_t s_nil_dotprod = sdot2 (4, s_input_nil);
+  int32_t s_min_dotprod = sdot2 (4, s_input_min);
+  int32_t s_max_dotprod = sdot2 (4, s_input_max);
+
+  if (s_nil_dotprod != 0
+      || s_min_dotprod != 2147395600
+      || s_max_dotprod != 2147395600)
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 46 "vect" } } */
+/* { dg-final { scan-assembler "\[ \t\]udot\tz\[0-9\]+.s, z\[0-9\]+.h, z\[0-9\]+.h" } } */
+/* { dg-final { scan-assembler "\[ \t\]sdot\tz\[0-9\]+.s, z\[0-9\]+.h, z\[0-9\]+.h" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 11ba77ca404..ebbc2fb8015 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4258,6 +4258,15 @@ proc check_effective_target_vect_int { } {
        }}]
  }

+# Return 1 if the target supports two-way dot products on inputs of hi mode
+# producing si outputs, 0 otherwise.
+
+proc check_effective_target_vect_dotprod_hisi { } {
+    return [check_cached_effective_target_indexed aarch64_sme2 {
+       expr { [check_effective_target_aarch64_sme2]
+    }}]
+}
+
  # Return 1 if the target supports vectorization of early breaks,
  # 0 otherwise.
  #

[PING][PATCH V4 10/10] autovectorizer: Test autovectorization of different dot-prod modes.

Reply via email to