SVE: Tests for use of predicated vector tails for BB SLP

Christopher Bazley Tue, 28 Oct 2025 03:25:11 -0700

New tests verify that GCC can generate predicated vector-length
specific code for AArch64 if the specified vector length is
shorter than, equal to, or longer than the number of elements to
be processed (including if the specified length is sufficient but
the minimum length would not be); other tests verify that GCC can
generate predicated vector-length agnostic code for AArch64 if
the minimum length (of 16 bytes) is shorter than, equal to, or
longer than the number of elements to be processed.
---
 .../gcc.target/aarch64/sve/slp_pred_1.c       | 33 ++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_1_run.c   |  6 +++
 .../gcc.target/aarch64/sve/slp_pred_2.c       | 33 ++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_3.c       | 33 ++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_3_run.c   |  6 +++
 .../gcc.target/aarch64/sve/slp_pred_4.c       | 33 ++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_5.c       | 36 +++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_6.c       | 39 +++++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_6_run.c   |  6 +++
 .../gcc.target/aarch64/sve/slp_pred_7.c       | 38 ++++++++++++++++++
 .../gcc.target/aarch64/sve/slp_pred_harness.h | 28 +++++++++++++
 11 files changed, 291 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h


diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
new file mode 100644
index 00000000000..4e0a78de02a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+/* Test that we can vectorize with SVE predication when generating vector-length
+   agnostic code if the minimum possible vector length (of 16 bytes) is larger
+   than the number of elements to be processed.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1; // one less than the minimum vector length
+}
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr, x[0-9]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
new file mode 100644
index 00000000000..7d0a88fec2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
@@ -0,0 +1,6 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+#include "slp_pred_harness.h"
+#include "slp_pred_1.c"
+
+HARNESS (15)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
new file mode 100644
index 00000000000..da120ad36f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=128" } */
+
+#include <stdint.h>
+
+/* Test that we can vectorize with SVE predication when generating vector-length
+   specific code if the configured vector length is larger than the number of
+   elements to be processed.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1; // one less than the configured vector length
+}
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
new file mode 100644
index 00000000000..184b9615cd9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+/* Test that we can vectorize with SVE predication when generating vector-length
+   agnostic code if the minimum possible vector length (of 16 bytes) is equal to
+   the number of elements to be processed.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1;
+  x[15] += 2; // exactly fits the minimum vector length
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
new file mode 100644
index 00000000000..5c92b1e0b39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
@@ -0,0 +1,6 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+#include "slp_pred_harness.h"
+#include "slp_pred_3.c"
+
+HARNESS (16)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
new file mode 100644
index 00000000000..ecb6ee2304a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=128" } */
+
+#include <stdint.h>
+
+/* Test that we can vectorize with SVE predication when generating vector-length
+   specific code if the configured vector length is equal to the number of
+   elements to be processed.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1;
+  x[15] += 2; // exactly fits the configured vector length
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
new file mode 100644
index 00000000000..076756ff948
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Test that we can vectorize with SVE predication when generating
+   vector-length specific code if the number of elements to be
+   processed is greater than the minimum possible vector length
+   (of 16 bytes) but less than the configured vector length.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1;
+  x[15] += 2;
+  x[16] += 1; // one more than the minimum vector length
+}
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr, x[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]+\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
new file mode 100644
index 00000000000..fffb52e8f4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+/* Test that we do not attempt to vectorize with SVE predication when
+   generating vector-length agnostic code if the minimum possible
+   vector length (of 16 bytes) is smaller than the number of elements
+   to be processed.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1;
+  x[15] += 2;
+  x[16] += 1; // one more than the minimum vector length
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tldrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, 1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
new file mode 100644
index 00000000000..2147a66abe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
@@ -0,0 +1,6 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=scalable" } */
+#include "slp_pred_harness.h"
+#include "slp_pred_6.c"
+
+HARNESS (17)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
new file mode 100644
index 00000000000..82f744c8bbc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-autovec-preference=sve-only -msve-vector-bits=128" } */
+
+#include <stdint.h>
+
+/* Test that we do not attempt to vectorize with SVE predication when
+   generating vector-length specific code if the configured vector
+   length is smaller than the number of elements to be processed.  */
+
+void
+f (uint8_t *x)
+{
+  x[0] += 1;
+  x[1] += 2;
+  x[2] += 1;
+  x[3] += 2;
+  x[4] += 1;
+  x[5] += 2;
+  x[6] += 1;
+  x[7] += 2;
+  x[8] += 1;
+  x[9] += 2;
+  x[10] += 1;
+  x[11] += 2;
+  x[12] += 1;
+  x[13] += 2;
+  x[14] += 1;
+  x[15] += 2;
+  x[16] += 1; // one more than the configured vector length
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tldrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, 1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h
new file mode 100644
index 00000000000..ac569fc670c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h
@@ -0,0 +1,28 @@
+/* Test harness shared between tests for vectorization with SVE predication.  */
+
+#define HARNESS(N)                                                             \
+  int __attribute__ ((optimize (1))) main (void)                               \
+  {                                                                            \
+    uint8_t a[N], b[N];                                                        \
+    for (unsigned int i = 0; i < N; ++i)                                       \
+      {                                                                        \
+        a[i] = i * 2 + i % 5;                                                  \
+        b[i] = a[i];                                                           \
+        asm volatile ("" ::: "memory");                                        \
+      }                                                                        \
+    f (a);                                                                     \
+    for (unsigned int i = 0; i < N; i += 2)                                    \
+      {                                                                        \
+        b[i]++;                                                                \
+        if (a[i] != b[i])                                                      \
+          __builtin_abort ();                                                  \
+        if (i + 1 < N)                                                         \
+          {                                                                    \
+            b[i + 1] += 2;                                                     \
+            if (a[i + 1] != b[i + 1])                                          \
+              __builtin_abort ();                                              \
+          }                                                                    \
+        asm volatile ("" ::: "memory");                                        \
+      }                                                                        \
+    return 0;                                                                  \
+  }
-- 
2.43.0

[RFC 9/9] AArch64/SVE: Tests for use of predicated vector tails for BB SLP

Reply via email to