SVE: Tests for use of predicated vector tails for BB SLP

Tamar Christina Tue, 30 Dec 2025 08:26:53 -0800

> -----Original Message-----
> From: Christopher Bazley <[email protected]>
> Sent: 19 December 2025 15:09
> To: [email protected]
> Cc: [email protected]; Tamar Christina
> <[email protected]>
> Subject: [PATCH v8 09/10] AArch64/SVE: Tests for use of predicated vector
> tails for BB SLP
> 
> New tests verify that GCC can generate predicated vector-length
> specific code for AArch64 if the specified vector length is
> shorter than, equal to, or longer than the number of elements to
> be processed (including if the specified length is sufficient but
> the minimum length would not be); other tests verify that GCC can
> generate predicated vector-length agnostic code for AArch64 if
> the minimum length (of 16 bytes) is shorter than, equal to, or
> longer than the number of elements to be processed.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/sve/slp_pred_1.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_1_run.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_2.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_3.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_3_run.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_4.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_5.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_6.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_6_run.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_7.c: New test.
>       * gcc.target/aarch64/sve/slp_pred_harness.h: Test harness
>       shared between tests for vectorization with SVE predication.
> 
> ---
>  .../gcc.target/aarch64/sve/slp_pred_1.c       | 33 ++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_1_run.c   |  6 +++
>  .../gcc.target/aarch64/sve/slp_pred_2.c       | 33 ++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_3.c       | 33 ++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_3_run.c   |  6 +++
>  .../gcc.target/aarch64/sve/slp_pred_4.c       | 33 ++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_5.c       | 36 +++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_6.c       | 39 +++++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_6_run.c   |  6 +++
>  .../gcc.target/aarch64/sve/slp_pred_7.c       | 38 ++++++++++++++++++
>  .../gcc.target/aarch64/sve/slp_pred_harness.h | 28 +++++++++++++
>  11 files changed, 291 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
>  create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
> new file mode 100644
> index 00000000000..4e0a78de02a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=scalable" } */


+sve is superfluous with armv9-a, but more on that in a bit.

> +
> +#include <stdint.h>
> +
> +/* Test that we can vectorize with SVE predication when generating vector-
> length
> +   agnostic code if the minimum possible vector length (of 16 bytes) is 
> larger
> +   than the number of elements to be processed.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1; // one less than the minimum vector length
> +}
> +

These would probably be better done as a check-function-bodies check, because
at the moment it's hard to see the sequence we're after.  I guess we're using
a predicated load + unpredicated add and predicated store.

> +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, xzr, x[0-9]\n} 1 
> } }
> */
> +/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-
> 9]\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], 
> \[x[0-9]\]\n}
> 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
> new file mode 100644
> index 00000000000..7d0a88fec2f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_1_run.c
> @@ -0,0 +1,6 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=scalable" } */

You're using armv9-a, so the above should be aarch64_sve2_hw, since the compiler
is free to pick an SVE2 instruction.  That said, the aarch64-sve.exp file
already adds the right runes, so I'd just remove the -march entirely.

My second question is: why is --param=aarch64-autovec-preference=sve-only needed?

The code should handle more elements than the Adv. SIMD one, so we shouldn't
need it, unless something is wrong in costing?

> +#include "slp_pred_harness.h"
> +#include "slp_pred_1.c"
> +
> +HARNESS (15)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
> new file mode 100644
> index 00000000000..da120ad36f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_2.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=128" } */
> +
> +#include <stdint.h>
> +
> +/* Test that we can vectorize with SVE predication when generating vector-
> length
> +   specific code if the configured vector length is larger than the number of
> +   elements to be processed.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1; // one less than the configured vector length
> +}
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7].b, mul3\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-
> 9]\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], 
> \[x[0-9]\]\n}
> 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
> new file mode 100644
> index 00000000000..184b9615cd9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=scalable" } */
> +
> +#include <stdint.h>
> +
> +/* Test that we can vectorize with SVE predication when generating vector-
> length
> +   agnostic code if the minimum possible vector length (of 16 bytes) is equal
> to
> +   the number of elements to be processed.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1;
> +  x[15] += 2; // exactly fits the minimum vector length
> +}
> +
> +/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
> new file mode 100644
> index 00000000000..5c92b1e0b39
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_3_run.c
> @@ -0,0 +1,6 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=scalable" } */
> +#include "slp_pred_harness.h"
> +#include "slp_pred_3.c"
> +
> +HARNESS (16)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
> new file mode 100644
> index 00000000000..ecb6ee2304a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_4.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=128" } */
> +
> +#include <stdint.h>
> +
> +/* Test that we can vectorize with SVE predication when generating vector-
> length
> +   specific code if the configured vector length is equal to the number of
> +   elements to be processed.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1;
> +  x[15] += 2; // exactly fits the configured vector length
> +}
> +
> +/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
> new file mode 100644
> index 00000000000..076756ff948
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_5.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=256" } */
> +
> +#include <stdint.h>
> +
> +/* Test that we can vectorize with SVE predication when generating
> +   vector-length specific code if the number of elements to be
> +   processed is greater than the minimum possible vector length
> +   (of 16 bytes) but less than the configured vector length.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1;
> +  x[15] += 2;
> +  x[16] += 1; // one more than the minimum vector length
> +}
> +
> +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, xzr, x[0-9]+\n} 1 
> } }
> */
> +/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-
> 9]+\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-
> 9]+\]\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
> new file mode 100644
> index 00000000000..fffb52e8f4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=scalable" } */
> +
> +#include <stdint.h>
> +
> +/* Test that we do not attempt to vectorize with SVE predication when
> +   generating vector-length agnostic code if the minimum possible
> +   vector length (of 16 bytes) is smaller than the number of elements
> +   to be processed.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1;
> +  x[15] += 2;
> +  x[16] += 1; // one more than the minimum vector length
> +}
> +
> +/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tldrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } 
> }
> */
> +/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, 1\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tstrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } 
> }
> */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
> new file mode 100644
> index 00000000000..2147a66abe9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_6_run.c
> @@ -0,0 +1,6 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=scalable" } */
> +#include "slp_pred_harness.h"
> +#include "slp_pred_6.c"
> +
> +HARNESS (17)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
> new file mode 100644
> index 00000000000..82f744c8bbc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_7.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -march=armv9-a+sve --param=aarch64-
> autovec-preference=sve-only -msve-vector-bits=128" } */
> +
> +#include <stdint.h>
> +
> +/* Test that we do not attempt to vectorize with SVE predication when
> +   generating vector-length specific code if the configured vector
> +   length is smaller than the number of elements to be processed.  */
> +
> +void
> +f (uint8_t *x)
> +{
> +  x[0] += 1;
> +  x[1] += 2;
> +  x[2] += 1;
> +  x[3] += 2;
> +  x[4] += 1;
> +  x[5] += 2;
> +  x[6] += 1;
> +  x[7] += 2;
> +  x[8] += 1;
> +  x[9] += 2;
> +  x[10] += 1;
> +  x[11] += 2;
> +  x[12] += 1;
> +  x[13] += 2;
> +  x[14] += 1;
> +  x[15] += 2;
> +  x[16] += 1; // one more than the configured vector length
> +}
> +
> +/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, z[0-
> 9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tstr\tq[0-9]+, \[x[0-9]\]\n} 1 } } */
> +
> +/* { dg-final { scan-assembler-times {\tldrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } 
> }
> */
> +/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, 1\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tstrb\tw[0-9]+, \[x[0-9], 16\]\n} 1 } 
> }
> */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h
> new file mode 100644
> index 00000000000..ac569fc670c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_pred_harness.h
> @@ -0,0 +1,28 @@
> +/* Test harness shared between tests for vectorization with SVE predication.
> */
> +
> +#define HARNESS(N)                                                           
>   \
> +  int __attribute__ ((optimize (1))) main (void)                             
>   \

I'd drop this and the memory barriers below, and just add
__attribute__((noipa)) to the f functions in the testcases.

But no strong feelings here.

General question: I'm curious whether we reject cases that are pathologically
unprofitable.  My concern is around costing.  For instance, you don't have a
testcase with -msve-vector-bits=2048, nor a case with -msve-vector-bits=128
and e.g. 1 or 2 bytes.

This is not a blocker, but I'd like to know what we do here, so I know if any
actions need to be taken in stage 4.

Thanks,
Tamar
> +  {                                                                          
>   \
> +    uint8_t a[N], b[N];                                                      
>   \
> +    for (unsigned int i = 0; i < N; ++i)                                     
>   \
> +      {                                                                      
>   \
> +     a[i] = i * 2 + i % 5;                                                  \
> +     b[i] = a[i];                                                           \
> +     asm volatile ("" ::: "memory");                                        \
> +      }                                                                      
>   \
> +    f (a);                                                                   
>   \
> +    for (unsigned int i = 0; i < N; i += 2)                                  
>   \
> +      {                                                                      
>   \
> +     b[i]++;                                                                \
> +     if (a[i] != b[i])                                                      \
> +       __builtin_abort ();                                                  \
> +     if (i + 1 < N)                                                         \
> +       {                                                                    \
> +         b[i + 1] += 2;                                                     \
> +         if (a[i + 1] != b[i + 1])                                          \
> +           __builtin_abort ();                                              \
> +       }                                                                    \
> +     asm volatile ("" ::: "memory");                                        \
> +      }                                                                      
>   \
> +    return 0;                                                                
>   \
> +  }
> --
> 2.43.0

RE: [PATCH v8 09/10] AArch64/SVE: Tests for use of predicated vector tails for BB SLP

Reply via email to