Re: [PATCH 3/4]AArch64: Add support for boolean reductions for Adv. SIMD

Andrew Pinski Fri, 05 Dec 2025 12:35:14 -0800

On Mon, Oct 20, 2025 at 11:28 PM Tamar Christina
<[email protected]> wrote:
>
> The vectorizer has learned how to do boolean reductions of masks to a C bool
> for the operations OR, XOR and AND.
>
> This implements the new optabs for Adv.SIMD.  Adv.SIMD today can already
> vectorize such loops but does so through SHIFT-AND-INSERT to perform the
> reductions step-wise and inorder.  As an example, an OR reduction today does:
>
>         movi    v3.4s, 0
>         ext     v5.16b, v30.16b, v3.16b, #8
>         orr     v5.16b, v5.16b, v30.16b
>         ext     v29.16b, v5.16b, v3.16b, #4
>         orr     v29.16b, v29.16b, v5.16b
>         ext     v4.16b, v29.16b, v3.16b, #2
>         orr     v4.16b, v4.16b, v29.16b
>         ext     v3.16b, v4.16b, v3.16b, #1
>         orr     v3.16b, v3.16b, v4.16b
>         fmov    w1, s3
>         and     w1, w1, 1
>
> For reducing to a boolean however we don't need the stepwise reduction and can
> just look at the bit patterns. For e.g. OR we now generate:
>
>         umaxp   v3.4s, v3.4s, v3.4s
>         fmov    x1, d3
>         cmp     x1, 0
>         cset    w0, ne
>
> For the remaining codegen see test vect-reduc-bool-9.c.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md (reduc_sbool_and_scal_<mode>,
>         reduc_sbool_ior_scal_<mode>, reduc_sbool_xor_scal_<mode>): New.
>         * config/aarch64/iterators.md (VALLI): New.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vect-reduc-bool-1.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-2.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-3.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-4.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-5.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-6.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-7.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-8.c: New test.
>         * gcc.target/aarch64/vect-reduc-bool-9.c: New test.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> eaa8d57cc41387717affe25ec6694ec3502e3950..5eddc05b5749bbd080a085db2e15dbb9bbce3be3
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3469,6 +3469,87 @@ (define_expand "reduc_plus_scal_v4sf"
>    DONE;
>  })
>
> +;; AND tree reductions.
> +;; Check if after a min pairwise reduction that all the lanes are 1.
> +;;
> +(define_expand "reduc_sbool_and_scal_<mode>"
> +  [(set (match_operand:QI 0 "register_operand")
> +       (unspec:QI [(match_operand:VALLI 1 "register_operand")]
> +                   UNSPEC_ANDV))]
> +  "TARGET_SIMD"
> +{
> +  rtx tmp = operands[1];
> +  /* For 64-bit vectors we need no reductions.  */
> +  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> +    {
> +      /* Always reduce using a V4SI.  */
> +      rtx reduc = gen_lowpart (V4SImode, tmp);
> +      rtx res = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_aarch64_uminpv4si (res, reduc, reduc));
> +      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));


This is wrong: at this point tmp is still operands[1], so this
emit_move_insn overwrites the input operand, whose original value might
still be used later on.
See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123026 for a testcase.
Most likely you need:
  tmp = gen_reg_rtx (<MODE>mode);
before the emit_move_insn, so that the reduced value is written to a
fresh pseudo instead of operands[1].

Thanks,
Andrew

> +    }
> +  rtx val = gen_reg_rtx (DImode);
> +  emit_move_insn (val, gen_lowpart (DImode, tmp));
> +  rtx cc_reg = aarch64_gen_compare_reg (EQ, val, constm1_rtx);
> +  rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, constm1_rtx);
> +  rtx tmp2 = gen_reg_rtx (SImode);
> +  emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
> +  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
> +  DONE;
> +})
> +
> +;; IOR tree reductions.
> +;; Check that after a MAX pairwise reduction any lane is not 0
> +;;
> +(define_expand "reduc_sbool_ior_scal_<mode>"
> +  [(set (match_operand:QI 0 "register_operand")
> +       (unspec:QI [(match_operand:VALLI 1 "register_operand")]
> +                   UNSPEC_IORV))]
> +  "TARGET_SIMD"
> +{
> +  rtx tmp = operands[1];
> +  /* For 64-bit vectors we need no reductions.  */
> +  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> +    {
> +      /* Always reduce using a V4SI.  */
> +      rtx reduc = gen_lowpart (V4SImode, tmp);
> +      rtx res = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
> +      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));

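Likewise here: tmp is still operands[1], so this emit_move_insn
overwrites the input operand; it needs a fresh register as its destination.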

> +    }
> +  rtx val = gen_reg_rtx (DImode);
> +  emit_move_insn (val, gen_lowpart (DImode, tmp));
> +  rtx cc_reg = aarch64_gen_compare_reg (NE, val, const0_rtx);
> +  rtx cmp = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx);
> +  rtx tmp2 = gen_reg_rtx (SImode);
> +  emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
> +  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
> +  DONE;
> +})
> +
> +;; Unpredicated predicate XOR tree reductions.
> +;; Check to see if the number of active lanes in the predicates is a multiple
> +;; of 2.  We use a normal reduction after masking with 0x1.
> +;;
> +(define_expand "reduc_sbool_xor_scal_<mode>"
> +  [(set (match_operand:QI 0 "register_operand")
> +       (unspec:QI [(match_operand:VALLI 1 "register_operand")]
> +                   UNSPEC_XORV))]
> +  "TARGET_SIMD"
> +{
> +  rtx tmp = gen_reg_rtx (<MODE>mode);
> +  rtx one_reg = force_reg (<MODE>mode, CONST1_RTX (<MODE>mode));
> +  emit_move_insn (tmp, gen_rtx_AND (<MODE>mode, operands[1], one_reg));
> +  rtx tmp2 = gen_reg_rtx (<VEL>mode);
> +  emit_insn (gen_reduc_plus_scal_<mode> (tmp2, tmp));
> +  rtx tmp3 = gen_reg_rtx (DImode);
> +  emit_move_insn (tmp3, gen_rtx_AND (DImode,
> +                                    lowpart_subreg (DImode, tmp2, <VEL>mode),
> +                                    const1_rtx));
> +  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
> +  DONE;
> +})
> +
>  ;; SADDLV and UADDLV can be expressed as an ADDV instruction that first
>  ;; sign or zero-extends its elements.
>  (define_insn "aarch64_<su>addlv<mode>"
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 
> 3757998c0ea9831b526a5bbc8568933fc05ed5d4..c369b19507a9bb06ca60e883b19823ded7c01c85
>  100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -217,6 +217,9 @@ (define_mode_iterator V2F [V2SF V2DF])
>  ;; All Advanced SIMD modes on which we support any arithmetic operations.
>  (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF 
> V2DF])
>
> +;; All Advanced SIMD integer modes
> +(define_mode_iterator VALLI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
> +
>  ;; All Advanced SIMD modes suitable for moving, loading, and storing.
>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>                                 V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-1.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..c9b1c85c222e164da0f60f4774469d43036b6afc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +char p[128];
> +
> +bool __attribute__((noipa))
> +fand (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r &= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fior (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r |= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fand (n))
> +      __builtin_abort ();
> +
> +  p[0] = 0;
> +  for (int n = 1; n < 77; ++n)
> +    if (fand (n))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fior (n))
> +      __builtin_abort ();
> +
> +  p[0] = 1;
> +  for (int n = 1; n < 77; ++n)
> +    if (!fior (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-2.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..598d6c71ec84bc7327b01ff94e51f4a213f07ff6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-2.c
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +short p[128];
> +
> +bool __attribute__((noipa))
> +fand (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r &= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fior (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r |= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fand (n))
> +      __builtin_abort ();
> +
> +  p[0] = 0;
> +  for (int n = 1; n < 77; ++n)
> +    if (fand (n))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fior (n))
> +      __builtin_abort ();
> +
> +  p[0] = 1;
> +  for (int n = 1; n < 77; ++n)
> +    if (!fior (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-3.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-3.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..9517965753a7cfdd06b05d9298a14db4bb7112f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-3.c
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +int p[128];
> +
> +bool __attribute__((noipa))
> +fand (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r &= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fior (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r |= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fand (n))
> +      __builtin_abort ();
> +
> +  p[0] = 0;
> +  for (int n = 1; n < 77; ++n)
> +    if (fand (n))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fior (n))
> +      __builtin_abort ();
> +
> +  p[0] = 1;
> +  for (int n = 1; n < 77; ++n)
> +    if (!fior (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-4.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-4.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..3cd577f5ed5929dab45da1e2a23d7af197065767
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-4.c
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +long long p[128];
> +
> +bool __attribute__((noipa))
> +fand (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r &= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fior (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r |= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fand (n))
> +      __builtin_abort ();
> +
> +  p[0] = 0;
> +  for (int n = 1; n < 77; ++n)
> +    if (fand (n))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fior (n))
> +      __builtin_abort ();
> +
> +  p[0] = 1;
> +  for (int n = 1; n < 77; ++n)
> +    if (!fior (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-5.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-5.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..c6fa63b7657ea8a176442b7609b10caf771ecbcf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-5.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +char p[128];
> +
> +bool __attribute__((noipa))
> +fxort (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fxorf (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxort (n) != !(n & 1))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n) != (n & 1))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fxort (n))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-6.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-6.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..6d12e6a7cb4fd45bd43165f35cae68a4762f307b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-6.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +short p[128];
> +
> +bool __attribute__((noipa))
> +fxort (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fxorf (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxort (n) != !(n & 1))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n) != (n & 1))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fxort (n))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-7.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-7.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..58d6a785f9a0c23d3745927ffc2b9df16dfe2ae4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-7.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +int p[128];
> +
> +bool __attribute__((noipa))
> +fxort (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fxorf (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxort (n) != !(n & 1))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n) != (n & 1))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fxort (n))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-8.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-8.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..18ad94a4bd7fe7c87ca0c32cd93f7aee4937cd39
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-8.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fdump-tree-vect-details" }*/
> +
> +long long p[128];
> +
> +bool __attribute__((noipa))
> +fxort (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +bool __attribute__((noipa))
> +fxorf (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +int main()
> +{
> +  __builtin_memset (p, 1, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxort (n) != !(n & 1))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n) != (n & 1))
> +      __builtin_abort ();
> +
> +  __builtin_memset (p, 0, sizeof(p));
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (!fxort (n))
> +      __builtin_abort ();
> +
> +  for (int n = 0; n < 77; ++n)
> +    if (fxorf (n))
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
> target { vect_int && vect_condition } } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-9.c 
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-9.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..7d9a82f5fc3a8104e8fecdebe13cc1bacc6a798a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-9.c
> @@ -0,0 +1,63 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only 
> -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 
> -fdump-tree-vect-details" }*/
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +char p[128];
> +
> +/*
> +** fand:
> +**     ...
> +**     uminp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**     fmov    x[0-9]+, d[0-9]+
> +**     cmn     x[0-9]+, #1
> +**     cset    w[0-9]+, eq
> +**     ...
> +*/
> +bool __attribute__((noipa))
> +fand (int n)
> +{
> +  bool r = true;
> +  for (int i = 0; i < n; ++i)
> +    r &= (p[i] != 0);
> +  return r;
> +}
> +
> +/*
> +** fior:
> +**     ...
> +**     umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**     fmov    x[0-9]+, d[0-9]+
> +**     cmp     x[0-9]+, 0
> +**     cset    w[0-9]+, ne
> +**     ...
> +*/
> +bool __attribute__((noipa))
> +fior (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r |= (p[i] != 0);
> +  return r;
> +}
> +
> +/*
> +** fxor:
> +**     ...
> +**     movi    v[0-9]+.16b, 0x1
> +**     and     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**     addv    b[0-9]+, v[0-9]+.16b
> +**     fmov    w[0-9]+, s[0-9]+
> +**     and     w[0-9]+, w[0-9]+, 1
> +**     ...
> +*/
> +bool __attribute__((noipa))
> +fxor (int n)
> +{
> +  bool r = false;
> +  for (int i = 0; i < n; ++i)
> +    r ^= (p[i] != 0);
> +  return r;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 3 "vect" } 
> } */
> +
>
>
> --

Re: [PATCH 3/4]AArch64: Add support for boolean reductions for Adv. SIMD

Reply via email to