Thanks for the patch, looks really good.

Yuliang Wang <yuliang.w...@arm.com> writes:
> +;; Use NBSL for vector NOR.
> +(define_insn_and_rewrite "*aarch64_sve2_nor<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
> +        (unspec:SVE_I
> +          [(match_operand 3)
> +           (and:SVE_I
> +             (not:SVE_I
> +               (match_operand:SVE_I 1 "register_operand" "w, 0, w"))
> +             (not:SVE_I
> +               (match_operand:SVE_I 2 "register_operand" "0, w, w")))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE2"
> +  "@
> +   nbsl\t%0.d, %0.d, %1.d, %0.d
> +   nbsl\t%0.d, %0.d, %2.d, %0.d
> +   movprfx\t%0, %2\;nbsl\t%0.d, %0.d, %1.d, %0.d"
> +  "&& !CONSTANT_P (operands[3])"
> +  {
> +    operands[3] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,*,yes")]
> +)
> +
> +;; Use NBSL for vector NAND.
> +(define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
> +        (unspec:SVE_I
> +          [(match_operand 3)
> +           (ior:SVE_I
> +             (not:SVE_I
> +               (match_operand:SVE_I 1 "register_operand" "w, 0, w"))
> +             (not:SVE_I
> +               (match_operand:SVE_I 2 "register_operand" "0, w, w")))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE2"
> +  "@
> +   nbsl\t%0.d, %0.d, %1.d, %1.d
> +   nbsl\t%0.d, %0.d, %2.d, %2.d
> +   movprfx\t%0, %2\;nbsl\t%0.d, %0.d, %1.d, %1.d"
> +  "&& !CONSTANT_P (operands[3])"
> +  {
> +    operands[3] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,*,yes")]
> +)
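For anyone reading along, the kind of source these two patterns are
meant to catch looks something like the sketch below (illustrative
only; the function names are made up and it is not taken from the
patch's testsuite):

#include <stdint.h>

/* Vector NOR: ~(b | c), i.e. ~b & ~c after De Morgan, which is the
   (and (not ...) (not ...)) form matched above.  */
void
vec_nor (int64_t *restrict a, int64_t *restrict b,
         int64_t *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = ~(b[i] | c[i]);
}

/* Vector NAND: ~(b & c), i.e. ~b | ~c, matching the (ior ...) form.  */
void
vec_nand (int64_t *restrict a, int64_t *restrict b,
          int64_t *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = ~(b[i] & c[i]);
}

With SVE2 enabled (e.g. -O2 -ftree-vectorize -march=armv8.5-a+sve2)
each loop body should then be able to use a single NBSL instead of a
separate NOT and AND/ORR.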
For these two it should in theory be slightly better to use "%" on
operand 1 to make the operation commutative, rather than provide a
separate matching alternative for operand 2.  E.g.:

(define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
        (unspec:SVE_I
          [(match_operand 3)
           (ior:SVE_I
             (not:SVE_I
               (match_operand:SVE_I 1 "register_operand" "%0, w"))
             (not:SVE_I
               (match_operand:SVE_I 2 "register_operand" "w, w")))]
          UNSPEC_PRED_X))]
  "TARGET_SVE2"
  "@
   nbsl\t%0.d, %0.d, %2.d, %2.d
   movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %2.d"
  "&& !CONSTANT_P (operands[3])"
  {
    operands[3] = CONSTM1_RTX (<VPRED>mode);
  }
  [(set_attr "movprfx" "*,yes")]
)

(But the EOR3 pattern is fine as-is, since that supports any
permutation of the three inputs.)

> +;; Unpredicated bitwise select.
> +(define_insn "*aarch64_sve2_bsl<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +        (xor:SVE_I
> +          (and:SVE_I
> +            (xor:SVE_I
> +              (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w")
> +              (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w"))
> +            (match_operand:SVE_I 3 "register_operand" "w, w"))
> +          (match_dup BSL_3RD)))]
> +  "TARGET_SVE2"
> +  "@
> +   bsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d
> +   movprfx\t%0, %<bsl_mov>\;bsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d"
> +  [(set_attr "movprfx" "*,yes")]
> +)

This is sufficiently far from the documented logic that it might be
worth a comment.  E.g.:

;; (op3 ? bsl_mov : bsl_3rd) == (((bsl_mov ^ bsl_3rd) & op3) ^ bsl_3rd)

Similarly for the later patterns.

> +;; Unpredicated bitwise inverted select.
> +(define_insn_and_rewrite "*aarch64_sve2_nbsl<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +        (unspec:SVE_I
> +          [(match_operand 4)
> +           (not:SVE_I
> +             (xor:SVE_I
> +               (and:SVE_I
> +                 (xor:SVE_I
> +                   (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w")
> +                   (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w"))
> +                 (match_operand:SVE_I 3 "register_operand" "w, w"))
> +               (match_dup BSL_3RD)))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE2"
> +  "@
> +   nbsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d
> +   movprfx\t%0, %<bsl_mov>\;nbsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d"
> +  "&& !CONSTANT_P (operands[4])"
> +  {
> +    operands[4] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,yes")]
> +)
> +
> +;; Unpredicated bitwise select with inverted first operand.
> +(define_insn_and_rewrite "*aarch64_sve2_bsl1n<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +        (xor:SVE_I
> +          (and:SVE_I
> +            (unspec:SVE_I
> +              [(match_operand 4)
> +               (not:SVE_I
> +                 (xor:SVE_I
> +                   (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w")
> +                   (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")))]
> +              UNSPEC_PRED_X)
> +            (match_operand:SVE_I 3 "register_operand" "w, w"))
> +          (match_dup BSL_3RD)))]
> +  "TARGET_SVE2"
> +  "@
> +   bsl1n\t%0.d, %0.d, %<bsl_3rd>.d, %3.d
> +   movprfx\t%0, %<bsl_mov>\;bsl1n\t%0.d, %0.d, %<bsl_3rd>.d, %3.d"
> +  "&& !CONSTANT_P (operands[4])"
> +  {
> +    operands[4] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,yes")]
> +)
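As a quick sanity check of the (op3 ? bsl_mov : bsl_3rd) identity
suggested above, in case it helps when writing that comment
(illustrative only, not something to add to the testsuite):

#include <assert.h>
#include <stdint.h>

/* Reference semantics: for each bit, take the bit from MOV where the
   corresponding bit of SEL is set, otherwise the bit from DUP.  */
static uint64_t
bit_select (uint64_t mov, uint64_t dup, uint64_t sel)
{
  return (mov & sel) | (dup & ~sel);
}

int
main (void)
{
  /* Arbitrary values.  */
  uint64_t mov = 0x0123456789abcdefULL;
  uint64_t dup = 0xfedcba9876543210ULL;
  uint64_t sel = 0x00ff00ff00ff00ffULL;

  /* The xor/and/xor form that the BSL pattern matches.  */
  uint64_t bsl = ((mov ^ dup) & sel) ^ dup;

  assert (bsl == bit_select (mov, dup, sel));
  return 0;
}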
> +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (ior:SVE_I > + (and:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (unspec:SVE_I > + [(match_operand 4) > + (and:SVE_I > + (not:SVE_I > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (not:SVE_I > + (match_dup BSL_3RD)))] > + UNSPEC_PRED_X)))] > + "TARGET_SVE2" > + "@ > + bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d > + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) > + > +;; Unpredicated bitwise select with inverted second operand, alternative > form. > +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (ior:SVE_I > + (and:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (unspec:SVE_I > + [(match_operand 4) > + (and:SVE_I > + (not:SVE_I > + (match_dup BSL_3RD)) > + (not:SVE_I > + (match_operand:SVE_I 3 "register_operand" "w, w")))] > + UNSPEC_PRED_X)))] > + "TARGET_SVE2" > + "@ > + bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d > + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) bsl_3rd might be a confusing name given the above, where it ends up being the 4th operand. How about renaming it to bsl_dup instead? (Same for the iterator.) Maybe bsl_tied instead of bsl_mov too. > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > abd14a2f92c06828adfc6d2e2e81b63a6163d3a3..a18565f5c2046c207a281522917eb6e2128aefbf > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -236,6 +236,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) > #define AARCH64_ISA_SVE (aarch64_isa_flags & AARCH64_FL_SVE) > #define AARCH64_ISA_SVE2 (aarch64_isa_flags & AARCH64_FL_SVE2) > +#define AARCH64_ISA_SVE2_AES (aarch64_isa_flags & AARCH64_FL_SVE2_AES) > +#define AARCH64_ISA_SVE2_SM4 (aarch64_isa_flags & AARCH64_FL_SVE2_SM4) > +#define AARCH64_ISA_SVE2_SHA3 (aarch64_isa_flags & > AARCH64_FL_SVE2_SHA3) > +#define AARCH64_ISA_SVE2_BITPERM (aarch64_isa_flags & > AARCH64_FL_SVE2_BITPERM) > #define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) > #define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD) > #define AARCH64_ISA_AES (aarch64_isa_flags & AARCH64_FL_AES) > @@ -285,6 +289,18 @@ extern unsigned aarch64_architecture_version; > /* SVE2 instructions, enabled through +sve2. */ > #define TARGET_SVE2 (AARCH64_ISA_SVE2) > > +/* SVE2 AES instructions, enabled through +sve2-aes. */ > +#define TARGET_SVE2_AES (TARGET_SVE2 && AARCH64_ISA_SVE2_AES) > + > +/* SVE2 SM4 instructions, enabled through +sve2-sm4. */ > +#define TARGET_SVE2_SM4 (TARGET_SVE2 && AARCH64_ISA_SVE2_SM4) > + > +/* SVE2 SHA3 instructions, enabled through +sve2-sha3. */ > +#define TARGET_SVE2_SHA3 (TARGET_SVE2 && AARCH64_ISA_SVE2_SHA3) > + > +/* SVE2 bitperm instructions, enabled through +sve2-bitperm. */ > +#define TARGET_SVE2_BITPERM (TARGET_SVE2 && AARCH64_ISA_SVE2_BITPERM) > + > /* ARMv8.3-A features. 
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index abd14a2f92c06828adfc6d2e2e81b63a6163d3a3..a18565f5c2046c207a281522917eb6e2128aefbf 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -236,6 +236,10 @@ extern unsigned aarch64_architecture_version;
>  #define AARCH64_ISA_F16            (aarch64_isa_flags & AARCH64_FL_F16)
>  #define AARCH64_ISA_SVE            (aarch64_isa_flags & AARCH64_FL_SVE)
>  #define AARCH64_ISA_SVE2           (aarch64_isa_flags & AARCH64_FL_SVE2)
> +#define AARCH64_ISA_SVE2_AES       (aarch64_isa_flags & AARCH64_FL_SVE2_AES)
> +#define AARCH64_ISA_SVE2_SM4       (aarch64_isa_flags & AARCH64_FL_SVE2_SM4)
> +#define AARCH64_ISA_SVE2_SHA3      (aarch64_isa_flags & AARCH64_FL_SVE2_SHA3)
> +#define AARCH64_ISA_SVE2_BITPERM   (aarch64_isa_flags & AARCH64_FL_SVE2_BITPERM)
>  #define AARCH64_ISA_V8_3           (aarch64_isa_flags & AARCH64_FL_V8_3)
>  #define AARCH64_ISA_DOTPROD        (aarch64_isa_flags & AARCH64_FL_DOTPROD)
>  #define AARCH64_ISA_AES            (aarch64_isa_flags & AARCH64_FL_AES)
> @@ -285,6 +289,18 @@ extern unsigned aarch64_architecture_version;
>  /* SVE2 instructions, enabled through +sve2.  */
>  #define TARGET_SVE2 (AARCH64_ISA_SVE2)
>  
> +/* SVE2 AES instructions, enabled through +sve2-aes.  */
> +#define TARGET_SVE2_AES (TARGET_SVE2 && AARCH64_ISA_SVE2_AES)
> +
> +/* SVE2 SM4 instructions, enabled through +sve2-sm4.  */
> +#define TARGET_SVE2_SM4 (TARGET_SVE2 && AARCH64_ISA_SVE2_SM4)
> +
> +/* SVE2 SHA3 instructions, enabled through +sve2-sha3.  */
> +#define TARGET_SVE2_SHA3 (TARGET_SVE2 && AARCH64_ISA_SVE2_SHA3)
> +
> +/* SVE2 bitperm instructions, enabled through +sve2-bitperm.  */
> +#define TARGET_SVE2_BITPERM (TARGET_SVE2 && AARCH64_ISA_SVE2_BITPERM)
> +
>  /* ARMv8.3-A features.  */
>  #define TARGET_ARMV8_3  (AARCH64_ISA_V8_3)

These two hunks aren't needed for the patch.  I think it would be
better to leave them undefined until we need them.

> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 1e321af710bfe80606eedee7e0d191f36c70355b..f6c74cce981aee6fb42704570ad6a00537f60d6e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1611,6 +1611,8 @@
>  
>  (define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT])
>  
> +(define_int_iterator BSL_3RD [1 2])
> +
>  (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
>  
>  (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
> @@ -1976,6 +1978,18 @@
>                             (UNSPEC_RADDHN2 "add")
>                             (UNSPEC_RSUBHN2 "sub")])
>  
> +;; BSL variants: first commutative operand.
> +(define_int_attr bsl_1st [(1 "w") (2 "0")])
> +
> +;; BSL variants: second commutative operand.
> +(define_int_attr bsl_2nd [(1 "0") (2 "w")])
> +
> +;; BSL variants: third input operand.
> +(define_int_attr bsl_3rd [(1 "1") (2 "2")])

Just to show I thought about it :-) this isn't really necessary,
since <BSL_3RD> should work too.  But I agree it looks nicer to have
everything in the same case, so I think it's better to have the
attribute regardless.

> +
> +;; BSL variants: operand which requires preserving via movprfx.
> +(define_int_attr bsl_mov [(1 "2") (2 "1")])
> +
>  (define_int_attr offsetlr [(UNSPEC_SSLI "") (UNSPEC_USLI "")
>                             (UNSPEC_SSRI "offset_")
>                             (UNSPEC_USRI "offset_")])

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..041552f5081238197c455fb667abd19e3c74c4a8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
> +
> +#include <stdint.h>
> +
> +#ifndef OP
> +#define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))
> +#endif
> +
> +#define TYPE(N) int##N##_t
> +
> +#define TEMPLATE(SIZE)                                          \
> +void __attribute__ ((noinline, noclone))                        \
> +f_##SIZE##_##OP                                                 \
> + (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b,               \
> +  TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n)        \
> +{                                                               \
> +  for (int i = 0; i < n; i++)                                   \
> +    a[i] = OP (b[i], c[i], d[i]);                               \
> +}
> +
> +TEMPLATE (8);
> +TEMPLATE (16);
> +TEMPLATE (32);
> +TEMPLATE (64);
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
> +
> +/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.d} } } */
> +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.d} } } */

For these I think:

/* { dg-final { scan-assembler-not {\teor\tz} } } */
/* { dg-final { scan-assembler-not {\tand\tz} } } */

would be better, since it would be possible to use EOR or AND with
zN.b rather than zN.d.  Same for the other files.

Thanks,
Richard