Thanks for the patch, looks really good.

Yuliang Wang <yuliang.w...@arm.com> writes:
> +;; Use NBSL for vector NOR.
> +(define_insn_and_rewrite "*aarch64_sve2_nor<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
> +        (unspec:SVE_I
> +          [(match_operand 3)
> +           (and:SVE_I
> +             (not:SVE_I
> +               (match_operand:SVE_I 1 "register_operand" "w, 0, w"))
> +             (not:SVE_I
> +               (match_operand:SVE_I 2 "register_operand" "0, w, w")))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE2"
> +  "@
> +   nbsl\t%0.d, %0.d, %1.d, %0.d
> +   nbsl\t%0.d, %0.d, %2.d, %0.d
> +   movprfx\t%0, %2\;nbsl\t%0.d, %0.d, %1.d, %0.d"
> +  "&& !CONSTANT_P (operands[3])"
> +  {
> +    operands[3] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,*,yes")]
> +)
> +
> +;; Use NBSL for vector NAND.
> +(define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
> +        (unspec:SVE_I
> +          [(match_operand 3)
> +           (ior:SVE_I
> +             (not:SVE_I
> +               (match_operand:SVE_I 1 "register_operand" "w, 0, w"))
> +             (not:SVE_I
> +               (match_operand:SVE_I 2 "register_operand" "0, w, w")))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE2"
> +  "@
> +   nbsl\t%0.d, %0.d, %1.d, %1.d
> +   nbsl\t%0.d, %0.d, %2.d, %2.d
> +   movprfx\t%0, %2\;nbsl\t%0.d, %0.d, %1.d, %1.d"
> +  "&& !CONSTANT_P (operands[3])"
> +  {
> +    operands[3] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,*,yes")]
> +)
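For anyone reading along, the kind of source these two patterns are
meant to catch looks something like the sketch below (illustrative
only; the function names are made up and it is not taken from the
patch's testsuite):

#include <stdint.h>

/* Vector NOR: ~(b | c), i.e. ~b & ~c after De Morgan, which is the
   (and (not ...) (not ...)) form matched above.  */
void
vec_nor (int64_t *restrict a, int64_t *restrict b,
         int64_t *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = ~(b[i] | c[i]);
}

/* Vector NAND: ~(b & c), i.e. ~b | ~c, matching the (ior ...) form.  */
void
vec_nand (int64_t *restrict a, int64_t *restrict b,
          int64_t *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = ~(b[i] & c[i]);
}

With SVE2 enabled (e.g. -O2 -ftree-vectorize -march=armv8.5-a+sve2)
each loop body should then be able to use a single NBSL instead of a
separate NOT and AND/ORR.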
For these two it should in theory be slightly better to use "%" on
operand 1 to make the operation commutative, rather than provide a
separate matching alternative for operand 2.  E.g.:

(define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
        (unspec:SVE_I
          [(match_operand 3)
           (ior:SVE_I
             (not:SVE_I
               (match_operand:SVE_I 1 "register_operand" "%0, w"))
             (not:SVE_I
               (match_operand:SVE_I 2 "register_operand" "w, w")))]
          UNSPEC_PRED_X))]
  "TARGET_SVE2"
  "@
   nbsl\t%0.d, %0.d, %2.d, %2.d
   movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %2.d"
  "&& !CONSTANT_P (operands[3])"
  {
    operands[3] = CONSTM1_RTX (<VPRED>mode);
  }
  [(set_attr "movprfx" "*,yes")]
)

(But the EOR3 pattern is fine as-is, since that supports any
permutation of the three inputs.)

> +;; Unpredicated bitwise select.
> +(define_insn "*aarch64_sve2_bsl<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +        (xor:SVE_I
> +          (and:SVE_I
> +            (xor:SVE_I
> +              (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w")
> +              (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w"))
> +            (match_operand:SVE_I 3 "register_operand" "w, w"))
> +          (match_dup BSL_3RD)))]
> +  "TARGET_SVE2"
> +  "@
> +   bsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d
> +   movprfx\t%0, %<bsl_mov>\;bsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d"
> +  [(set_attr "movprfx" "*,yes")]
> +)

This is sufficiently far from the documented logic that it might be
worth a comment.  E.g.:

;; (op3 ? bsl_mov : bsl_3rd) == (((bsl_mov ^ bsl_3rd) & op3) ^ bsl_3rd)

Similarly for the later patterns.

> +;; Unpredicated bitwise inverted select.
> +(define_insn_and_rewrite "*aarch64_sve2_nbsl<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +        (unspec:SVE_I
> +          [(match_operand 4)
> +           (not:SVE_I
> +             (xor:SVE_I
> +               (and:SVE_I
> +                 (xor:SVE_I
> +                   (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w")
> +                   (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w"))
> +                 (match_operand:SVE_I 3 "register_operand" "w, w"))
> +               (match_dup BSL_3RD)))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE2"
> +  "@
> +   nbsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d
> +   movprfx\t%0, %<bsl_mov>\;nbsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d"
> +  "&& !CONSTANT_P (operands[4])"
> +  {
> +    operands[4] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,yes")]
> +)
> +
> +;; Unpredicated bitwise select with inverted first operand.
> +(define_insn_and_rewrite "*aarch64_sve2_bsl1n<mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
> +        (xor:SVE_I
> +          (and:SVE_I
> +            (unspec:SVE_I
> +              [(match_operand 4)
> +               (not:SVE_I
> +                 (xor:SVE_I
> +                   (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w")
> +                   (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")))]
> +              UNSPEC_PRED_X)
> +            (match_operand:SVE_I 3 "register_operand" "w, w"))
> +          (match_dup BSL_3RD)))]
> +  "TARGET_SVE2"
> +  "@
> +   bsl1n\t%0.d, %0.d, %<bsl_3rd>.d, %3.d
> +   movprfx\t%0, %<bsl_mov>\;bsl1n\t%0.d, %0.d, %<bsl_3rd>.d, %3.d"
> +  "&& !CONSTANT_P (operands[4])"
> +  {
> +    operands[4] = CONSTM1_RTX (<VPRED>mode);
> +  }
> +  [(set_attr "movprfx" "*,yes")]
> +)
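As a quick sanity check of the (op3 ? bsl_mov : bsl_3rd) identity
suggested above, in case it helps when writing that comment
(illustrative only, not something to add to the testsuite):

#include <assert.h>
#include <stdint.h>

/* Reference semantics: for each bit, take the bit from MOV where the
   corresponding bit of SEL is set, otherwise the bit from DUP.  */
static uint64_t
bit_select (uint64_t mov, uint64_t dup, uint64_t sel)
{
  return (mov & sel) | (dup & ~sel);
}

int
main (void)
{
  /* Arbitrary values.  */
  uint64_t mov = 0x0123456789abcdefULL;
  uint64_t dup = 0xfedcba9876543210ULL;
  uint64_t sel = 0x00ff00ff00ff00ffULL;

  /* The xor/and/xor form that the BSL pattern matches.  */
  uint64_t bsl = ((mov ^ dup) & sel) ^ dup;

  assert (bsl == bit_select (mov, dup, sel));
  return 0;
}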
> +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (ior:SVE_I > + (and:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (unspec:SVE_I > + [(match_operand 4) > + (and:SVE_I > + (not:SVE_I > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (not:SVE_I > + (match_dup BSL_3RD)))] > + UNSPEC_PRED_X)))] > + "TARGET_SVE2" > + "@ > + bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d > + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) > + > +;; Unpredicated bitwise select with inverted second operand, alternative > form. > +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (ior:SVE_I > + (and:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (unspec:SVE_I > + [(match_operand 4) > + (and:SVE_I > + (not:SVE_I > + (match_dup BSL_3RD)) > + (not:SVE_I > + (match_operand:SVE_I 3 "register_operand" "w, w")))] > + UNSPEC_PRED_X)))] > + "TARGET_SVE2" > + "@ > + bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d > + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) bsl_3rd might be a confusing name given the above, where it ends up being the 4th operand. How about renaming it to bsl_dup instead? (Same for the iterator.) Maybe bsl_tied instead of bsl_mov too. > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > abd14a2f92c06828adfc6d2e2e81b63a6163d3a3..a18565f5c2046c207a281522917eb6e2128aefbf > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -236,6 +236,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) > #define AARCH64_ISA_SVE (aarch64_isa_flags & AARCH64_FL_SVE) > #define AARCH64_ISA_SVE2 (aarch64_isa_flags & AARCH64_FL_SVE2) > +#define AARCH64_ISA_SVE2_AES (aarch64_isa_flags & AARCH64_FL_SVE2_AES) > +#define AARCH64_ISA_SVE2_SM4 (aarch64_isa_flags & AARCH64_FL_SVE2_SM4) > +#define AARCH64_ISA_SVE2_SHA3 (aarch64_isa_flags & > AARCH64_FL_SVE2_SHA3) > +#define AARCH64_ISA_SVE2_BITPERM (aarch64_isa_flags & > AARCH64_FL_SVE2_BITPERM) > #define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) > #define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD) > #define AARCH64_ISA_AES (aarch64_isa_flags & AARCH64_FL_AES) > @@ -285,6 +289,18 @@ extern unsigned aarch64_architecture_version; > /* SVE2 instructions, enabled through +sve2. */ > #define TARGET_SVE2 (AARCH64_ISA_SVE2) > > +/* SVE2 AES instructions, enabled through +sve2-aes. */ > +#define TARGET_SVE2_AES (TARGET_SVE2 && AARCH64_ISA_SVE2_AES) > + > +/* SVE2 SM4 instructions, enabled through +sve2-sm4. */ > +#define TARGET_SVE2_SM4 (TARGET_SVE2 && AARCH64_ISA_SVE2_SM4) > + > +/* SVE2 SHA3 instructions, enabled through +sve2-sha3. */ > +#define TARGET_SVE2_SHA3 (TARGET_SVE2 && AARCH64_ISA_SVE2_SHA3) > + > +/* SVE2 bitperm instructions, enabled through +sve2-bitperm. */ > +#define TARGET_SVE2_BITPERM (TARGET_SVE2 && AARCH64_ISA_SVE2_BITPERM) > + > /* ARMv8.3-A features. 
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index abd14a2f92c06828adfc6d2e2e81b63a6163d3a3..a18565f5c2046c207a281522917eb6e2128aefbf 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -236,6 +236,10 @@ extern unsigned aarch64_architecture_version;
>  #define AARCH64_ISA_F16            (aarch64_isa_flags & AARCH64_FL_F16)
>  #define AARCH64_ISA_SVE            (aarch64_isa_flags & AARCH64_FL_SVE)
>  #define AARCH64_ISA_SVE2           (aarch64_isa_flags & AARCH64_FL_SVE2)
> +#define AARCH64_ISA_SVE2_AES       (aarch64_isa_flags & AARCH64_FL_SVE2_AES)
> +#define AARCH64_ISA_SVE2_SM4       (aarch64_isa_flags & AARCH64_FL_SVE2_SM4)
> +#define AARCH64_ISA_SVE2_SHA3      (aarch64_isa_flags & AARCH64_FL_SVE2_SHA3)
> +#define AARCH64_ISA_SVE2_BITPERM   (aarch64_isa_flags & AARCH64_FL_SVE2_BITPERM)
>  #define AARCH64_ISA_V8_3           (aarch64_isa_flags & AARCH64_FL_V8_3)
>  #define AARCH64_ISA_DOTPROD        (aarch64_isa_flags & AARCH64_FL_DOTPROD)
>  #define AARCH64_ISA_AES            (aarch64_isa_flags & AARCH64_FL_AES)
> @@ -285,6 +289,18 @@ extern unsigned aarch64_architecture_version;
>  /* SVE2 instructions, enabled through +sve2.  */
>  #define TARGET_SVE2 (AARCH64_ISA_SVE2)
>  
> +/* SVE2 AES instructions, enabled through +sve2-aes.  */
> +#define TARGET_SVE2_AES (TARGET_SVE2 && AARCH64_ISA_SVE2_AES)
> +
> +/* SVE2 SM4 instructions, enabled through +sve2-sm4.  */
> +#define TARGET_SVE2_SM4 (TARGET_SVE2 && AARCH64_ISA_SVE2_SM4)
> +
> +/* SVE2 SHA3 instructions, enabled through +sve2-sha3.  */
> +#define TARGET_SVE2_SHA3 (TARGET_SVE2 && AARCH64_ISA_SVE2_SHA3)
> +
> +/* SVE2 bitperm instructions, enabled through +sve2-bitperm.  */
> +#define TARGET_SVE2_BITPERM (TARGET_SVE2 && AARCH64_ISA_SVE2_BITPERM)
> +
>  /* ARMv8.3-A features.  */
>  #define TARGET_ARMV8_3  (AARCH64_ISA_V8_3)

These two hunks aren't needed for the patch.  I think it would be
better to leave them undefined until we need them.

> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 1e321af710bfe80606eedee7e0d191f36c70355b..f6c74cce981aee6fb42704570ad6a00537f60d6e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1611,6 +1611,8 @@
>  
>  (define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT])
>  
> +(define_int_iterator BSL_3RD [1 2])
> +
>  (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
>  
>  (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
> @@ -1976,6 +1978,18 @@
>                             (UNSPEC_RADDHN2 "add")
>                             (UNSPEC_RSUBHN2 "sub")])
>  
> +;; BSL variants: first commutative operand.
> +(define_int_attr bsl_1st [(1 "w") (2 "0")])
> +
> +;; BSL variants: second commutative operand.
> +(define_int_attr bsl_2nd [(1 "0") (2 "w")])
> +
> +;; BSL variants: third input operand.
> +(define_int_attr bsl_3rd [(1 "1") (2 "2")])

Just to show I thought about it :-) this isn't really necessary,
since <BSL_3RD> should work too.  But I agree it looks nicer to have
everything in the same case, so I think it's better to have the
attribute regardless.

> +
> +;; BSL variants: operand which requires preserving via movprfx.
> +(define_int_attr bsl_mov [(1 "2") (2 "1")])
> +
>  (define_int_attr offsetlr [(UNSPEC_SSLI "") (UNSPEC_USLI "")
>                             (UNSPEC_SSRI "offset_")
>                             (UNSPEC_USRI "offset_")])

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..041552f5081238197c455fb667abd19e3c74c4a8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
> +
> +#include <stdint.h>
> +
> +#ifndef OP
> +#define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))
> +#endif
> +
> +#define TYPE(N) int##N##_t
> +
> +#define TEMPLATE(SIZE)                                          \
> +void __attribute__ ((noinline, noclone))                        \
> +f_##SIZE##_##OP                                                 \
> + (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b,               \
> +  TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n)        \
> +{                                                               \
> +  for (int i = 0; i < n; i++)                                   \
> +    a[i] = OP (b[i], c[i], d[i]);                               \
> +}
> +
> +TEMPLATE (8);
> +TEMPLATE (16);
> +TEMPLATE (32);
> +TEMPLATE (64);
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
> +
> +/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.d} } } */
> +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.d} } } */

For these I think:

/* { dg-final { scan-assembler-not {\teor\tz} } } */
/* { dg-final { scan-assembler-not {\tand\tz} } } */

would be better, since it would be possible to use EOR or AND with
zN.b rather than zN.d.  Same for the other files.

Thanks,
Richard