https://gcc.gnu.org/g:1b7bcac0327ccd84f1966c748f4d1aedef64a9c5
commit r16-2203-g1b7bcac0327ccd84f1966c748f4d1aedef64a9c5 Author: Kyrylo Tkachov <ktkac...@nvidia.com> Date: Thu Jul 3 09:45:02 2025 -0700 aarch64: Handle DImode BCAX operations To handle DImode BCAX operations we want to do them on the SIMD side only if the incoming arguments don't require a cross-bank move. This means we need to split back the combination to separate GP BIC+EOR instructions if the operands are expected to be in GP regs through reload. The split happens pre-reload if we already know that the destination will be a GP reg. Otherwise if reload decides to use the "=r,r" alternative we ensure operand 0 is early-clobber. This scheme is similar to how we handle the BSL operations elsewhere in aarch64-simd.md. Thus, for the functions: uint64_t bcax_d_gp (uint64_t a, uint64_t b, uint64_t c) { return BCAX (a, b, c); } uint64x1_t bcax_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return BCAX (a, b, c); } we now generate the desired: bcax_d_gp: bic x1, x1, x2 eor x0, x1, x0 ret bcax_d: bcax v0.16b, v0.16b, v1.16b, v2.16b ret When the inputs are in SIMD regs we use BCAX and when they are in GP regs we don't force them to SIMD with extra moves. Bootstrapped and tested on aarch64-none-linux-gnu. Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com> gcc/ * config/aarch64/aarch64-simd.md (*bcaxqdi4): New define_insn_and_split. gcc/testsuite/ * gcc.target/aarch64/simd/bcax_d.c: Add tests for DImode arguments. 
Diff: --- gcc/config/aarch64/aarch64-simd.md | 29 ++++++++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c | 6 +++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 4493e55603d1..270cb2ff3a12 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9252,6 +9252,35 @@ [(set_attr "type" "crypto_sha3")] ) +(define_insn_and_split "*bcaxqdi4" + [(set (match_operand:DI 0 "register_operand") + (xor:DI + (and:DI + (not:DI (match_operand:DI 3 "register_operand")) + (match_operand:DI 2 "register_operand")) + (match_operand:DI 1 "register_operand")))] + "TARGET_SHA3" + {@ [ cons: =0, 1, 2 , 3 ; attrs: type ] + [ w , w, w , w ; crypto_sha3 ] bcax\t%0.16b, %1.16b, %2.16b, %3.16b + [ &r , r, r0, r0 ; multiple ] # + } + "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + [(set (match_dup 4) + (and:DI (not:DI (match_dup 3)) + (match_dup 2))) + (set (match_dup 0) + (xor:DI (match_dup 4) + (match_dup 1)))] + { + if (reload_completed) + operands[4] = operands[0]; + else if (can_create_pseudo_p ()) + operands[4] = gen_reg_rtx (DImode); + else + FAIL; + } +) + ;; SM3 (define_insn "aarch64_sm3ss1qv4si" diff --git a/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c b/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c index d68f0e102bf1..a7640c3f6f1e 100644 --- a/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c +++ b/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c @@ -7,9 +7,13 @@ #define BCAX(x,y,z) ((x) ^ ((y) & ~(z))) +/* When the inputs come from GP regs don't form a BCAX. 
*/ +uint64_t bcax_d_gp (uint64_t a, uint64_t b, uint64_t c) { return BCAX (a, b, c); } + +uint64x1_t bcax_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return BCAX (a, b, c); } uint32x2_t bcax_s (uint32x2_t a, uint32x2_t b, uint32x2_t c) { return BCAX (a, b, c); } uint16x4_t bcax_h (uint16x4_t a, uint16x4_t b, uint16x4_t c) { return BCAX (a, b, c); } uint8x8_t bcax_b (uint8x8_t a, uint8x8_t b, uint8x8_t c) { return BCAX (a, b, c); } -/* { dg-final { scan-assembler-times {bcax\tv0.16b, v0.16b, v1.16b, v2.16b} 3 } } */ +/* { dg-final { scan-assembler-times {bcax\tv0.16b, v0.16b, v1.16b, v2.16b} 4 } } */