The current copysign pattern has a mismatch between its predicates and constraints: operand[2] uses the register_operand predicate, yet one alternative uses the X constraint, which accepts any operand. Since copysign is a floating-point operation, an integer alternative makes no sense. Change the expander to always defer to the AdvSIMD vector variant of copysign, which results in better code. Also add an SVE bitmask move-immediate alternative to the aarch64_simd_mov patterns so that a single move is emitted when SVE is available.
Passes bootstrap and regress, OK for commit? gcc: * config/aarch64/aarch64.md (copysign<GPF:mode>3): Defer to AdvSIMD copysign. (copysign<GPF:mode>3_insn): Remove pattern. * config/aarch64/aarch64-simd.md (aarch64_simd_mov<VDMOV:mode>): Add SVE movimm alternative. (aarch64_simd_mov<VQMOV:mode>): Likewise. Remove redundant V2DI check. (copysign<mode>3): Make global. (ior<mode>3<vczle><vczbe>): Move Neon immediate alternative before the SVE one. testsuite: * gcc.target/aarch64/copysign_3.c: New test. * gcc.target/aarch64/copysign_4.c: New test. * gcc.target/aarch64/fneg-abs_2.c: Allow .2s and .4s. --- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index e70d59380ed295577721f15277c28829d42a0189..3077e920ce623c92d21193124747ff7ad010d006 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -161,6 +161,7 @@ (define_insn_and_split "*aarch64_simd_mov<VDMOV:mode>" [?w, r ; f_mcr , * , *] fmov\t%d0, %1 [?r, r ; mov_reg , * , *] mov\t%0, %1 [w , Dn; neon_move<q> , simd , *] << aarch64_output_simd_mov_immediate (operands[1], 64); + [w , vsl; * , sve , *] mov\t%Z0.<Vetype>, %1 [w , Dz; f_mcr , * , *] fmov\t%d0, xzr [w , Dx; neon_move , simd , 8] # } @@ -190,6 +191,7 @@ (define_insn_and_split "*aarch64_simd_mov<VQMOV:mode>" [?w , r ; multiple , * , 8] # [?r , r ; multiple , * , 8] # [w , Dn; neon_move<q> , simd, 4] << aarch64_output_simd_mov_immediate (operands[1], 128); + [w , vsl; * , sve, 4] mov\t%Z0.<Vetype>, %1 [w , Dz; fmov , * , 4] fmov\t%d0, xzr [w , Dx; neon_move , simd, 8] # } @@ -208,7 +210,6 @@ (define_insn_and_split "*aarch64_simd_mov<VQMOV:mode>" else { if (FP_REGNUM_P (REGNO (operands[0])) - && <MODE>mode == V2DImode && aarch64_maybe_generate_simd_constant (operands[0], operands[1], <MODE>mode)) ; @@ -648,7 +649,7 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle>< [(set_attr "type" "neon_dot<VS:q>")] ) -(define_expand "copysign<mode>3" +(define_expand 
"@copysign<mode>3" [(match_operand:VHSDF 0 "register_operand") (match_operand:VHSDF 1 "register_operand") (match_operand:VHSDF 2 "nonmemory_operand")] @@ -1138,10 +1139,8 @@ (define_insn "ior<mode>3<vczle><vczbe>" "TARGET_SIMD" {@ [ cons: =0 , 1 , 2; attrs: arch ] [ w , w , w ; simd ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype> - [ w , 0 , vsl; sve ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, #%2 - [ w , 0 , Do ; simd ] \ - << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, \ - AARCH64_CHECK_ORR); + [ w , 0 , Do ; simd ] << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, AARCH64_CHECK_ORR); + [ w , 0 , vsl; sve ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, %2 } [(set_attr "type" "neon_logic<q>")] ) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index c54b29cd64b9e0dc6c6d12735049386ccedc5408..e9b148e59abf81cee53cb0dd846af9a62bbad294 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7218,20 +7218,11 @@ (define_expand "lrint<GPF:mode><GPI:mode>2" } ) -;; For copysign (x, y), we want to generate: +;; For copysignf (x, y), we want to generate: ;; -;; LDR d2, #(1 << 63) -;; BSL v2.8b, [y], [x] +;; movi v31.4s, 0x80, lsl 24 +;; bit v0.16b, v1.16b, v31.16b ;; -;; or another, equivalent, sequence using one of BSL/BIT/BIF. Because -;; we expect these operations to nearly always operate on -;; floating-point values, we do not want the operation to be -;; simplified into a bit-field insert operation that operates on the -;; integer side, since typically that would involve three inter-bank -;; register copies. As we do not expect copysign to be followed by -;; other logical operations on the result, it seems preferable to keep -;; this as an unspec operation, rather than exposing the underlying -;; logic to the compiler. 
(define_expand "copysign<GPF:mode>3" [(match_operand:GPF 0 "register_operand") @@ -7239,57 +7230,22 @@ (define_expand "copysign<GPF:mode>3" (match_operand:GPF 2 "nonmemory_operand")] "TARGET_SIMD" { - rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U - << (GET_MODE_BITSIZE (<MODE>mode) - 1)); - /* copysign (x, -1) should instead be expanded as orr with the sign - bit. */ - rtx op2_elt = unwrap_const_vec_duplicate (operands[2]); - if (GET_CODE (op2_elt) == CONST_DOUBLE - && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt))) - { - rtx v_bitmask - = force_reg (V2<V_INT_EQUIV>mode, - gen_const_vec_duplicate (V2<V_INT_EQUIV>mode, - signbit_const)); - - emit_insn (gen_iorv2<v_int_equiv>3 ( - lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode), - lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode), - v_bitmask)); - DONE; - } - - machine_mode int_mode = <V_INT_EQUIV>mode; - rtx bitmask = gen_reg_rtx (int_mode); - emit_move_insn (bitmask, signbit_const); - operands[2] = force_reg (<MODE>mode, operands[2]); - emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2], - bitmask)); + rtx tmp = gen_reg_rtx (<VCONQ>mode); + rtx op1 = lowpart_subreg (<VCONQ>mode, operands[1], <MODE>mode); + rtx op2 = REG_P (operands[2]) + ? 
lowpart_subreg (<VCONQ>mode, operands[2], <MODE>mode) + : gen_const_vec_duplicate (<VCONQ>mode, operands[2]); + emit_insn (gen_copysign3 (<VCONQ>mode, tmp, op1, op2)); + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, tmp, <VCONQ>mode)); DONE; } ) -(define_insn "copysign<GPF:mode>3_insn" - [(set (match_operand:GPF 0 "register_operand") - (unspec:GPF [(match_operand:GPF 1 "register_operand") - (match_operand:GPF 2 "register_operand") - (match_operand:<V_INT_EQUIV> 3 "register_operand")] - UNSPEC_COPYSIGN))] - "TARGET_SIMD" - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: type ] - [ w , w , w , 0 ; neon_bsl<q> ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype> - [ w , 0 , w , w ; neon_bsl<q> ] bit\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype> - [ w , w , 0 , w ; neon_bsl<q> ] bif\t%0.<Vbtype>, %1.<Vbtype>, %3.<Vbtype> - [ r , r , 0 , X ; bfm ] bfxil\t%<w1>0, %<w1>1, #0, <sizem1> - } -) - - -;; For xorsign (x, y), we want to generate: +;; For xorsignf (x, y), we want to generate: ;; -;; LDR d2, #1<<63 -;; AND v3.8B, v1.8B, v2.8B -;; EOR v0.8B, v0.8B, v3.8B +;; movi v31.4s, 0x80, lsl 24 +;; and v31.16b, v31.16b, v1.16b +;; eor v0.16b, v31.16b, v0.16b ;; (define_expand "@xorsign<mode>3" diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_3.c b/gcc/testsuite/gcc.target/aarch64/copysign_3.c new file mode 100644 index 0000000000000000000000000000000000000000..be48682420f1ff84e80af9efd9d11f64bd6e8052 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/copysign_3.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +float f1 (float x, float y) +{ + return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y); +} + +double f2 (double x, double y) +{ + return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y); +} + +/* { dg-final { scan-assembler-times "movi\t" 2 } } */ +/* { dg-final { scan-assembler-not "copysign\tw" } } */ +/* { dg-final { scan-assembler-not "dup\tw" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_4.c 
b/gcc/testsuite/gcc.target/aarch64/copysign_4.c new file mode 100644 index 0000000000000000000000000000000000000000..f3cec2fc9c21a4eaa3b6556479aeb15c04358a1c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/copysign_4.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8-a+sve" } */ + +float f1 (float x, float y) +{ + return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y); +} + +double f2 (double x, double y) +{ + return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y); +} + +/* { dg-final { scan-assembler-times "movi\t" 1 } } */ +/* { dg-final { scan-assembler-times "mov\tz" 1 } } */ +/* { dg-final { scan-assembler-not "copysign\tw" } } */ +/* { dg-final { scan-assembler-not "dup\tw" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c index 18d10ee834d5d9b4361d890447060e78f09d3a73..1544bc5f1a736e95dd8bd2c608405aebb54ded1f 100644 --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c @@ -9,7 +9,7 @@ /* ** f1: -** orr v[0-9]+.2s, #?128, lsl #?24 +** orr v[0-9]+.[24]s, #?128, lsl #?24 ** ret */ float32_t f1 (float32_t a)