Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> v2: Add more testcase fixes.
>
> The current copysign pattern has a mismatch in the predicates and
> constraints - operand[2] is a register_operand but also has an
> alternative X which allows any operand.  Since it is a floating point
> operation, having an integer alternative makes no sense.  Change the
> expander to always use the vector variant of copysign, which results
> in better code.  Add an SVE bitmask move immediate alternative to the
> aarch64_simd_mov patterns so we emit a single move when SVE is
> available.
>
> Passes bootstrap and regress, OK for commit?
>
> gcc:
>	* config/aarch64/aarch64.md (copysign<GPF:mode>3): Defer to
>	AdvSIMD copysign.
>	(copysign<GPF:mode>3_insn): Remove pattern.
>	* config/aarch64/aarch64-simd.md (aarch64_simd_mov<VDMOV:mode>):
>	Add SVE movimm alternative.
>	(aarch64_simd_mov<VQMOV:mode>): Likewise.  Remove redundant
>	V2DI check.
>	(copysign<mode>3): Make global.
>	(ior<mode>3<vczle><vczbe>): Move Neon immediate alternative
>	before the SVE one.
>
> testsuite:
>	* gcc.target/aarch64/copysign_3.c: New test.
>	* gcc.target/aarch64/copysign_4.c: New test.
>	* gcc.target/aarch64/fneg-abs_2.c: Allow .2s and .4s.
>	* gcc.target/aarch64/sve/fneg-abs_1.c: Fixup test.
>	* gcc.target/aarch64/sve/fneg-abs_2.c: Likewise.
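To keep the discussion concrete, the scalar code this expander change is
about is plain copysign, e.g. (a minimal illustration of my own, not taken
from the patch or its testcases; the function name is made up):

/* Per the new comments in the patch below, the intent is to materialise
   the sign-bit mask as a vector immediate (movi, or an SVE bitmask move
   when available) and apply it with bit/bif on the SIMD side, rather
   than via bfxil on the general-register side.  */
double
copy_sign_double (double x, double y)
{
  return __builtin_copysign (x, y);
}
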
This seems to be doing several things at once.  Could you split it up?
E.g. I think the change to the move patterns should be a separate patch,
with its own tests.

Rather than add new alternatives, I think we should expand the definition
of what a "valid" immediate is for TARGET_SIMD vectors when SVE is
enabled, like Pengxuan did in g:a92f54f580c3.

If you still need the removal of "<MODE>mode == V2DImode" after that
change, could you send that separately too, with its own explanation and
test cases?

Could you explain why you flipped the order of the alternatives in:

> @@ -648,7 +649,7 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
>    [(set_attr "type" "neon_dot<VS:q>")]
>  )
>
> -(define_expand "copysign<mode>3"
> +(define_expand "@copysign<mode>3"
>    [(match_operand:VHSDF 0 "register_operand")
>     (match_operand:VHSDF 1 "register_operand")
>     (match_operand:VHSDF 2 "nonmemory_operand")]
> @@ -1138,10 +1139,8 @@ (define_insn "ior<mode>3<vczle><vczbe>"
>    "TARGET_SIMD"
>    {@ [ cons: =0 , 1 , 2  ; attrs: arch ]
>       [ w        , w , w  ; simd        ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> -     [ w        , 0 , vsl; sve         ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
> -     [ w        , 0 , Do ; simd        ] \
> -       << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, \
> -					     AARCH64_CHECK_ORR);
> +     [ w        , 0 , Do ; simd        ] << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, AARCH64_CHECK_ORR);
> +     [ w        , 0 , vsl; sve         ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, %2
>    }
>    [(set_attr "type" "neon_logic<q>")]

?  I'm not opposed, just wasn't sure why it was useful.  (The original
formatting was arguably more correct, since it kept within the
80-character limit.)

>  )
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c54b29cd64b9e0dc6c6d12735049386ccedc5408..e9b148e59abf81cee53cb0dd846af9a62bbad294 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -7218,20 +7218,11 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>    }
>  )
>
> -;; For copysign (x, y), we want to generate:
> +;; For copysignf (x, y), we want to generate:
>  ;;
> -;;   LDR d2, #(1 << 63)
> -;;   BSL v2.8b, [y], [x]
> +;;	movi	v31.4s, 0x80, lsl 24
> +;;	bit	v0.16b, v1.16b, v31.16b
>  ;;
> -;; or another, equivalent, sequence using one of BSL/BIT/BIF.  Because
> -;; we expect these operations to nearly always operate on
> -;; floating-point values, we do not want the operation to be
> -;; simplified into a bit-field insert operation that operates on the
> -;; integer side, since typically that would involve three inter-bank
> -;; register copies.  As we do not expect copysign to be followed by
> -;; other logical operations on the result, it seems preferable to keep
> -;; this as an unspec operation, rather than exposing the underlying
> -;; logic to the compiler.
>
>  (define_expand "copysign<GPF:mode>3"
>    [(match_operand:GPF 0 "register_operand")
> @@ -7239,57 +7230,22 @@ (define_expand "copysign<GPF:mode>3"
>     (match_operand:GPF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
> -			       << (GET_MODE_BITSIZE (<MODE>mode) - 1));
> -  /* copysign (x, -1) should instead be expanded as orr with the sign
> -     bit.  */
> -  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
> -  if (GET_CODE (op2_elt) == CONST_DOUBLE
> -      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
> -    {
> -      rtx v_bitmask
> -	= force_reg (V2<V_INT_EQUIV>mode,
> -		     gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
> -					      signbit_const));
> -
> -      emit_insn (gen_iorv2<v_int_equiv>3 (
> -	lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
> -	lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
> -	v_bitmask));
> -      DONE;
> -    }
> -
> -  machine_mode int_mode = <V_INT_EQUIV>mode;
> -  rtx bitmask = gen_reg_rtx (int_mode);
> -  emit_move_insn (bitmask, signbit_const);
> -  operands[2] = force_reg (<MODE>mode, operands[2]);
> -  emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
> -				       bitmask));
> +  rtx tmp = gen_reg_rtx (<VCONQ>mode);
> +  rtx op1 = lowpart_subreg (<VCONQ>mode, operands[1], <MODE>mode);
> +  rtx op2 = REG_P (operands[2])
> +	    ? lowpart_subreg (<VCONQ>mode, operands[2], <MODE>mode)
> +	    : gen_const_vec_duplicate (<VCONQ>mode, operands[2]);
> +  emit_insn (gen_copysign3 (<VCONQ>mode, tmp, op1, op2));
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, tmp, <VCONQ>mode));
>    DONE;
>  }
>  )

If I understand correctly, the idea here is to bitcast to a vector, do a
vector logical operation, and bitcast back.  That is, we ultimately
generate:

(define_insn "aarch64_simd_bsl<mode>_internal<vczle><vczbe>"
  [(set (match_operand:VDQ_I 0 "register_operand")
	(xor:VDQ_I
	   (and:VDQ_I
	     (xor:VDQ_I
	       (match_operand:<V_INT_EQUIV> 3 "register_operand")
	       (match_operand:VDQ_I 2 "register_operand"))
	     (match_operand:VDQ_I 1 "register_operand"))
	  (match_dup:<V_INT_EQUIV> 3)
	))]
  "TARGET_SIMD"
  {@ [ cons: =0 , 1 , 2 , 3 ]
     [ w        , 0 , w , w ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype>
     [ w        , w , w , 0 ] bit\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>
     [ w        , w , 0 , w ] bif\t%0.<Vbtype>, %3.<Vbtype>, %1.<Vbtype>
  }
  [(set_attr "type" "neon_bsl<q>")]
)

wrapped in scalar-to-vector and vector-to-scalar operations.  The problem
is that this process is entirely transparent to the RTL optimisers, so
they could easily convert this into a scalar integer operation.  (Similar
things happened with the x86 STV pass.)  That might not be a problem yet,
but I think the concern raised in the existing comment is still valid.
So my preference would be to keep the existing scheme for now, unless the
problem can't be solved that way.

> -(define_insn "copysign<GPF:mode>3_insn"
> -  [(set (match_operand:GPF 0 "register_operand")
> -	(unspec:GPF [(match_operand:GPF 1 "register_operand")
> -		     (match_operand:GPF 2 "register_operand")
> -		     (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> -	 UNSPEC_COPYSIGN))]
> -  "TARGET_SIMD"
> -  {@ [ cons: =0 , 1 , 2 , 3 ; attrs: type ]
> -     [ w        , w , w , 0 ; neon_bsl<q> ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>
> -     [ w        , 0 , w , w ; neon_bsl<q> ] bit\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype>
> -     [ w        , w , 0 , w ; neon_bsl<q> ] bif\t%0.<Vbtype>, %1.<Vbtype>, %3.<Vbtype>
> -     [ r        , r , 0 , X ; bfm         ] bfxil\t%<w1>0, %<w1>1, #0, <sizem1>

I agree that this final alternative is problematic though.  A standalone
patch to remove that alternative is OK for trunk and backports.  If you
hit an ICE with it, it would be nice to have a testcase too.
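As an aside, the xor/and/xor form in aarch64_simd_bsl<mode>_internal above
is just a bitwise select: for a mask M, ((A ^ B) & M) ^ B takes A's bits
where M is set and B's bits elsewhere, i.e. it equals (A & M) | (B & ~M),
which is what bsl/bit/bif compute.  A quick standalone check, my own
sketch rather than anything in the tree:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* A is selected where the mask bit is set, B elsewhere; the mask here
     is the DF sign bit, as in copysign.  */
  uint64_t a = 0x123456789abcdef0ULL;
  uint64_t b = 0x0f1e2d3c4b5a6978ULL;
  uint64_t m = 0x8000000000000000ULL;
  assert ((((a ^ b) & m) ^ b) == ((a & m) | (b & ~m)));
  return 0;
}
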
Thanks,
Richard

> -  }
> -)
> -
> -
> -;; For xorsign (x, y), we want to generate:
> +;; For xorsignf (x, y), we want to generate:
>  ;;
> -;;   LDR d2, #1<<63
> -;;   AND v3.8B, v1.8B, v2.8B
> -;;   EOR v0.8B, v0.8B, v3.8B
> +;;	movi	v31.4s, 0x80, lsl 24
> +;;	and	v31.16b, v31.16b, v1.16b
> +;;	eor	v0.16b, v31.16b, v0.16b
>  ;;
>
>  (define_expand "@xorsign<mode>3"
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_3.c b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..be48682420f1ff84e80af9efd9d11f64bd6e8052
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +float f1 (float x, float y)
> +{
> +  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
> +}
> +
> +double f2 (double x, double y)
> +{
> +  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
> +}
> +
> +/* { dg-final { scan-assembler-times "movi\t" 2 } } */
> +/* { dg-final { scan-assembler-not "copysign\tw" } } */
> +/* { dg-final { scan-assembler-not "dup\tw" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_4.c b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f3cec2fc9c21a4eaa3b6556479aeb15c04358a1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8-a+sve" } */
> +
> +float f1 (float x, float y)
> +{
> +  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
> +}
> +
> +double f2 (double x, double y)
> +{
> +  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
> +}
> +
> +/* { dg-final { scan-assembler-times "movi\t" 1 } } */
> +/* { dg-final { scan-assembler-times "mov\tz" 1 } } */
> +/* { dg-final { scan-assembler-not "copysign\tw" } } */
> +/* { dg-final { scan-assembler-not "dup\tw" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> index 18d10ee834d5d9b4361d890447060e78f09d3a73..1544bc5f1a736e95dd8bd2c608405aebb54ded1f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -9,7 +9,7 @@
>
>  /*
>  ** f1:
> -**	orr	v[0-9]+.2s, #?128, lsl #?24
> +**	orr	v[0-9]+.[24]s, #?128, lsl #?24
>  **	ret
>  */
>  float32_t f1 (float32_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> index a8b27199ff83d0eebadfc7dcf03f94e1229d76b8..1ebdc6aaeb102da25ad561b24641e72a652175fa 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> @@ -6,7 +6,7 @@
>
>  /*
>  ** t1:
> -**	orr	z[0-9]+.s, z[0-9]+.s, #-2147483648
> +**	orr	v0.2s, #?128, lsl #?24
>  **	ret
>  */
>  float32x2_t t1 (float32x2_t a)
> @@ -16,7 +16,7 @@ float32x2_t t1 (float32x2_t a)
>
>  /*
>  ** t2:
> -**	orr	z[0-9]+.s, z[0-9]+.s, #-2147483648
> +**	orr	v0.4s, #?128, lsl #?24
>  **	ret
>  */
>  float32x4_t t2 (float32x4_t a)
> @@ -26,7 +26,7 @@ float32x4_t t2 (float32x4_t a)
>
>  /*
>  ** t3:
> -**	orr	z[0-9]+.d, z[0-9]+.d, #-9223372036854775808
> +**	orr	z[0-9]+.d, z[0-9]+.d, -9223372036854775808
>  **	ret
>  */
>  float64x2_t t3 (float64x2_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> index 19a7695e605bc8aced486a9c450d1cdc6be4691a..122152c0ebe4ea6840e418e75a2cadbfc9b0aba4 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> @@ -7,7 +7,7 @@
>
>  /*
>  ** f1:
> -**	orr	z0.s, z0.s, #-2147483648
> +**	orr	v0.4s, #?128, lsl #?24
>  **	ret
>  */
>  float32_t f1 (float32_t a)
> @@ -17,7 +17,7 @@ float32_t f1 (float32_t a)
>
>  /*
>  ** f2:
> -**	orr	z0.d, z0.d, #-9223372036854775808
> +**	orr	z0.d, z0.d, -9223372036854775808
>  **	ret
>  */
>  float64_t f2 (float64_t a)