Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> v2: Add more testcase fixes.
>
> The current copysign pattern has a mismatch in the predicates and
> constraints - operand[2] is a register_operand but also has an
> alternative X which allows any operand.  Since it is a floating-point
> operation, having an integer alternative makes no sense.  Change the
> expander to always use the vector variant of copysign, which results
> in better code.  Add an SVE bitmask move immediate alternative to the
> aarch64_simd_mov patterns so we emit a single move when SVE is
> available.
>
> Passes bootstrap and regress, OK for commit?
>
> gcc:
>         * config/aarch64/aarch64.md (copysign<GPF:mode>3): Defer to
>         AdvSIMD copysign.
>         (copysign<GPF:mode>3_insn): Remove pattern.
>         * config/aarch64/aarch64-simd.md (aarch64_simd_mov<VDMOV:mode>):
>         Add SVE movimm alternative.
>         (aarch64_simd_mov<VQMOV:mode>): Likewise.  Remove redundant
>         V2DI check.
>         (copysign<mode>3): Make global.
>         (ior<mode>3<vczle><vczbe>): Move Neon immediate alternative
>         before the SVE one.
>
> testsuite:
>         * gcc.target/aarch64/copysign_3.c: New test.
>         * gcc.target/aarch64/copysign_4.c: New test.
>         * gcc.target/aarch64/fneg-abs_2.c: Allow .2s and .4s.
>         * gcc.target/aarch64/sve/fneg-abs_1.c: Fixup test.
>         * gcc.target/aarch64/sve/fneg-abs_2.c: Likewise.

This seems to be doing several things at once.  Could you split it up?

E.g. I think the change to the move patterns should be a separate patch,
with its own tests.  Rather than add new alternatives, I think we should
expand the definition of what a "valid" immediate is for TARGET_SIMD vectors
when SVE is enabled, like Pengxuan did in g:a92f54f580c3.
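
Roughly the direction I mean, as a sketch (the helper names below are
my assumptions about the current backend, not taken from Pengxuan's
patch): the Advanced SIMD immediate check would also accept SVE
bitmask immediates when SVE is available, so the individual patterns
wouldn't need new alternatives at all.

/* Sketch only: treat an immediate as valid if either the usual
   Advanced SIMD MOVI/MVNI/FMOV check passes, or SVE is enabled and
   the value is a bitmask immediate that a single SVE MOV/DUPM can
   load.  */
static bool
simd_or_sve_mov_imm_p (rtx op)
{
  if (aarch64_simd_valid_immediate (op, NULL))
    return true;
  return TARGET_SVE && aarch64_sve_bitmask_immediate_p (op);
}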

If you still need the removal of "<MODE>mode == V2DImode" after that change,
could you send that separately too, with its own explanation and test cases?

Could you explain why you flipped the order of the alternatives in:
>
> @@ -648,7 +649,7 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
>    [(set_attr "type" "neon_dot<VS:q>")]
>  )
>  
> -(define_expand "copysign<mode>3"
> +(define_expand "@copysign<mode>3"
>    [(match_operand:VHSDF 0 "register_operand")
>     (match_operand:VHSDF 1 "register_operand")
>     (match_operand:VHSDF 2 "nonmemory_operand")]
> @@ -1138,10 +1139,8 @@ (define_insn "ior<mode>3<vczle><vczbe>"
>    "TARGET_SIMD"
>    {@ [ cons: =0 , 1 , 2; attrs: arch ]
>      [ w        , w , w  ; simd      ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> -     [ w        , 0 , vsl; sve       ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
> -     [ w        , 0 , Do ; simd      ] \
> -       << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, \
> -                                          AARCH64_CHECK_ORR);
> +     [ w        , 0 , Do ; simd      ] << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, AARCH64_CHECK_ORR);
> +     [ w        , 0 , vsl; sve       ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, %2
>    }
>    [(set_attr "type" "neon_logic<q>")]

?  I'm not opposed, I just wasn't sure why it was useful.

(The original formatting was arguably more correct, since it kept within
the 80-character limit.)

>  )
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c54b29cd64b9e0dc6c6d12735049386ccedc5408..e9b148e59abf81cee53cb0dd846af9a62bbad294 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -7218,20 +7218,11 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>  }
>  )
>  
> -;; For copysign (x, y), we want to generate:
> +;; For copysignf (x, y), we want to generate:
>  ;;
> -;;   LDR d2, #(1 << 63)
> -;;   BSL v2.8b, [y], [x]
> +;;   movi    v31.4s, 0x80, lsl 24
> +;;   bit     v0.16b, v1.16b, v31.16b
>  ;;
> -;; or another, equivalent, sequence using one of BSL/BIT/BIF.  Because
> -;; we expect these operations to nearly always operate on
> -;; floating-point values, we do not want the operation to be
> -;; simplified into a bit-field insert operation that operates on the
> -;; integer side, since typically that would involve three inter-bank
> -;; register copies.  As we do not expect copysign to be followed by
> -;; other logical operations on the result, it seems preferable to keep
> -;; this as an unspec operation, rather than exposing the underlying
> -;; logic to the compiler.
>  
>  (define_expand "copysign<GPF:mode>3"
>    [(match_operand:GPF 0 "register_operand")
> @@ -7239,57 +7230,22 @@ (define_expand "copysign<GPF:mode>3"
>     (match_operand:GPF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
> -                            << (GET_MODE_BITSIZE (<MODE>mode) - 1));
> -  /* copysign (x, -1) should instead be expanded as orr with the sign
> -     bit.  */
> -  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
> -  if (GET_CODE (op2_elt) == CONST_DOUBLE
> -      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
> -    {
> -      rtx v_bitmask
> -     = force_reg (V2<V_INT_EQUIV>mode,
> -                  gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
> -                                           signbit_const));
> -
> -      emit_insn (gen_iorv2<v_int_equiv>3 (
> -     lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
> -     lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
> -     v_bitmask));
> -      DONE;
> -    }
> -
> -  machine_mode int_mode = <V_INT_EQUIV>mode;
> -  rtx bitmask = gen_reg_rtx (int_mode);
> -  emit_move_insn (bitmask, signbit_const);
> -  operands[2] = force_reg (<MODE>mode, operands[2]);
> -  emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
> -                                    bitmask));
> +  rtx tmp = gen_reg_rtx (<VCONQ>mode);
> +  rtx op1 = lowpart_subreg (<VCONQ>mode, operands[1], <MODE>mode);
> +  rtx op2 = REG_P (operands[2])
> +           ? lowpart_subreg (<VCONQ>mode, operands[2], <MODE>mode)
> +           : gen_const_vec_duplicate (<VCONQ>mode, operands[2]);
> +  emit_insn (gen_copysign3 (<VCONQ>mode, tmp, op1, op2));
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, tmp, <VCONQ>mode));
>    DONE;
>  }
>  )

If I understand correctly, the idea here is to bitcast to a vector,
do a vector logical operation, and bitcast back.  That is, we
ultimately generate:

(define_insn "aarch64_simd_bsl<mode>_internal<vczle><vczbe>"
  [(set (match_operand:VDQ_I 0 "register_operand")
        (xor:VDQ_I
           (and:VDQ_I
             (xor:VDQ_I
               (match_operand:<V_INT_EQUIV> 3 "register_operand")
               (match_operand:VDQ_I 2 "register_operand"))
             (match_operand:VDQ_I 1 "register_operand"))
          (match_dup:<V_INT_EQUIV> 3)
        ))]
  "TARGET_SIMD"
  {@ [ cons: =0 , 1 , 2 , 3  ]
     [ w        , 0 , w , w  ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype>
     [ w        , w , w , 0  ] bit\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>
     [ w        , w , 0 , w  ] bif\t%0.<Vbtype>, %3.<Vbtype>, %1.<Vbtype>
  }
  [(set_attr "type" "neon_bsl<q>")]
)

wrapped in scalar-to-vector and vector-to-scalar operations.
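
In ACLE intrinsics, the double case would amount to something like
the sketch below (illustrative only; the expander of course works
directly on RTL rather than going through intrinsics):

#include <arm_neon.h>

/* copysign (x, y): take the sign bit from y and everything else from
   x, doing the select as a vector BSL and using only lane 0 of the
   result.  */
double
copysign_via_vector (double x, double y)
{
  float64x2_t vx = vdupq_n_f64 (x);
  float64x2_t vy = vdupq_n_f64 (y);
  uint64x2_t sign = vdupq_n_u64 (0x8000000000000000ULL);
  float64x2_t r = vbslq_f64 (sign, vy, vx);
  return vgetq_lane_f64 (r, 0);
}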

The problem is that this process is entirely transparent to the RTL
optimisers, so they could easily convert this into a scalar integer
operation.  (Similar things happened with the x86 STV pass.)
That might not be a problem yet, but I think the concern raised
in the existing comment is still valid.
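
To make that concern concrete: if the optimisers do rewrite it as a
scalar integer operation, we effectively get the C below, where each
bitcast becomes an inter-bank FMOV, i.e. the three cross-file copies
the old comment warned about (the registers in the comments are
illustrative):

#include <string.h>

double
copysign_scalar_int (double x, double y)
{
  unsigned long long xi, yi, ri;
  memcpy (&xi, &x, sizeof xi);   /* fmov x0, d0 */
  memcpy (&yi, &y, sizeof yi);   /* fmov x1, d1 */
  /* Bit-field insert on the integer side.  */
  ri = (xi & 0x7fffffffffffffffULL) | (yi & 0x8000000000000000ULL);
  memcpy (&x, &ri, sizeof x);    /* fmov d0, x0 */
  return x;
}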

So my preference would be to keep the existing scheme for now,
unless the problem can't be solved that way.

> -(define_insn "copysign<GPF:mode>3_insn"
> -  [(set (match_operand:GPF 0 "register_operand")
> -     (unspec:GPF [(match_operand:GPF 1 "register_operand")
> -                  (match_operand:GPF 2 "register_operand")
> -                  (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> -      UNSPEC_COPYSIGN))]
> -  "TARGET_SIMD"
> -  {@ [ cons: =0 , 1 , 2 , 3 ; attrs: type  ]
> -     [ w        , w , w , 0 ; neon_bsl<q>  ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>
> -     [ w        , 0 , w , w ; neon_bsl<q>  ] bit\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype>
> -     [ w        , w , 0 , w ; neon_bsl<q>  ] bif\t%0.<Vbtype>, %1.<Vbtype>, %3.<Vbtype>
> -     [ r        , r , 0 , X ; bfm          ] bfxil\t%<w1>0, %<w1>1, #0, <sizem1>

I agree that this final alternative is problematic though.  A standalone
patch to remove that alternative is OK for trunk and backports.  If you
hit an ICE with it, it would be nice to have a testcase too.

Thanks,
Richard

> -  }
> -)
> -
> -
> -;; For xorsign (x, y), we want to generate:
> +;; For xorsignf (x, y), we want to generate:
>  ;;
> -;; LDR   d2, #1<<63
> -;; AND   v3.8B, v1.8B, v2.8B
> -;; EOR   v0.8B, v0.8B, v3.8B
> +;;   movi    v31.4s, 0x80, lsl 24
> +;;   and     v31.16b, v31.16b, v1.16b
> +;;   eor     v0.16b, v31.16b, v0.16b
>  ;;
>  
>  (define_expand "@xorsign<mode>3"
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_3.c b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..be48682420f1ff84e80af9efd9d11f64bd6e8052
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +float f1 (float x, float y)
> +{
> +  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
> +}
> +
> +double f2 (double x, double y)
> +{
> +  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
> +}
> +
> +/* { dg-final { scan-assembler-times "movi\t" 2 } } */
> +/* { dg-final { scan-assembler-not "copysign\tw" } } */
> +/* { dg-final { scan-assembler-not "dup\tw" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_4.c b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f3cec2fc9c21a4eaa3b6556479aeb15c04358a1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8-a+sve" } */
> +
> +float f1 (float x, float y)
> +{
> +  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
> +}
> +
> +double f2 (double x, double y)
> +{
> +  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
> +}
> +
> +/* { dg-final { scan-assembler-times "movi\t" 1 } } */
> +/* { dg-final { scan-assembler-times "mov\tz" 1 } } */
> +/* { dg-final { scan-assembler-not "copysign\tw" } } */
> +/* { dg-final { scan-assembler-not "dup\tw" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> index 18d10ee834d5d9b4361d890447060e78f09d3a73..1544bc5f1a736e95dd8bd2c608405aebb54ded1f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -9,7 +9,7 @@
>  
>  /*
>  ** f1:
> -**   orr     v[0-9]+.2s, #?128, lsl #?24
> +**   orr     v[0-9]+.[24]s, #?128, lsl #?24
>  **   ret
>  */
>  float32_t f1 (float32_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> index a8b27199ff83d0eebadfc7dcf03f94e1229d76b8..1ebdc6aaeb102da25ad561b24641e72a652175fa 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> @@ -6,7 +6,7 @@
>  
>  /*
>  ** t1:
> -**   orr     z[0-9]+.s, z[0-9]+.s, #-2147483648
> +**   orr     v0.2s, #?128, lsl #?24
>  **   ret
>  */
>  float32x2_t t1 (float32x2_t a)
> @@ -16,7 +16,7 @@ float32x2_t t1 (float32x2_t a)
>  
>  /*
>  ** t2:
> -**   orr     z[0-9]+.s, z[0-9]+.s, #-2147483648
> +**   orr     v0.4s, #?128, lsl #?24
>  **   ret
>  */
>  float32x4_t t2 (float32x4_t a)
> @@ -26,7 +26,7 @@ float32x4_t t2 (float32x4_t a)
>  
>  /*
>  ** t3:
> -**   orr     z[0-9]+.d, z[0-9]+.d, #-9223372036854775808
> +**   orr     z[0-9]+.d, z[0-9]+.d, -9223372036854775808
>  **   ret
>  */
>  float64x2_t t3 (float64x2_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> index 19a7695e605bc8aced486a9c450d1cdc6be4691a..122152c0ebe4ea6840e418e75a2cadbfc9b0aba4 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> @@ -7,7 +7,7 @@
>  
>  /*
>  ** f1:
> -**   orr     z0.s, z0.s, #-2147483648
> +**   orr     v0.4s, #?128, lsl #?24
>  **   ret
>  */
>  float32_t f1 (float32_t a)
> @@ -17,7 +17,7 @@ float32_t f1 (float32_t a)
>  
>  /*
>  ** f2:
> -**   orr     z0.d, z0.d, #-9223372036854775808
> +**   orr     z0.d, z0.d, -9223372036854775808
>  **   ret
>  */
>  float64_t f2 (float64_t a)
