Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> This adds an implementation for masked copysign along with an optimized
> pattern for masked copysign (x, -1).
It feels like we're ending up with a lot of AArch64-specific code that
just hard-codes the observation that changing the sign is equivalent to
changing the top bit.  We then need to make sure that we choose the best
way of changing the top bit for any given situation.

Hard-coding the -1/negative case is one instance of that.  But it looks
like we also fail to use the best sequence for SVE2.  E.g.
[https://godbolt.org/z/ajh3MM5jv]:

#include <stdint.h>

void f(double *restrict a, double *restrict b) {
    for (int i = 0; i < 100; ++i)
      a[i] = __builtin_copysign(a[i], b[i]);
}

void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
    for (int i = 0; i < 100; ++i)
      a[i] = (a[i] & ~c) | (b[i] & c);
}

gives:

f:
        mov     x2, 0
        mov     w3, 100
        whilelo p7.d, wzr, w3
.L2:
        ld1d    z30.d, p7/z, [x0, x2, lsl 3]
        ld1d    z31.d, p7/z, [x1, x2, lsl 3]
        and     z30.d, z30.d, #0x7fffffffffffffff
        and     z31.d, z31.d, #0x8000000000000000
        orr     z31.d, z31.d, z30.d
        st1d    z31.d, p7, [x0, x2, lsl 3]
        incd    x2
        whilelo p7.d, w2, w3
        b.any   .L2
        ret
g:
        mov     x3, 0
        mov     w4, 100
        mov     z29.d, x2
        whilelo p7.d, wzr, w4
.L6:
        ld1d    z30.d, p7/z, [x0, x3, lsl 3]
        ld1d    z31.d, p7/z, [x1, x3, lsl 3]
        bsl     z31.d, z31.d, z30.d, z29.d
        st1d    z31.d, p7, [x0, x3, lsl 3]
        incd    x3
        whilelo p7.d, w3, w4
        b.any   .L6
        ret

I saw that you originally tried to do this in match.pd and that the
decision was to fold to copysign instead.  But perhaps there's a
compromise where isel does something with the (new) copysign canonical
form?  I.e. could we go with your new version of the match.pd patch,
and add some isel stuff as a follow-on?

Not saying no to this patch, just thought that the above was worth
considering.

[I agree with Andrew's comments FWIW.]

Thanks,
Richard

>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
>
> gcc/testsuite/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* gcc.target/aarch64/sve/fneg-abs_5.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
>    }
>  )
>  
> +(define_expand "cond_copysign<mode>"
> +  [(match_operand:SVE_FULL_F 0 "register_operand")
> +   (match_operand:<VPRED> 1 "register_operand")
> +   (match_operand:SVE_FULL_F 2 "register_operand")
> +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +  "TARGET_SVE"
> +  {
> +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> +
> +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
> +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
> +
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +					   HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[3]))
> +      {
> +	auto r0
> +	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
> +	if (-1 == real_to_integer (r0))
> +	  {
> +	    arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> +	    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> +						  arg3, arg4));
> +	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +	    DONE;
> +	  }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[3]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> +    emit_insn (gen_and<v_int_equiv>3
> +	       (mant, arg2,
> +		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +						   ~(HOST_WIDE_INT_M1U
> +						     << bits))));
> +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> +					  arg4));
> +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +    DONE;
> +  }
> +)
> +
>  (define_expand "xorsign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +**	...
> +**	orr	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**	...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +    if (a[i] > n)
> +      a[i] = -fabsf (a[i]);
> +    else
> +      a[i] = n;
> +}
> +
> +/*
> +** f2:
> +**	...
> +**	orr	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +**	...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +    if (a[i] > n)
> +      a[i] = -fabs (a[i]);
> +    else
> +      a[i] = n;
> +}
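P.S. For anyone skimming the thread, here is a minimal scalar sketch of
the "changing the sign is equivalent to changing the top bit" observation
that the comments above rely on.  It is an illustration only, not part of
the patch, and the helper name is made up:

#include <stdint.h>
#include <string.h>

static double
copysign_via_top_bit (double x, double y)
{
  uint64_t ix, iy;

  /* View the doubles as raw 64-bit patterns.  */
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iy, &y, sizeof iy);

  /* Keep everything except the top (sign) bit of x and take only the
     top bit of y: two ANDs and an ORR, as in f above.  A bit-select
     such as the single BSL generated for g above can do the same job
     in one instruction.  */
  uint64_t sign = UINT64_C (1) << 63;
  uint64_t res = (ix & ~sign) | (iy & sign);

  memcpy (&x, &res, sizeof x);
  return x;
}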