Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> This adds an implementation for masked copysign along with an optimized
> pattern for masked copysign (x, -1).

It feels like we're ending up with a lot of AArch64-specific code that
just hard-codes the observation that changing the sign is equivalent to
changing the top bit.  We then need to make sure that we choose the best
way of changing the top bit for any given situation.
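
To spell out that observation, a minimal scalar sketch (the helper names
are just for illustration; this is the usual IEEE bit-pattern trick
rather than anything target-specific):

#include <stdint.h>
#include <string.h>

/* View a double as its raw bit pattern and back.  */
static uint64_t bits (double x) { uint64_t u; memcpy (&u, &x, sizeof u); return u; }
static double from_bits (uint64_t u) { double x; memcpy (&x, &u, sizeof x); return x; }

/* copysign (x, y): everything but the top bit from x, the top bit from y.  */
double copysign_bits (double x, double y)
{
  uint64_t sign = UINT64_C (1) << 63;
  return from_bits ((bits (x) & ~sign) | (bits (y) & sign));
}

/* copysign (x, -1) just forces the top bit on, i.e. a single orr.  */
double copysign_minus1 (double x)
{
  return from_bits (bits (x) | (UINT64_C (1) << 63));
}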

Hard-coding the -1/negative case is one instance of that.  But it looks
like we also fail to use the best sequence for SVE2.  E.g.
[https://godbolt.org/z/ajh3MM5jv]:

#include <stdint.h>

void f(double *restrict a, double *restrict b) {
    for (int i = 0; i < 100; ++i)
        a[i] = __builtin_copysign(a[i], b[i]);
}

void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
    for (int i = 0; i < 100; ++i)
        a[i] = (a[i] & ~c) | (b[i] & c);
}

gives:

f:
        mov     x2, 0
        mov     w3, 100
        whilelo p7.d, wzr, w3
.L2:
        ld1d    z30.d, p7/z, [x0, x2, lsl 3]
        ld1d    z31.d, p7/z, [x1, x2, lsl 3]
        and     z30.d, z30.d, #0x7fffffffffffffff
        and     z31.d, z31.d, #0x8000000000000000
        orr     z31.d, z31.d, z30.d
        st1d    z31.d, p7, [x0, x2, lsl 3]
        incd    x2
        whilelo p7.d, w2, w3
        b.any   .L2
        ret
g:
        mov     x3, 0
        mov     w4, 100
        mov     z29.d, x2
        whilelo p7.d, wzr, w4
.L6:
        ld1d    z30.d, p7/z, [x0, x3, lsl 3]
        ld1d    z31.d, p7/z, [x1, x3, lsl 3]
        bsl     z31.d, z31.d, z30.d, z29.d
        st1d    z31.d, p7, [x0, x3, lsl 3]
        incd    x3
        whilelo p7.d, w3, w4
        b.any   .L6
        ret
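
For comparison, roughly the operation I'd hope the f loop could use with
+sve2, written with ACLE intrinsics (copysign_bsl is just an illustrative
name, and I'm assuming svbsl (op1, op2, op3) computes
(op1 & op3) | (op2 & ~op3), matching the bsl in g above):

#include <arm_sve.h>
#include <stdint.h>

/* Per-lane copysign (x, y): sign bit from y, everything else from x.
   Needs -march=...+sve2 for svbsl.  */
svfloat64_t copysign_bsl (svfloat64_t x, svfloat64_t y)
{
  svuint64_t sign = svdup_u64 (UINT64_C (1) << 63);
  svuint64_t res = svbsl_u64 (svreinterpret_u64_f64 (y),
                              svreinterpret_u64_f64 (x),
                              sign);
  return svreinterpret_f64_u64 (res);
}

i.e. a single bsl per iteration instead of the and/and/orr triple in f.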

I saw that you originally tried to do this in match.pd and that the
decision was to fold to copysign instead.  But perhaps there's a compromise
where isel does something with the (new) copysign canonical form?
I.e. could we go with your new version of the match.pd patch, and add
some isel stuff as a follow-on?

Not saying no to this patch, just thought that the above was worth
considering.

[I agree with Andrew's comments FWIW.]

Thanks,
Richard

>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       PR tree-optimization/109154
>       * config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
>
> gcc/testsuite/ChangeLog:
>
>       PR tree-optimization/109154
>       * gcc.target/aarch64/sve/fneg-abs_5.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
>    }
>  )
>  
> +(define_expand "cond_copysign<mode>"
> +  [(match_operand:SVE_FULL_F 0 "register_operand")
> +   (match_operand:<VPRED> 1 "register_operand")
> +   (match_operand:SVE_FULL_F 2 "register_operand")
> +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +  "TARGET_SVE"
> +  {
> +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> +
> +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
> +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
> +
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +                                        HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[3]))
> +      {
> +     auto r0
> +       = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
> +     if (-1 == real_to_integer (r0))
> +       {
> +         arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> +         emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> +                                               arg3, arg4));
> +         emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +         DONE;
> +       }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[3]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> +    emit_insn (gen_and<v_int_equiv>3
> +            (mant, arg2,
> +             aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +                                                ~(HOST_WIDE_INT_M1U
> +                                                  << bits))));
> +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> +                                       arg4));
> +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +    DONE;
> +  }
> +)
> +
>  (define_expand "xorsign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +**   ...
> +**   orr     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**   ...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +   if (a[i] > n)
> +     a[i] = -fabsf (a[i]);
> +   else
> +     a[i] = n;
> +}
> +
> +/*
> +** f2:
> +**   ...
> +**   orr     z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +**   ...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +   if (a[i] > n)
> +     a[i] = -fabs (a[i]);
> +   else
> +     a[i] = n;
> +}
