Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi,
>
> As subject, this patch rewrites the vcvtx Neon intrinsics to use RTL builtins
> rather than inline assembly code, allowing for better scheduling and
> optimization.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu and
> aarch64_be-none-elf - no issues.
>
> Ok for master?
OK, thanks. Richard > Thanks, > Jonathan > > --- > > gcc/ChangeLog: > > 2021-02-18 Jonathan Wright <jonathan.wri...@arm.com> > > * config/aarch64/aarch64-simd-builtins.def: Add > float_trunc_rodd builtin generator macros. > * config/aarch64/aarch64-simd.md (aarch64_float_trunc_rodd_df): > Define. > (aarch64_float_trunc_rodd_lo_v2sf): Define. > (aarch64_float_trunc_rodd_hi_v4sf_le): Define. > (aarch64_float_trunc_rodd_hi_v4sf_be): Define. > (aarch64_float_trunc_rodd_hi_v4sf): Define. > * config/aarch64/arm_neon.h (vcvtx_f32_f64): Use RTL builtin > instead of inline asm. > (vcvtx_high_f32_f64): Likewise. > (vcvtxd_f32_f64): Likewise. > * config/aarch64/iterators.md: Add FCVTXN unspec. > > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def > b/gcc/config/aarch64/aarch64-simd-builtins.def > index > 52ae398858db1ec506a97376e7ccc1153aa210c5..e4a9e8740c5f87f1d7feb8f6a9725d7def1a0323 > 100644 > --- a/gcc/config/aarch64/aarch64-simd-builtins.def > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def > @@ -634,6 +634,10 @@ > VAR1 (UNOP, float_extend_lo_, 0, FP, v4sf) > BUILTIN_VDF (UNOP, float_truncate_lo_, 0, FP) > > + VAR1 (UNOP, float_trunc_rodd_, 0, FP, df) > + VAR1 (UNOP, float_trunc_rodd_lo_, 0, FP, v2sf) > + VAR1 (BINOP, float_trunc_rodd_hi_, 0, FP, v4sf) > + > /* Implemented by aarch64_ld1<VALL_F16:mode>. */ > BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) > VAR1(STORE1P, ld1, 0, ALL, v2di) > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 207d644487e77cd66d933dc7860a59e57fee523d..fa4972e769ee79dfd369223e867ed18beb6dfb2c > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -3185,6 +3185,60 @@ > > ;; Float narrowing operations. 
> > +(define_insn "aarch64_float_trunc_rodd_df" > + [(set (match_operand:SF 0 "register_operand" "=w") > + (unspec:SF [(match_operand:DF 1 "register_operand" "w")] > + UNSPEC_FCVTXN))] > + "TARGET_SIMD" > + "fcvtxn\\t%s0, %d1" > + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] > +) > + > +(define_insn "aarch64_float_trunc_rodd_lo_v2sf" > + [(set (match_operand:V2SF 0 "register_operand" "=w") > + (unspec:V2SF [(match_operand:V2DF 1 "register_operand" "w")] > + UNSPEC_FCVTXN))] > + "TARGET_SIMD" > + "fcvtxn\\t%0.2s, %1.2d" > + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] > +) > + > +(define_insn "aarch64_float_trunc_rodd_hi_v4sf_le" > + [(set (match_operand:V4SF 0 "register_operand" "=w") > + (vec_concat:V4SF > + (match_operand:V2SF 1 "register_operand" "0") > + (unspec:V2SF [(match_operand:V2DF 2 "register_operand" "w")] > + UNSPEC_FCVTXN)))] > + "TARGET_SIMD && !BYTES_BIG_ENDIAN" > + "fcvtxn2\\t%0.4s, %2.2d" > + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] > +) > + > +(define_insn "aarch64_float_trunc_rodd_hi_v4sf_be" > + [(set (match_operand:V4SF 0 "register_operand" "=w") > + (vec_concat:V4SF > + (unspec:V2SF [(match_operand:V2DF 2 "register_operand" "w")] > + UNSPEC_FCVTXN) > + (match_operand:V2SF 1 "register_operand" "0")))] > + "TARGET_SIMD && BYTES_BIG_ENDIAN" > + "fcvtxn2\\t%0.4s, %2.2d" > + [(set_attr "type" "neon_fp_cvt_narrow_d_q")] > +) > + > +(define_expand "aarch64_float_trunc_rodd_hi_v4sf" > + [(match_operand:V4SF 0 "register_operand") > + (match_operand:V2SF 1 "register_operand") > + (match_operand:V2DF 2 "register_operand")] > + "TARGET_SIMD" > +{ > + rtx (*gen) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN > + ? 
gen_aarch64_float_trunc_rodd_hi_v4sf_be > + : gen_aarch64_float_trunc_rodd_hi_v4sf_le; > + emit_insn (gen (operands[0], operands[1], operands[2])); > + DONE; > +} > +) > + > (define_insn "aarch64_float_truncate_lo_<mode>" > [(set (match_operand:VDF 0 "register_operand" "=w") > (float_truncate:VDF > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > 5ff11e7ea4a9a722c66a37ee65319125313df436..17e059efb80fa86a8a32127ace4fc7f43e2040a8 > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -7014,36 +7014,21 @@ __extension__ extern __inline float32x2_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vcvtx_f32_f64 (float64x2_t __a) > { > - float32x2_t __result; > - __asm__ ("fcvtxn %0.2s,%1.2d" > - : "=w"(__result) > - : "w"(__a) > - : /* No clobbers */); > - return __result; > + return __builtin_aarch64_float_trunc_rodd_lo_v2sf (__a); > } > > __extension__ extern __inline float32x4_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vcvtx_high_f32_f64 (float32x2_t __a, float64x2_t __b) > { > - float32x4_t __result; > - __asm__ ("fcvtxn2 %0.4s,%1.2d" > - : "=w"(__result) > - : "w" (__b), "0"(__a) > - : /* No clobbers */); > - return __result; > + return __builtin_aarch64_float_trunc_rodd_hi_v4sf (__a, __b); > } > > __extension__ extern __inline float32_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vcvtxd_f32_f64 (float64_t __a) > { > - float32_t __result; > - __asm__ ("fcvtxn %s0,%d1" > - : "=w"(__result) > - : "w"(__a) > - : /* No clobbers */); > - return __result; > + return __builtin_aarch64_float_trunc_rodd_df (__a); > } > > __extension__ extern __inline float32x2_t > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > 65e728acb3bc0cbcc8be29d330bc1ee66ef9a504..8b4c1bd1f72e43205ea33ec094b029026505b270 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ 
-864,6 +864,7 @@ > UNSPEC_BFCVTN ; Used in aarch64-simd.md. > UNSPEC_BFCVTN2 ; Used in aarch64-simd.md. > UNSPEC_BFCVT ; Used in aarch64-simd.md. > + UNSPEC_FCVTXN ; Used in aarch64-simd.md. > ]) > > ;; ------------------------------------------------------------------