Add UNSPEC_SEL combiner patterns for unpacked FP conversions whose strictness operand is SVE_RELAXED_GP, so that a conversion followed by a SEL is matched as a single merging predicated conversion.
gcc/ChangeLog: * config/aarch64/aarch64-sve.md (*cond_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>_relaxed): New FCVTZ{S,U}/SEL combiner pattern. (*cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx2SI_ONLY:mode>_relaxed): New FCVTZ{S,U}/SEL combiner pattern. (*cond_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>_relaxed): New {S,U}CVTF/SEL combiner pattern. (*cond_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>): New FCVT/SEL combiner pattern. (*cond_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>_relaxed): New FCVT/SEL combiner pattern. * config/aarch64/iterators.md (VNx2SI_ONLY): New mode iterator. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c: New test. * gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c: Likewise. --- gcc/config/aarch64/aarch64-sve.md | 121 ++++++++++++++++++ gcc/config/aarch64/iterators.md | 1 + .../aarch64/sve/unpacked_cond_cvtf_1.c | 47 +++++++ .../aarch64/sve/unpacked_cond_fcvt_1.c | 37 ++++++ .../aarch64/sve/unpacked_cond_fcvtz_1.c | 51 ++++++++ 5 files changed, 257 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index f8f8d2f011a..7484aeeb161 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -9612,6 +9612,31 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. 
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>_relaxed" + [(set (match_operand:SVE_HSDI 0 "register_operand") + (unspec:SVE_HSDI + [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand") + (unspec:SVE_HSDI + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_PARTIAL_F 2 "register_operand")] + SVE_COND_FCVTI) + (match_operand:SVE_HSDI 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE + && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype> + [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + (define_insn "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_strict" [(set (match_operand:SVE_FULL_HSDI 0 "register_operand") (unspec:SVE_FULL_HSDI @@ -9665,6 +9690,29 @@ } ) +(define_insn_and_rewrite "*cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx2SI_ONLY:mode>_relaxed" + [(set (match_operand:VNx2SI_ONLY 0 "register_operand") + (unspec:VNx2SI_ONLY + [(match_operand:VNx2BI 1 "register_operand") + (unspec:VNx2SI_ONLY + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:VNx2DF_ONLY 2 "register_operand")] + SVE_COND_FCVTI) + (match_operand:VNx2SI_ONLY 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype> + [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<VNx2DF_ONLY:Vetype>, %1/z, %2.<VNx2DF_ONLY:Vetype>\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype> + [ 
?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [INT<-FP] Packs ;; ------------------------------------------------------------------------- @@ -9816,6 +9864,31 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. +(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>_relaxed" + [(set (match_operand:SVE_PARTIAL_F 0 "register_operand") + (unspec:SVE_PARTIAL_F + [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand") + (unspec:SVE_PARTIAL_F + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_HSDI 2 "register_operand")] + SVE_COND_ICVTF) + (match_operand:SVE_PARTIAL_F 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE + && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ &w , Upl , w , 0 ; * ] <su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype> + [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + (define_insn "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_strict" [(set (match_operand:SVE_FULL_F 0 "register_operand") (unspec:SVE_FULL_F @@ -10025,6 +10098,30 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. 
+(define_insn_and_rewrite "*cond_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>" + [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand") + (unspec:SVE_PARTIAL_HSF + [(match_operand:<SVE_SDF:VPRED> 1 "register_operand") + (unspec:SVE_PARTIAL_HSF + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_SDF 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:SVE_PARTIAL_HSF 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype> + [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [FP<-FP] Packs (bfloat16) ;; ------------------------------------------------------------------------- @@ -10218,6 +10315,30 @@ } ) +;; As above, for pairs that are used by the auto-vectorizer only. 
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>_relaxed" + [(set (match_operand:SVE_SDF 0 "register_operand") + (unspec:SVE_SDF + [(match_operand:<SVE_SDF:VPRED> 1 "register_operand") + (unspec:SVE_SDF + [(match_operand 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_PARTIAL_HSF 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:SVE_SDF 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype> + [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype> + } + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } +) + ;; ------------------------------------------------------------------------- ;; ---- [PRED<-PRED] Packs ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 3a5ddb52776..41e483bb80e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -463,6 +463,7 @@ (define_mode_iterator VNx8SI_ONLY [VNx8SI]) (define_mode_iterator VNx8SF_ONLY [VNx8SF]) (define_mode_iterator VNx8DI_ONLY [VNx8DI]) +(define_mode_iterator VNx2SI_ONLY [VNx2SI]) (define_mode_iterator VNx4SI_ONLY [VNx4SI]) (define_mode_iterator VNx4SF_ONLY [VNx4SF]) (define_mode_iterator VNx2DI_ONLY [VNx2DI]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c new file mode 100644 index 00000000000..8f69232f2cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c @@ -0,0 +1,47 @@ 
+/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define COND_CVT(TYPE0, TYPE1, TYPE2, COUNT) \ + void \ + test_##TYPE0##_##TYPE1##_##TYPE2 (TYPE0 *__restrict out, \ + TYPE1 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE2 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? (TYPE0)a[i] : b[i]; \ + } + +#define TEST_CVTF(PFX, T) \ + T (_Float16, PFX##int16_t, uint64_t, 32) \ + T (_Float16, PFX##int16_t, uint32_t, 64) \ + T (_Float16, PFX##int32_t, uint64_t, 32) \ + T (_Float16, PFX##int32_t, uint32_t, 64) \ + T (_Float16, PFX##int64_t, uint64_t, 32) \ + T (float, PFX##int32_t, uint64_t, 32) \ + T (float, PFX##int64_t, uint64_t, 32) + +#define TEST_ALL(T) \ + TEST_CVTF (, T) \ + TEST_CVTF (u, T) + +TEST_ALL (COND_CVT) + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c new file mode 
100644 index 00000000000..e37ea18ff85 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define COND_CVT(TYPE0, TYPE1, TYPE2, COUNT) \ + void \ + test_##TYPE0##_##TYPE1##_##TYPE2 (TYPE0 *__restrict out, \ + TYPE1 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE2 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? (TYPE0)a[i] : b[i]; \ + } + +#define TEST_FCVT(T) \ + T (_Float16, float, uint64_t, 32) \ + T (_Float16, float, uint32_t, 64) \ + T (_Float16, double, uint64_t, 32) \ + T (float, double, uint64_t, 32) \ + T (float, _Float16, uint64_t, 32) \ + T (float, _Float16, uint32_t, 64) \ + T (double, _Float16, uint64_t,32) \ + T (double, float, uint64_t, 32) + +TEST_FCVT (COND_CVT) + +/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c new file mode 100644 index 00000000000..d52a044bd9a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */ + +#include <stdint.h> + +#define COND_CVT(TYPE0, TYPE1, TYPE2, COUNT) \ + void 
\ + test_##TYPE0##_##TYPE1##_##TYPE2 (TYPE0 *__restrict out, \ + TYPE1 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE2 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? (TYPE0)a[i] : b[i]; \ + } + +#define TEST_FCVTZ(PFX, T) \ + T (PFX##int16_t, _Float16, uint64_t, 32) \ + T (PFX##int16_t, _Float16, uint32_t, 64) \ + T (PFX##int32_t, _Float16, uint64_t, 32) \ + T (PFX##int32_t, _Float16, uint32_t, 64) \ + T (PFX##int64_t, _Float16, uint64_t, 32) \ + T (PFX##int32_t, float, uint64_t, 32) \ + T (PFX##int64_t, float, uint64_t, 32) \ + T (PFX##int32_t, double, uint64_t, 32) + +#define TEST_ALL(T) \ + TEST_FCVTZ (, T) \ + TEST_FCVTZ (u, T) + +TEST_ALL (COND_CVT) + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ -- 2.34.1