On Fri, Sep 16, 2022 at 2:55 AM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > For ifloor/lfloor/iceil/lceil/irint/lrint/iround/lround, when the size of > in_mode is not equal to the size of out_mode, the vectorizer doesn't go the internal fn > way, so that part is still left in ix86_builtin_vectorized_function. > > Remove the other builtins and add corresponding expanders. > Note the patch just refactors the code; it doesn't solve the related case > in the PR, which needs an extra expander for 64-bit vectors. > > Bootstrapped and regtested on x86-64-pc-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR target/106910 > * config/i386/i386-builtins.cc > (ix86_builtin_vectorized_function): Modernized with > corresponding expanders. > * config/i386/sse.md (lrint<mode><sseintvecmodelower>2): New > expander. > (floor<mode>2): Ditto. > (lfloor<mode><sseintvecmodelower>2): Ditto. > (ceil<mode>2): Ditto. > (lceil<mode><sseintvecmodelower>2): Ditto. > (btrunc<mode>2): Ditto. > (lround<mode><sseintvecmodelower>2): Ditto. > (exp2<mode>2): Ditto.
LGTM. Thanks, Uros. > --- > gcc/config/i386/i386-builtins.cc | 185 +------------------------------ > gcc/config/i386/sse.md | 80 +++++++++++++ > 2 files changed, 84 insertions(+), 181 deletions(-) > > diff --git a/gcc/config/i386/i386-builtins.cc > b/gcc/config/i386/i386-builtins.cc > index 6a04fb57e65..af2faee245b 100644 > --- a/gcc/config/i386/i386-builtins.cc > +++ b/gcc/config/i386/i386-builtins.cc > @@ -1540,21 +1540,16 @@ ix86_builtin_vectorized_function (unsigned int fn, > tree type_out, > > switch (fn) > { > - CASE_CFN_EXP2: > - if (out_mode == SFmode && in_mode == SFmode) > - { > - if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_EXP2PS); > - } > - break; > - > CASE_CFN_IFLOOR: > CASE_CFN_LFLOOR: > - CASE_CFN_LLFLOOR: > /* The round insn does not trap on denormals. */ > if (flag_trapping_math || !TARGET_SSE4_1) > break; > > + /* PR106910, currently vectorizer doesn't go direct internal fn way > + when out_n != in_n, so let's still keep this. > + Otherwise, it relies on expander of > + lceilmn2/lfloormn2/lroundmn2/lrintmn2. */ > if (out_mode == SImode && in_mode == DFmode) > { > if (out_n == 4 && in_n == 2) > @@ -1564,20 +1559,10 @@ ix86_builtin_vectorized_function (unsigned int fn, > tree type_out, > else if (out_n == 16 && in_n == 8) > return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); > } > - if (out_mode == SImode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); > - } > break; > > CASE_CFN_ICEIL: > CASE_CFN_LCEIL: > - CASE_CFN_LLCEIL: > /* The round insn does not trap on denormals. 
*/ > if (flag_trapping_math || !TARGET_SSE4_1) > break; > @@ -1591,20 +1576,10 @@ ix86_builtin_vectorized_function (unsigned int fn, > tree type_out, > else if (out_n == 16 && in_n == 8) > return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); > } > - if (out_mode == SImode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); > - } > break; > > CASE_CFN_IRINT: > CASE_CFN_LRINT: > - CASE_CFN_LLRINT: > if (out_mode == SImode && in_mode == DFmode) > { > if (out_n == 4 && in_n == 2) > @@ -1614,20 +1589,10 @@ ix86_builtin_vectorized_function (unsigned int fn, > tree type_out, > else if (out_n == 16 && in_n == 8) > return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); > } > - if (out_mode == SImode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); > - } > break; > > CASE_CFN_IROUND: > CASE_CFN_LROUND: > - CASE_CFN_LLROUND: > /* The round insn does not trap on denormals. 
*/ > if (flag_trapping_math || !TARGET_SSE4_1) > break; > @@ -1641,150 +1606,8 @@ ix86_builtin_vectorized_function (unsigned int fn, > tree type_out, > else if (out_n == 16 && in_n == 8) > return ix86_get_builtin > (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); > } > - if (out_mode == SImode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); > - } > break; > > - CASE_CFN_FLOOR: > - /* The round insn does not trap on denormals. */ > - if (flag_trapping_math || !TARGET_SSE4_1) > - break; > - > - if (out_mode == DFmode && in_mode == DFmode) > - { > - if (out_n == 2 && in_n == 2) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPD); > - else if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); > - } > - if (out_mode == SFmode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPS); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); > - } > - if (out_mode == HFmode && in_mode == HFmode) > - { > - /* V8HF/V16HF is supported in ix86_vector_mode_supported_p > - under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. 
*/ > - if (out_n < 32 && !TARGET_AVX512VL) > - break; > - > - if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPH); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPH256); > - else if (out_n == 32 && in_n == 32) > - return ix86_get_builtin (IX86_BUILTIN_FLOORPH512); > - } > - break; > - > - CASE_CFN_CEIL: > - /* The round insn does not trap on denormals. */ > - if (flag_trapping_math || !TARGET_SSE4_1) > - break; > - > - if (out_mode == DFmode && in_mode == DFmode) > - { > - if (out_n == 2 && in_n == 2) > - return ix86_get_builtin (IX86_BUILTIN_CEILPD); > - else if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_CEILPD256); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_CEILPD512); > - } > - if (out_mode == SFmode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_CEILPS); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_CEILPS256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_CEILPS512); > - } > - if (out_mode == HFmode && in_mode == HFmode) > - { > - /* V8HF/V16HF is supported in ix86_vector_mode_supported_p > - under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */ > - if (out_n < 32 && !TARGET_AVX512VL) > - break; > - > - if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_CEILPH); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_CEILPH256); > - else if (out_n == 32 && in_n == 32) > - return ix86_get_builtin (IX86_BUILTIN_CEILPH512); > - } > - break; > - > - CASE_CFN_TRUNC: > - /* The round insn does not trap on denormals. 
*/ > - if (flag_trapping_math || !TARGET_SSE4_1) > - break; > - > - if (out_mode == DFmode && in_mode == DFmode) > - { > - if (out_n == 2 && in_n == 2) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); > - else if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); > - } > - if (out_mode == SFmode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); > - else if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); > - } > - if (out_mode == HFmode && in_mode == HFmode) > - { > - /* V8HF/V16HF is supported in ix86_vector_mode_supported_p > - under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */ > - if (out_n < 32 && !TARGET_AVX512VL) > - break; > - > - if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPH); > - else if (out_n == 16 && in_n == 16) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPH256); > - else if (out_n == 32 && in_n == 32) > - return ix86_get_builtin (IX86_BUILTIN_TRUNCPH512); > - } > - break; > - > - CASE_CFN_FMA: > - if (out_mode == DFmode && in_mode == DFmode) > - { > - if (out_n == 2 && in_n == 2) > - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); > - } > - if (out_mode == SFmode && in_mode == SFmode) > - { > - if (out_n == 4 && in_n == 4) > - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); > - if (out_n == 8 && in_n == 8) > - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); > - } > - break; > > default: > break; > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index d535c0af043..dd6c94dce05 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -321,6 +321,11 @@ (define_mode_iterator VF 
> [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) > > +(define_mode_iterator VF1_VF2_AVX512DQ > + [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > + (V8DF "TARGET_AVX512DQ") (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL") > + (V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")]) > + > (define_mode_iterator VFH > [(V32HF "TARGET_AVX512FP16") > (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") > @@ -23177,6 +23182,14 @@ (define_expand "rint<mode>2" > "TARGET_SSE4_1" > "operands[2] = GEN_INT (ROUND_MXCSR);") > > +;; Note vcvtpd2qq require avx512dq for all vector lengths. > +(define_expand "lrint<mode><sseintvecmodelower>2" > + [(set (match_operand:<sseintvecmode> 0 "register_operand") > + (unspec:<sseintvecmode> > + [(match_operand:VF1_VF2_AVX512DQ 1 "register_operand")] > + UNSPEC_FIX_NOTRUNC))] > + "TARGET_SSE2") > + > (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>" > [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") > (unspec:VF_128_256 > @@ -23316,6 +23329,55 @@ (define_insn "*sse4_1_round<ssescalarmodesuffix>" > (set_attr "prefix" "orig,orig,vex,evex") > (set_attr "mode" "<MODE>")]) > > +(define_expand "floor<mode>2" > + [(set (match_operand:VFH 0 "register_operand") > + (unspec:VFH > + [(match_operand:VFH 1 "vector_operand") > + (match_dup 2)] > + UNSPEC_ROUND))] > + "TARGET_SSE4_1 && !flag_trapping_math" > + "operands[2] = GEN_INT (ROUND_FLOOR);") > + > +(define_expand "lfloor<mode><sseintvecmodelower>2" > + [(match_operand:<sseintvecmode> 0 "register_operand") > + (match_operand:VF1_VF2_AVX512DQ 1 "register_operand")] > + "TARGET_SSE4_1 && !flag_trapping_math" > +{ > + rtx tmp = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_floor<mode>2 (tmp, operands[1])); > + emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp)); > + DONE; > +}) > + > +(define_expand "ceil<mode>2" > + [(set (match_operand:VFH 0 "register_operand") > + (unspec:VFH > + [(match_operand:VFH 1 
"vector_operand") > + (match_dup 2)] > + UNSPEC_ROUND))] > + "TARGET_SSE4_1 && !flag_trapping_math" > + "operands[2] = GEN_INT (ROUND_CEIL);") > + > +(define_expand "lceil<mode><sseintvecmodelower>2" > + [(match_operand:<sseintvecmode> 0 "register_operand") > + (match_operand:VF1_VF2_AVX512DQ 1 "register_operand")] > + "TARGET_SSE4_1 && !flag_trapping_math" > +{ > + rtx tmp = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_ceil<mode>2 (tmp, operands[1])); > + emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp)); > + DONE; > +}) > + > +(define_expand "btrunc<mode>2" > + [(set (match_operand:VFH 0 "register_operand") > + (unspec:VFH > + [(match_operand:VFH 1 "vector_operand") > + (match_dup 2)] > + UNSPEC_ROUND))] > + "TARGET_SSE4_1 && !flag_trapping_math" > + "operands[2] = GEN_INT (ROUND_TRUNC);") > + > (define_expand "round<mode>2" > [(set (match_dup 3) > (plus:VF > @@ -23350,6 +23412,17 @@ (define_expand "round<mode>2" > operands[4] = GEN_INT (ROUND_TRUNC); > }) > > +(define_expand "lround<mode><sseintvecmodelower>2" > + [(match_operand:<sseintvecmode> 0 "register_operand") > + (match_operand:VF1_VF2_AVX512DQ 1 "register_operand")] > + "TARGET_SSE4_1 && !flag_trapping_math" > +{ > + rtx tmp = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_round<mode>2 (tmp, operands[1])); > + emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp)); > + DONE; > +}) > + > (define_expand "round<mode>2_sfix" > [(match_operand:<sseintvecmode> 0 "register_operand") > (match_operand:VF1 1 "register_operand")] > @@ -23868,6 +23941,13 @@ (define_insn > "*avx512pf_scatterpf<VI4_256_8_512:mode>df_mask" > (set_attr "prefix" "evex") > (set_attr "mode" "XI")]) > > +(define_expand "exp2<mode>2" > + [(set (match_operand:VF_512 0 "register_operand") > + (unspec:VF_512 > + [(match_operand:VF_512 1 "vector_operand")] > + UNSPEC_EXP2))] > + "TARGET_AVX512ER") > + > (define_insn "avx512er_exp2<mode><mask_name><round_saeonly_name>" > [(set (match_operand:VF_512 0 
"register_operand" "=v") > (unspec:VF_512 > -- > 2.27.0 >