On Fri, Nov 8, 2024 at 10:33 AM liuhongt <hongtao....@intel.com> wrote: > > hw instruction doesn't raise exceptions, turns sNAN into qNAN quietly, > and always rounds to nearest (even). Output denormals are always > flushed to zero and input denormals are always treated as zero. MXCSR > is not consulted nor updated. > W/o native instructions, flag_unsafe_math_optimizations is needed for > the permutation instructions. > Similarly, guard the extend from vector __bf16 to vector float with > !HONOR_NANS (BFmode). > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Any comments? Pushed to trunk. > > gcc/ChangeLog: > > * config/i386/i386.md (truncsfbf2): Add !flag_rounding_math > to the condition, require flag_unsafe_math_optimizations when > native instruction is not available. > * config/i386/mmx.md (truncv2sfv2bf2): Ditto. > (extendv2bfv2sf2): Add !HONOR_NANS (BFmode) to the condition. > * config/i386/sse.md (truncv4sfv4bf2): Add > !flag_rounding_math to the condition, require > flag_unsafe_math_optimizations when native instruction is not > available. > (truncv8sfv8bf2): Ditto. > (truncv16sfv16bf2): Ditto. > (extendv4bfv4sf2): Add !HONOR_NANS (BFmode) to the condition. > (extendv8bfv8sf2): Ditto. > (extendv16bfv16sf2): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512bf16-truncsfbf.c: Add -ffast-math. > * gcc.target/i386/avx512bw-extendbf2sf.c: Ditto. > * gcc.target/i386/avx512bw-truncsfbf.c: Ditto. > * gcc.target/i386/sse2-extendbf2sf.c: Ditto. > * gcc.target/i386/ssse3-truncsfbf.c: Ditto. 
> --- > gcc/config/i386/i386.md | 11 ++++++++++- > gcc/config/i386/mmx.md | 8 ++++++-- > gcc/config/i386/sse.md | 16 ++++++++++++---- > .../gcc.target/i386/avx512bf16-truncsfbf.c | 2 +- > .../gcc.target/i386/avx512bw-extendbf2sf.c | 2 +- > .../gcc.target/i386/avx512bw-truncsfbf.c | 2 +- > gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c | 2 +- > gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c | 2 +- > 8 files changed, 33 insertions(+), 12 deletions(-) > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index c492fe55881..96d5420d9de 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -5694,11 +5694,20 @@ (define_insn "*trunc<mode>hf2" > (set_attr "prefix" "evex") > (set_attr "mode" "HF")]) > > +/* vcvtneps2bf16 doesn't honor SNAN, and turns sNAN into qNAN quietly, > + and it always rounds to even. > + flag_unsafe_math_optimizations is needed for psrld. > + If we don't expect qNaNs nor sNaNs and can assume rounding > + to nearest, we can expand the conversion inline as > + (fromi + 0x7fff + ((fromi >> 16) & 1)) >> 16. 
*/ > (define_insn "truncsfbf2" > [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv") > (float_truncate:BF > (match_operand:SF 1 "register_operand" "0,x,v,Yv")))] > - "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)" > + "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_rounding_math > + && (flag_unsafe_math_optimizations > + || TARGET_AVXNECONVERT > + || (TARGET_AVX512BF16 && TARGET_AVX512VL))" > "@ > psrld\t{$16, %0|%0, 16} > %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1} > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 021ac90ae2a..61a4f4d21ea 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -2998,7 +2998,11 @@ (define_expand "truncv2sfv2bf2" > [(set (match_operand:V2BF 0 "register_operand") > (float_truncate:V2BF > (match_operand:V2SF 1 "nonimmediate_operand")))] > - "TARGET_SSSE3 && TARGET_MMX_WITH_SSE" > + "TARGET_SSSE3 && TARGET_MMX_WITH_SSE > + && !HONOR_NANS (BFmode) && !flag_rounding_math > + && (flag_unsafe_math_optimizations > + || TARGET_AVXNECONVERT > + || (TARGET_AVX512BF16 && TARGET_AVX512VL))" > { > rtx op1 = gen_reg_rtx (V4SFmode); > rtx op0 = gen_reg_rtx (V4BFmode); > @@ -3016,7 +3020,7 @@ (define_expand "extendv2bfv2sf2" > [(set (match_operand:V2SF 0 "register_operand") > (float_extend:V2SF > (match_operand:V2BF 1 "nonimmediate_operand")))] > - "TARGET_SSE2 && TARGET_MMX_WITH_SSE" > + "TARGET_SSE2 && TARGET_MMX_WITH_SSE && !HONOR_NANS (BFmode)" > { > rtx op0 = gen_reg_rtx (V4SFmode); > rtx op1 = gen_reg_rtx (V4BFmode); > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 5eeb3ab221a..efe32e5149f 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -30995,7 +30995,10 @@ (define_expand "truncv4sfv4bf2" > [(set (match_operand:V4BF 0 "register_operand") > (float_truncate:V4BF > (match_operand:V4SF 1 "nonimmediate_operand")))] > - "TARGET_SSSE3" > + "TARGET_SSSE3 && !HONOR_NANS (BFmode) && !flag_rounding_math > + && (flag_unsafe_math_optimizations > + 
|| TARGET_AVXNECONVERT > + || (TARGET_AVX512BF16 && TARGET_AVX512VL))" > { > if (!TARGET_AVXNECONVERT > && !(TARGET_AVX512BF16 && TARGET_AVX512VL)) > @@ -31088,7 +31091,10 @@ (define_expand "truncv8sfv8bf2" > [(set (match_operand:V8BF 0 "register_operand") > (float_truncate:V8BF > (match_operand:V8SF 1 "nonimmediate_operand")))] > - "TARGET_AVX2" > + "TARGET_AVX2 && !HONOR_NANS (BFmode) && !flag_rounding_math > + && (flag_unsafe_math_optimizations > + || TARGET_AVXNECONVERT > + || (TARGET_AVX512BF16 && TARGET_AVX512VL))" > { > if (!TARGET_AVXNECONVERT > && !(TARGET_AVX512BF16 && TARGET_AVX512VL)) > @@ -31114,7 +31120,9 @@ (define_expand "truncv16sfv16bf2" > [(set (match_operand:V16BF 0 "register_operand") > (float_truncate:V16BF > (match_operand:V16SF 1 "nonimmediate_operand")))] > - "TARGET_AVX512BW && TARGET_EVEX512" > + "TARGET_AVX512BW && TARGET_EVEX512 > + && !HONOR_NANS (BFmode) && !flag_rounding_math > + && (flag_unsafe_math_optimizations || TARGET_AVX512BF16)" > { > if (!TARGET_AVX512BF16) > { > @@ -31127,7 +31135,7 @@ (define_expand "extend<sf_cvt_bf16_lower><mode>2" > [(set (match_operand:VF1_AVX512BW 0 "register_operand") > (float_extend:VF1_AVX512BW > (match_operand:<sf_cvt_bf16> 1 "nonimmediate_operand")))] > - "TARGET_SSE2" > + "TARGET_SSE2 && !HONOR_NANS (BFmode)" > { > ix86_expand_vector_bf2sf_with_vec_perm (operands[0], operands[1]); > DONE; > diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c > b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c > index da31bdba21b..1b4b62f1060 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c > +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */ > +/* { dg-options "-mavx512vl -mavx512bf16 -O2 -ffast-math" } */ > /* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */ > > #include "avx512bw-truncsfbf.c" > diff --git 
a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c > b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c > index 5b59958151f..e7c65b7ee01 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-mavx512bw -mavx512vl -O2" } */ > +/* { dg-options "-mavx512bw -mavx512vl -O2 -ffast-math" } */ > /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|vpunpcklwd)} 6 } } */ > > typedef float v4sf __attribute__((vector_size(16))); > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c > b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c > index 071db21cfb3..40802d865df 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert > -O2" } */ > +/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2 > -ffast-math" } */ > /* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */ > > typedef float v4sf __attribute__((vector_size(16))); > diff --git a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c > b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c > index 0f007df68f6..d7f77acd603 100644 > --- a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c > +++ b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-msse2 -O2" } */ > +/* { dg-options "-msse2 -O2 -ffast-math" } */ > /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|punpcklwd)} 2 { target > { ! 
ia32 } } } } */ > > typedef float v2sf __attribute__((vector_size(8))); > diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c > b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c > index 70840c537f1..af92f4d0bef 100644 > --- a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c > +++ b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */ > +/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" > } */ > /* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } > } */ > > typedef float v2sf __attribute__((vector_size(8))); > -- > 2.34.1 >
-- BR, Hongtao