The extension is implemented as a vector permutation with a zero vector.

gcc/ChangeLog:
* config/i386/i386-expand.cc (ix86_expand_vector_bf2sf_with_vec_perm): New function. * config/i386/i386-protos.h (ix86_expand_vector_bf2sf_with_vec_perm): New Declare. * config/i386/mmx.md (extendv2bfv2sf2): New expander. * config/i386/sse.md (extend<sf_cvt_bf16_lower><mode>2): Ditto. (VF1_AVX512BW): New mode iterator. (sf_cvt_bf16): Add V4SF. (sf_cvt_bf16_lower): New mode attr. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bw-extendbf2sf.c: New test. * gcc.target/i386/sse2-extendbf2sf.c: New test. --- gcc/config/i386/i386-expand.cc | 39 ++++++++++++++++ gcc/config/i386/i386-protos.h | 2 + gcc/config/i386/mmx.md | 18 ++++++++ gcc/config/i386/sse.md | 20 +++++++- .../gcc.target/i386/avx512bw-extendbf2sf.c | 46 +++++++++++++++++++ .../gcc.target/i386/sse2-extendbf2sf.c | 20 ++++++++ 6 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 7138432659e..df9676b80d4 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -26854,5 +26854,44 @@ ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src) emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode)); } +/* Implement extendv8bf2v8sf2 with vector permutation. */ +void +ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src) +{ + machine_mode vperm_mode, src_mode = GET_MODE (src); + switch (src_mode) + { + case V16BFmode: + vperm_mode = V32BFmode; + break; + case V8BFmode: + vperm_mode = V16BFmode; + break; + case V4BFmode: + vperm_mode = V8BFmode; + break; + default: + gcc_unreachable (); + } + + int nelt = GET_MODE_NUNITS (vperm_mode); + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); + for (int i = 0, k = 0, j = nelt; i != nelt; i++) + sel[i] = i & 1 ? 
j++ : k++; + + vec_perm_indices indices (sel, 2, nelt); + + rtx target = gen_reg_rtx (vperm_mode); + rtx op1 = lowpart_subreg (vperm_mode, + force_reg (src_mode, src), + src_mode); + rtx op0 = CONST0_RTX (vperm_mode); + bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode, + target, op0, op1, indices); + gcc_assert (ok); + emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode)); +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 55ffdb9dcf1..c26ae5e4f1d 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -259,6 +259,8 @@ extern bool ix86_ternlog_operand_p (rtx op); extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, rtx target); extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx); +extern void ix86_expand_vector_bf2sf_with_vec_perm (rtx, rtx); + #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5c776ec0aba..021ac90ae2a 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3012,6 +3012,24 @@ (define_expand "truncv2sfv2bf2" DONE; }) +(define_expand "extendv2bfv2sf2" + [(set (match_operand:V2SF 0 "register_operand") + (float_extend:V2SF + (match_operand:V2BF 1 "nonimmediate_operand")))] + "TARGET_SSE2 && TARGET_MMX_WITH_SSE" +{ + rtx op0 = gen_reg_rtx (V4SFmode); + rtx op1 = gen_reg_rtx (V4BFmode); + + emit_move_insn (op1, lowpart_subreg (V4BFmode, + force_reg (V2BFmode, operands[1]), + V2BFmode)); + emit_insn (gen_extendv4bfv4sf2 (op0, op1)); + + emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode)); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral arithmetic diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 7f7910383ae..3d57a90fad7 100644 --- a/gcc/config/i386/sse.md +++ 
b/gcc/config/i386/sse.md @@ -530,6 +530,9 @@ (define_mode_iterator VF2_AVX512VL (define_mode_iterator VF1_AVX512VL [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")]) +(define_mode_iterator VF1_AVX512BW + [(V16SF "TARGET_EVEX512 && TARGET_EVEX512") (V8SF "TARGET_AVX2") V4SF]) + (define_mode_iterator VF1_AVX10_2 [(V16SF "TARGET_AVX10_2_512") V8SF V4SF]) @@ -30925,7 +30928,11 @@ (define_mode_attr bf16_cvt_2sf [(V32BF "V16SF") (V16BF "V8SF") (V8BF "V4SF")]) ;; Converting from SF to BF (define_mode_attr sf_cvt_bf16 - [(V8SF "V8BF") (V16SF "V16BF")]) + [(V4SF "V4BF") (V8SF "V8BF") (V16SF "V16BF")]) + +(define_mode_attr sf_cvt_bf16_lower + [(V4SF "v4bf") (V8SF "v8bf") (V16SF "v16bf")]) + ;; Mapping from BF to SF (define_mode_attr sf_bf16 [(V4SF "V8BF") (V8SF "V16BF") (V16SF "V32BF")]) @@ -31084,6 +31091,17 @@ (define_expand "truncv16sfv16bf2" } }) +(define_expand "extend<sf_cvt_bf16_lower><mode>2" + [(set (match_operand:VF1_AVX512BW 0 "register_operand") + (float_extend:VF1_AVX512BW + (match_operand:<sf_cvt_bf16> 1 "nonimmediate_operand")))] + "TARGET_SSE2" +{ + ix86_expand_vector_bf2sf_with_vec_perm (operands[0], operands[1]); + DONE; +}) + + (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>" [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v") (float_truncate:<sf_cvt_bf16> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c new file mode 100644 index 00000000000..5b59958151f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|vpunpcklwd)} 6 } } */ + +typedef float v4sf __attribute__((vector_size(16))); +typedef float v8sf __attribute__((vector_size(32))); +typedef float v16sf __attribute__((vector_size(64))); +typedef __bf16 v4bf __attribute__((vector_size(8))); +typedef __bf16 v8bf 
__attribute__((vector_size(16))); +typedef __bf16 v16bf __attribute__((vector_size(32))); + +v4sf +foo (v4bf b, v4bf a) +{ + return __builtin_convertvector (a, v4sf); +} + +v8sf +foo2 (v8bf b, v8bf a) +{ + return __builtin_convertvector (a, v8sf); +} + +v16sf +foo3 (v16bf b, v16bf a) +{ + return __builtin_convertvector (a, v16sf); +} + +v4sf +foo_mem (v4bf* a) +{ + return __builtin_convertvector (*a, v4sf); +} + +v8sf +foo2_mem (v8bf* a) +{ + return __builtin_convertvector (*a, v8sf); +} + +v16sf +foo3_mem (v16bf* a) +{ + return __builtin_convertvector (*a, v16sf); +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c new file mode 100644 index 00000000000..0f007df68f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|punpcklwd)} 2 { target { ! ia32 } } } } */ + +typedef float v2sf __attribute__((vector_size(8))); +typedef __bf16 v2bf __attribute__((vector_size(4))); + +v2sf +foo (v2bf b, v2bf a) +{ + return __builtin_convertvector (a, v2sf); +} + + +v2sf +foo_mem (v2bf* a) +{ + return __builtin_convertvector (*a, v2sf); +} + -- 2.34.1