Generate native instruction whenever possible, otherwise use vector permutation with odd indices.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_expand_vector_sf2bf_with_vec_perm): New function. * config/i386/i386-protos.h (ix86_expand_vector_sf2bf_with_vec_perm): New declaration. * config/i386/mmx.md (truncv2sfv2bf2): New expander. * config/i386/sse.md (truncv4sfv4bf2): Ditto. (truncv8sfv8bf2): Ditto. (truncv16sfv16bf2): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bf16-truncsfbf.c: New test. * gcc.target/i386/avx512bw-truncsfbf.c: New test. * gcc.target/i386/ssse3-truncsfbf.c: New test. --- gcc/config/i386/i386-expand.cc | 38 +++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/mmx.md | 18 ++++++++ gcc/config/i386/sse.md | 44 ++++++++++++++++++ .../gcc.target/i386/avx512bf16-truncsfbf.c | 5 ++ .../gcc.target/i386/avx512bw-truncsfbf.c | 46 +++++++++++++++++++ .../gcc.target/i386/ssse3-truncsfbf.c | 20 ++++++++ 7 files changed, 172 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c create mode 100644 gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 63f5e348d64..7138432659e 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -26817,4 +26817,42 @@ ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_m emit_move_insn (output, gen_lowpart (out_mode, d.target)); } +/* Implement V4SF/V8SF/V16SF to BF truncation with vector permutation. 
*/ +void +ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src) +{ + machine_mode vperm_mode, src_mode = GET_MODE (src); + switch (src_mode) + { + case V16SFmode: + vperm_mode = V32BFmode; + break; + case V8SFmode: + vperm_mode = V16BFmode; + break; + case V4SFmode: + vperm_mode = V8BFmode; + break; + default: + gcc_unreachable (); + } + + int nelt = GET_MODE_NUNITS (vperm_mode); + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); + for (int i = 0; i != nelt; i++) + sel[i] = (2 * i + 1) % nelt; + vec_perm_indices indices (sel, 1, nelt); + + rtx target = gen_reg_rtx (vperm_mode); + rtx op0 = lowpart_subreg (vperm_mode, + force_reg (src_mode, src), + src_mode); + bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode, + target, op0, op0, indices); + gcc_assert (ok); + emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode)); +} + + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index c1f9147769c..55ffdb9dcf1 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -258,6 +258,7 @@ extern int ix86_ternlog_idx (rtx op, rtx *args); extern bool ix86_ternlog_operand_p (rtx op); extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, rtx target); +extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 506f4cab6a8..5c776ec0aba 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2994,6 +2994,24 @@ (define_expand "truncv2sfv2hf2" DONE; }) +(define_expand "truncv2sfv2bf2" + [(set (match_operand:V2BF 0 "register_operand") + (float_truncate:V2BF + (match_operand:V2SF 1 "nonimmediate_operand")))] + "TARGET_SSSE3 && TARGET_MMX_WITH_SSE" +{ + rtx op1 = gen_reg_rtx (V4SFmode); + rtx op0 = gen_reg_rtx (V4BFmode); + + emit_move_insn (op1, 
lowpart_subreg (V4SFmode, + force_reg (V2SFmode, operands[1]), + V2SFmode)); + emit_insn (gen_truncv4sfv4bf2 (op0, op1)); + + emit_move_insn (operands[0], lowpart_subreg (V2BFmode, op0, V4BFmode)); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral arithmetic diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6c28b74ac3f..7f7910383ae 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -30952,6 +30952,24 @@ (define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>" "TARGET_AVX512BF16" "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}") +(define_expand "truncv4sfv4bf2" + [(set (match_operand:V4BF 0 "register_operand") + (float_truncate:V4BF + (match_operand:V4SF 1 "nonimmediate_operand")))] + "TARGET_SSSE3" +{ + if (!TARGET_AVXNECONVERT + && !(TARGET_AVX512BF16 && TARGET_AVX512VL)) + ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]); + else + { + rtx dest = gen_reg_rtx (V8BFmode); + emit_insn (gen_vcvtneps2bf16_v4sf (dest, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V4BFmode, dest, V8BFmode)); + } + DONE; +}) + (define_expand "vcvtneps2bf16_v4sf" [(set (match_operand:V8BF 0 "register_operand") (vec_concat:V8BF @@ -31027,6 +31045,20 @@ (define_expand "avx512f_cvtneps2bf16_<mode>_maskz" DONE; }) +(define_expand "truncv8sfv8bf2" + [(set (match_operand:V8BF 0 "register_operand") + (float_truncate:V8BF + (match_operand:V8SF 1 "nonimmediate_operand")))] + "TARGET_AVX2" +{ + if (!TARGET_AVXNECONVERT + && !(TARGET_AVX512BF16 && TARGET_AVX512VL)) + { + ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]); + DONE; + } +}) + (define_insn "vcvtneps2bf16_v8sf" [(set (match_operand:V8BF 0 "register_operand" "=x,v") (float_truncate:V8BF @@ -31039,6 +31071,18 @@ (define_insn "vcvtneps2bf16_v8sf" (set_attr "addr" "gpr16,*") (set_attr "prefix" "vex,evex")]) +(define_expand "truncv16sfv16bf2" + [(set (match_operand:V16BF 0 
"register_operand") + (float_truncate:V16BF + (match_operand:V16SF 1 "nonimmediate_operand")))] + "TARGET_AVX512BW && TARGET_EVEX512" +{ + if (!TARGET_AVX512BF16) + { + ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]); + DONE; + } +}) (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>" [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v") diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c new file mode 100644 index 00000000000..da31bdba21b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */ + +#include "avx512bw-truncsfbf.c" diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c new file mode 100644 index 00000000000..071db21cfb3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */ +/* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */ + +typedef float v4sf __attribute__((vector_size(16))); +typedef float v8sf __attribute__((vector_size(32))); +typedef float v16sf __attribute__((vector_size(64))); +typedef __bf16 v4bf __attribute__((vector_size(8))); +typedef __bf16 v8bf __attribute__((vector_size(16))); +typedef __bf16 v16bf __attribute__((vector_size(32))); + +v4bf +foo (v4sf b, v4sf a) +{ + return __builtin_convertvector (a, v4bf); +} + +v8bf +foo2 (v8sf b, v8sf a) +{ + return __builtin_convertvector (a, v8bf); +} + +v16bf +foo3 (v16sf b, v16sf a) +{ + return __builtin_convertvector (a, v16bf); +} + +v4bf +foo_mem (v4sf* a) +{ + return __builtin_convertvector (*a, v4bf); +} + +v8bf +foo2_mem (v8sf* a) +{ + return __builtin_convertvector (*a, v8bf); +} + +v16bf +foo3_mem 
(v16sf* a) +{ + return __builtin_convertvector (*a, v16bf); +} diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c new file mode 100644 index 00000000000..70840c537f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */ +/* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */ + +typedef float v2sf __attribute__((vector_size(8))); +typedef __bf16 v2bf __attribute__((vector_size(4))); + +v2bf +foo (v2sf b, v2sf a) +{ + return __builtin_convertvector (a, v2bf); +} + + +v2bf +foo_mem (v2sf* a) +{ + return __builtin_convertvector (*a, v2bf); +} + -- 2.34.1