On Mon, Sep 2, 2024 at 4:33 PM Levy Hsu <ad...@levyhsu.com> wrote:
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> This patch introduces new mode iterators and expands for the i386
> architecture to support partial vectorization of bf16 operations using
> AVX10.2 instructions. These operations include addition, subtraction,
> multiplication, division, and square root calculations for V2BF and V4BF data
> types.

Ok.

>
> gcc/ChangeLog:
>
> * config/i386/mmx.md (VBF_32_64): New mode iterator for partial
> vectorized V2BF/V4BF.
> (<insn><mode>3): New define_expand for plusminusmultdiv.
> (sqrt<mode>2): New define_expand for sqrt.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
> * gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.
> ---
> gcc/config/i386/mmx.md | 37 ++++++++++++
> .../avx10_2-partial-bf-vector-fast-math-1.c | 22 +++++++
> .../avx10_2-partial-bf-vector-operations-1.c | 57 +++++++++++++++++++
> 3 files changed, 116 insertions(+)
> create mode 100644
> gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> create mode 100644
> gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index e0065ed4d48..9116ddb5321 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -94,6 +94,8 @@
>
> (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
>
> +(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
> +
> ;; Mapping from integer vector mode to mnemonic suffix
> (define_mode_attr mmxvecsize
> [(V8QI "b") (V4QI "b") (V2QI "b")
> @@ -2036,6 +2038,26 @@
> DONE;
> })
>
> +;; VDIVNEPBF16 does not generate floating point exceptions.
> +(define_expand "<insn><mode>3"
> + [(set (match_operand:VBF_32_64 0 "register_operand")
> + (plusminusmultdiv:VBF_32_64
> + (match_operand:VBF_32_64 1 "nonimmediate_operand")
> + (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
> + "TARGET_AVX10_2_256"
> +{
> + rtx op0 = gen_reg_rtx (V8BFmode);
> + rtx op1 = lowpart_subreg (V8BFmode,
> + force_reg (<MODE>mode, operands[1]), <MODE>mode);
> + rtx op2 = lowpart_subreg (V8BFmode,
> + force_reg (<MODE>mode, operands[2]), <MODE>mode);
> +
> + emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
> +
> + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
> + DONE;
> +})
> +
> (define_expand "divv2hf3"
> [(set (match_operand:V2HF 0 "register_operand")
> (div:V2HF
> @@ -2091,6 +2113,21 @@
> DONE;
> })
>
> +(define_expand "sqrt<mode>2"
> + [(set (match_operand:VBF_32_64 0 "register_operand")
> + (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
> + "TARGET_AVX10_2_256"
> +{
> + rtx op0 = gen_reg_rtx (V8BFmode);
> + rtx op1 = lowpart_subreg (V8BFmode,
> + force_reg (<MODE>mode, operands[1]), <MODE>mode);
> +
> + emit_insn (gen_sqrtv8bf2 (op0, op1));
> +
> + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
> + DONE;
> +})
> +
> (define_expand "<code><mode>2"
> [(set (match_operand:VHF_32_64 0 "register_operand")
> (absneg:VHF_32_64
> diff --git
> a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> new file mode 100644
> index 00000000000..fd064f17445
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target { !
ia32 } } } */
> +/* { dg-options "-mavx10.2 -O2" } */
> +/* { dg-final { scan-assembler-times "vmulnepbf16\[
> \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
> \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vrcppbf16\[
> \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
> +typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
> +
> +
> +__attribute__((optimize("fast-math")))
> +v4bf
> +foo_div_fast_math_4 (v4bf a, v4bf b)
> +{
> + return a / b;
> +}
> +
> +__attribute__((optimize("fast-math")))
> +v2bf
> +foo_div_fast_math_2 (v2bf a, v2bf b)
> +{
> + return a / b;
> +}
> diff --git
> a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> new file mode 100644
> index 00000000000..e7ee08a20a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile { target { !
ia32 } } } */
> +/* { dg-options "-mavx10.2 -O2" } */
> +/* { dg-final { scan-assembler-times "vmulnepbf16\[
> \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
> \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vaddnepbf16\[
> \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
> \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vdivnepbf16\[
> \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
> \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vsubnepbf16\[
> \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
> \\t\]+#)" 2 } } */
> +
> +typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
> +typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
> +
> +v4bf
> +foo_mul_4 (v4bf a, v4bf b)
> +{
> + return a * b;
> +}
> +
> +v4bf
> +foo_add_4 (v4bf a, v4bf b)
> +{
> + return a + b;
> +}
> +
> +v4bf
> +foo_div_4 (v4bf a, v4bf b)
> +{
> + return a / b;
> +}
> +
> +v4bf
> +foo_sub_4 (v4bf a, v4bf b)
> +{
> + return a - b;
> +}
> +
> +v2bf
> +foo_mul_2 (v2bf a, v2bf b)
> +{
> + return a * b;
> +}
> +
> +v2bf
> +foo_add_2 (v2bf a, v2bf b)
> +{
> + return a + b;
> +}
> +
> +v2bf
> +foo_div_2 (v2bf a, v2bf b)
> +{
> + return a / b;
> +}
> +
> +v2bf
> +foo_sub_2 (v2bf a, v2bf b)
> +{
> + return a - b;
> +}
> --
> 2.31.1
>
--
BR,
Hongtao