Re: [PATCH] i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

Hongtao Liu Mon, 02 Sep 2024 19:01:23 -0700

On Mon, Sep 2, 2024 at 4:33 PM Levy Hsu <ad...@levyhsu.com> wrote:
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> This patch introduces a new mode iterator and new define_expand patterns
> for the i386 architecture to support partial vectorization of bf16
> operations using AVX10.2 instructions. The supported operations are
> addition, subtraction, multiplication, division, and square root for the
> V2BF and V4BF data types.
Ok.
>
> gcc/ChangeLog:
>
>         * config/i386/mmx.md (VBF_32_64): New mode iterator for
>         partial vectorized V2BF/V4BF.
>         (<insn><mode>3): New define_expand for plusminusmultdiv.
>         (sqrt<mode>2): New define_expand for sqrt.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
>         * gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.
> ---
>  gcc/config/i386/mmx.md                        | 37 ++++++++++++
>  .../avx10_2-partial-bf-vector-fast-math-1.c   | 22 +++++++
>  .../avx10_2-partial-bf-vector-operations-1.c  | 57 +++++++++++++++++++
>  3 files changed, 116 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index e0065ed4d48..9116ddb5321 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -94,6 +94,8 @@
>
>  (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
>
> +(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
> +
>  ;; Mapping from integer vector mode to mnemonic suffix
>  (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
> @@ -2036,6 +2038,26 @@
>    DONE;
>  })
>
> +;; VDIVNEPBF16 does not generate floating point exceptions.
> +(define_expand "<insn><mode>3"
> +  [(set (match_operand:VBF_32_64 0 "register_operand")
> +    (plusminusmultdiv:VBF_32_64
> +      (match_operand:VBF_32_64 1 "nonimmediate_operand")
> +      (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
> +  "TARGET_AVX10_2_256"
> +{
> +  rtx op0 = gen_reg_rtx (V8BFmode);
> +  rtx op1 = lowpart_subreg (V8BFmode,
> +                           force_reg (<MODE>mode, operands[1]), <MODE>mode);
> +  rtx op2 = lowpart_subreg (V8BFmode,
> +                           force_reg (<MODE>mode, operands[2]), <MODE>mode);
> +
> +  emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
> +
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
> +  DONE;
> +})
> +
>  (define_expand "divv2hf3"
>    [(set (match_operand:V2HF 0 "register_operand")
>         (div:V2HF
> @@ -2091,6 +2113,21 @@
>    DONE;
>  })
>
> +(define_expand "sqrt<mode>2"
> +  [(set (match_operand:VBF_32_64 0 "register_operand")
> +       (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
> +  "TARGET_AVX10_2_256"
> +{
> +  rtx op0 = gen_reg_rtx (V8BFmode);
> +  rtx op1 = lowpart_subreg (V8BFmode,
> +                           force_reg (<MODE>mode, operands[1]), <MODE>mode);
> +
> +  emit_insn (gen_sqrtv8bf2 (op0, op1));
> +
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
> +  DONE;
> +})
> +
>  (define_expand "<code><mode>2"
>    [(set (match_operand:VHF_32_64 0 "register_operand")
>         (absneg:VHF_32_64
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> new file mode 100644
> index 00000000000..fd064f17445
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx10.2 -O2" } */
> +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
> +typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
> +
> +
> +__attribute__((optimize("fast-math")))
> +v4bf
> +foo_div_fast_math_4 (v4bf a, v4bf b)
> +{
> +  return a / b;
> +}
> +
> +__attribute__((optimize("fast-math")))
> +v2bf
> +foo_div_fast_math_2 (v2bf a, v2bf b)
> +{
> +  return a / b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> new file mode 100644
> index 00000000000..e7ee08a20a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx10.2 -O2" } */
> +/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
> +
> +typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
> +typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
> +
> +v4bf
> +foo_mul_4 (v4bf a, v4bf b)
> +{
> +  return a * b;
> +}
> +
> +v4bf
> +foo_add_4 (v4bf a, v4bf b)
> +{
> +  return a + b;
> +}
> +
> +v4bf
> +foo_div_4 (v4bf a, v4bf b)
> +{
> +  return a / b;
> +}
> +
> +v4bf
> +foo_sub_4 (v4bf a, v4bf b)
> +{
> +  return a - b;
> +}
> +
> +v2bf
> +foo_mul_2 (v2bf a, v2bf b)
> +{
> +  return a * b;
> +}
> +
> +v2bf
> +foo_add_2 (v2bf a, v2bf b)
> +{
> +  return a + b;
> +}
> +
> +v2bf
> +foo_div_2 (v2bf a, v2bf b)
> +{
> +  return a / b;
> +}
> +
> +v2bf
> +foo_sub_2 (v2bf a, v2bf b)
> +{
> +  return a - b;
> +}
> --
> 2.31.1
>



-- 
BR,
Hongtao

Re: [PATCH] i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

Reply via email to