On Wed, Sep 4, 2024 at 10:53 AM Levy Hsu <ad...@levyhsu.com> wrote:
>
> Hi
>
> This patch adds support for bf16 operations in the V2BF and V4BF modes on i386,
> covering signbit, xorsign, copysign, abs, neg and the and/andnot/ior/xor logical operations.
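>
> For reference, here is a minimal sketch (not part of the patch; array and
> function names are illustrative) of the kind of bf16 source these V2BF/V4BF
> patterns are meant to cover:
>
>   __bf16 a[4], r_abs[4], r_neg[4];
>
>   void
>   absneg4 (void)
>   {
>     /* A 4-element bf16 group: abs and neg can now be vectorized through
>        the V4BF patterns instead of staying scalar; the 2-element case
>        uses V2BF in the same way.  */
>     for (int i = 0; i < 4; i++)
>       {
>         r_abs[i] = __builtin_fabsf16 (a[i]);
>         r_neg[i] = -a[i];
>       }
>   }
>
> The new test below exercises exactly this shape for both the 2- and
> 4-element cases.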
>
> Bootstrapped and tested on x86_64-pc-linux-gnu.
> Ok for trunk?
Ok.
>
> gcc/ChangeLog:
>
>         * config/i386/i386.cc (ix86_build_const_vector): Add V2BF/V4BF.
>         (ix86_build_signbit_mask): Add V2BF/V4BF.
>         * config/i386/mmx.md (VHBF_32_64): New mode iterator.
>         (mmxintvecmode): Add V4BF and V2BF.
>         Use VHBF_32_64 instead of VHF_32_64 in the absneg, logic,
>         copysign, xorsign and signbit patterns.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/part-vect-absnegbf.c: New test.
> ---
>  gcc/config/i386/i386.cc                       |  4 +
>  gcc/config/i386/mmx.md                        | 74 +++++++++--------
>  .../gcc.target/i386/part-vect-absnegbf.c      | 81 +++++++++++++++++++
>  3 files changed, 124 insertions(+), 35 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 78bf890f14b..2bbfb1bf5fc 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -16176,6 +16176,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
>      case E_V32BFmode:
>      case E_V16BFmode:
>      case E_V8BFmode:
> +    case E_V4BFmode:
> +    case E_V2BFmode:
>        n_elt = GET_MODE_NUNITS (mode);
>        v = rtvec_alloc (n_elt);
>        scalar_mode = GET_MODE_INNER (mode);
> @@ -16215,6 +16217,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
>      case E_V32BFmode:
>      case E_V16BFmode:
>      case E_V8BFmode:
> +    case E_V4BFmode:
> +    case E_V2BFmode:
>        vec_mode = mode;
>        imode = HImode;
>        break;
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index cb2697537a8..44adcd8d8e0 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -121,7 +121,7 @@
>  ;; Mapping of vector float modes to an integer mode of the same size
>  (define_mode_attr mmxintvecmode
>    [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
> -   (V4HF "V4HI") (V2HF "V2HI")])
> +   (V4HF "V4HI") (V2HF "V2HI") (V4BF "V4HI") (V2BF "V2HI")])
>
>  (define_mode_attr mmxintvecmodelower
>    [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")
> @@ -2091,18 +2091,22 @@
>    DONE;
>  })
>
> +(define_mode_iterator VHBF_32_64
> + [V2BF (V4BF "TARGET_MMX_WITH_SSE")
> +  V2HF (V4HF "TARGET_MMX_WITH_SSE")])
> +
>  (define_expand "<code><mode>2"
> -  [(set (match_operand:VHF_32_64 0 "register_operand")
> -       (absneg:VHF_32_64
> -         (match_operand:VHF_32_64 1 "register_operand")))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand")
> +       (absneg:VHBF_32_64
> +         (match_operand:VHBF_32_64 1 "register_operand")))]
>    "TARGET_SSE"
>    "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
>
>  (define_insn_and_split "*mmx_<code><mode>"
> -  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
> -       (absneg:VHF_32_64
> -         (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))
> -   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
> +       (absneg:VHBF_32_64
> +         (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))
> +   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
>    "TARGET_SSE"
>    "#"
>    "&& reload_completed"
> @@ -2115,11 +2119,11 @@
>    [(set_attr "isa" "noavx,noavx,avx")])
>
>  (define_insn_and_split "*mmx_nabs<mode>2"
> -  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
> -       (neg:VHF_32_64
> -         (abs:VHF_32_64
> -           (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))))
> -   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
> +       (neg:VHBF_32_64
> +         (abs:VHBF_32_64
> +           (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"))))
> +   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
>    "TARGET_SSE"
>    "#"
>    "&& reload_completed"
> @@ -2410,11 +2414,11 @@
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>
>  (define_insn "*mmx_andnot<mode>3"
> -  [(set (match_operand:VHF_32_64 0 "register_operand"    "=x,x")
> -       (and:VHF_32_64
> -         (not:VHF_32_64
> -           (match_operand:VHF_32_64 1 "register_operand" "0,x"))
> -         (match_operand:VHF_32_64 2 "register_operand"   "x,x")))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand"    "=x,x")
> +       (and:VHBF_32_64
> +         (not:VHBF_32_64
> +           (match_operand:VHBF_32_64 1 "register_operand" "0,x"))
> +         (match_operand:VHBF_32_64 2 "register_operand"   "x,x")))]
>    "TARGET_SSE"
>    "@
>     andnps\t{%2, %0|%0, %2}
> @@ -2425,10 +2429,10 @@
>     (set_attr "mode" "V4SF")])
>
>  (define_insn "<code><mode>3"
> -  [(set (match_operand:VHF_32_64 0 "register_operand"   "=x,x")
> -       (any_logic:VHF_32_64
> -         (match_operand:VHF_32_64 1 "register_operand" "%0,x")
> -         (match_operand:VHF_32_64 2 "register_operand" " x,x")))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand"   "=x,x")
> +       (any_logic:VHBF_32_64
> +         (match_operand:VHBF_32_64 1 "register_operand" "%0,x")
> +         (match_operand:VHBF_32_64 2 "register_operand" " x,x")))]
>    "TARGET_SSE"
>    "@
>     <logic>ps\t{%2, %0|%0, %2}
> @@ -2440,14 +2444,14 @@
>
>  (define_expand "copysign<mode>3"
>    [(set (match_dup 4)
> -       (and:VHF_32_64
> -         (not:VHF_32_64 (match_dup 3))
> -         (match_operand:VHF_32_64 1 "register_operand")))
> +       (and:VHBF_32_64
> +         (not:VHBF_32_64 (match_dup 3))
> +         (match_operand:VHBF_32_64 1 "register_operand")))
>     (set (match_dup 5)
> -       (and:VHF_32_64 (match_dup 3)
> -                 (match_operand:VHF_32_64 2 "register_operand")))
> -   (set (match_operand:VHF_32_64 0 "register_operand")
> -       (ior:VHF_32_64 (match_dup 4) (match_dup 5)))]
> +       (and:VHBF_32_64 (match_dup 3)
> +                 (match_operand:VHBF_32_64 2 "register_operand")))
> +   (set (match_operand:VHBF_32_64 0 "register_operand")
> +       (ior:VHBF_32_64 (match_dup 4) (match_dup 5)))]
>    "TARGET_SSE"
>  {
>    operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false);
> @@ -2458,11 +2462,11 @@
>
>  (define_expand "xorsign<mode>3"
>    [(set (match_dup 4)
> -       (and:VHF_32_64 (match_dup 3)
> -                 (match_operand:VHF_32_64 2 "register_operand")))
> -   (set (match_operand:VHF_32_64 0 "register_operand")
> -       (xor:VHF_32_64 (match_dup 4)
> -                 (match_operand:VHF_32_64 1 "register_operand")))]
> +       (and:VHBF_32_64 (match_dup 3)
> +                 (match_operand:VHBF_32_64 2 "register_operand")))
> +   (set (match_operand:VHBF_32_64 0 "register_operand")
> +       (xor:VHBF_32_64 (match_dup 4)
> +                 (match_operand:VHBF_32_64 1 "register_operand")))]
>    "TARGET_SSE"
>  {
>    operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false);
> @@ -2474,7 +2478,7 @@
>    [(set (match_operand:<mmxintvecmode> 0 "register_operand")
>         (lshiftrt:<mmxintvecmode>
>           (subreg:<mmxintvecmode>
> -           (match_operand:VHF_32_64 1 "register_operand") 0)
> +           (match_operand:VHBF_32_64 1 "register_operand") 0)
>           (match_dup 2)))]
>    "TARGET_SSE2"
>  {
> diff --git a/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
> new file mode 100644
> index 00000000000..2d7ae35298e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
> @@ -0,0 +1,81 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -fdump-tree-vect-details -fdump-tree-slp-details -fdump-tree-optimized" } */
> +
> +extern void abort (void);
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512BF16
> +#include "avx512-check.h"
> +
> +__bf16 b_32[2], r_abs_32[2], r_neg_32[2];
> +__bf16 b_64[4], r_abs_64[4], r_neg_64[4];
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +abs_32 (void)
> +{
> +  for (int i = 0; i < 2; i++)
> +    r_abs_32[i] = __builtin_fabsf16 (b_32[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +neg_32 (void)
> +{
> +  for (int i = 0; i < 2; i++)
> +    r_neg_32[i] = -b_32[i];
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +abs_64 (void)
> +{
> +  for (int i = 0; i < 4; i++)
> +    r_abs_64[i] = __builtin_fabsf16 (b_64[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +neg_64 (void)
> +{
> +  for (int i = 0; i < 4; i++)
> +    r_neg_64[i] = -b_64[i];
> +}
> +
> +void
> +check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
> +{
> +  for (int i = 0; i < len; i++)
> +    {
> +      __bf16 expected_abs = __builtin_fabsf16 (b[i]);
> +      __bf16 expected_neg = -b[i];
> +      if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
> +        abort ();
> +    }
> +}
> +
> +static void
> +__attribute__ ((noinline, noclone))
> +do_test (void)
> +{
> +  float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f};
> +
> +  for (int i = 0; i < 2; i++)
> +    b_32[i] = (__bf16) float_b[i];
> +
> +  for (int i = 0; i < 4; i++)
> +    b_64[i] = (__bf16) float_b[i];
> +
> +  abs_32 ();
> +  neg_32 ();
> +  check_absneg_results (b_32, r_abs_32, r_neg_32, 2);
> +
> +  abs_64 ();
> +  neg_64 ();
> +  check_absneg_results (b_64, r_abs_64, r_neg_64, 4);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized using 4 byte vectors" 2 
> "slp1" } } */
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 8 byte vectors" 
> 2 "vect" { target { ! ia32 } } } } */
> +/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { target { ! ia32 } } } } */
> --
> 2.31.1
>


-- 
BR,
Hongtao