On Thu, Jul 4, 2024 at 11:24 AM Levy Hsu <ad...@levyhsu.com> wrote:
>
> This patch extends support for BF16 vector operations in GCC, including 
> bitwise AND, ANDNOT, ABS, NEG, COPYSIGN, and XORSIGN for V8BF, V16BF, and 
> V32BF modes.
> Bootstrapped and tested on x86_64-linux-gnu. ok for trunk?
>
> gcc/ChangeLog:
>
>         * config/i386/i386-expand.cc (ix86_expand_fp_absneg_operator): Add 
> VBF modes.
>         (ix86_expand_copysign): Ditto.
>         (ix86_expand_xorsign): Ditto.
>         * config/i386/i386.cc (ix86_build_const_vector): Ditto.
>         (ix86_build_signbit_mask): Ditto.
>         * config/i386/sse.md: Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx2-bf16-vec-absneg.c: New test.
>         * gcc.target/i386/avx512f-bf16-vec-absneg.c: New test.
>
> ---
>  gcc/config/i386/i386-expand.cc                | 76 +++++++++++------
>  gcc/config/i386/i386.cc                       |  6 ++
>  gcc/config/i386/sse.md                        | 37 +++++---
>  .../gcc.target/i386/avx2-bf16-vec-absneg.c    | 85 +++++++++++++++++++
>  .../gcc.target/i386/avx512f-bf16-vec-absneg.c | 66 ++++++++++++++
>  5 files changed, 234 insertions(+), 36 deletions(-)
>  create mode 100755 gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c
>  create mode 100755 gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 5c29ee1353f..46d13a55e6a 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -2175,20 +2175,28 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, 
> machine_mode mode,
>    machine_mode vmode = mode;
>    rtvec par;
>
> -  if (vector_mode || mode == TFmode || mode == HFmode)
> -    {
> -      use_sse = true;
> -      if (mode == HFmode)
> -       vmode = V8HFmode;
> -    }
> -  else if (TARGET_SSE_MATH)
> -    {
> -      use_sse = SSE_FLOAT_MODE_P (mode);
> -      if (mode == SFmode)
> -       vmode = V4SFmode;
> -      else if (mode == DFmode)
> -       vmode = V2DFmode;
> -    }
> +  switch (mode)
> +  {
> +  case HFmode:
> +    use_sse = true;
> +    vmode = V8HFmode;
> +    break;
> +  case BFmode:
> +    use_sse = true;
> +    vmode = V8BFmode;
> +    break;
> +  case SFmode:
> +    use_sse = TARGET_SSE_MATH;
This should be: use_sse = TARGET_SSE_MATH && TARGET_SSE;
> +    vmode = V4SFmode;
> +    break;
> +  case DFmode:
> +    use_sse = TARGET_SSE_MATH;
This should be: use_sse = TARGET_SSE_MATH && TARGET_SSE2;
The rest LGTM.
> +    vmode = V2DFmode;
> +    break;
> +  default:
> +    use_sse = vector_mode || mode == TFmode;
> +    break;
> +  }
>
>    dst = operands[0];
>    src = operands[1];
> @@ -2321,16 +2329,26 @@ ix86_expand_copysign (rtx operands[])
>
>    mode = GET_MODE (operands[0]);
>
> -  if (mode == HFmode)
> +  switch (mode)
> +  {
> +  case HFmode:
>      vmode = V8HFmode;
> -  else if (mode == SFmode)
> +    break;
> +  case BFmode:
> +    vmode = V8BFmode;
> +    break;
> +  case SFmode:
>      vmode = V4SFmode;
> -  else if (mode == DFmode)
> +    break;
> +  case DFmode:
>      vmode = V2DFmode;
> -  else if (mode == TFmode)
> +    break;
> +  case TFmode:
>      vmode = mode;
> -  else
> -    gcc_unreachable ();
> +    break;
> +  default:
> +    gcc_unreachable();
> +  }
>
>    if (rtx_equal_p (operands[1], operands[2]))
>      {
> @@ -2391,14 +2409,24 @@ ix86_expand_xorsign (rtx operands[])
>
>    mode = GET_MODE (dest);
>
> -  if (mode == HFmode)
> +  switch (mode)
> +  {
> +  case HFmode:
>      vmode = V8HFmode;
> -  else if (mode == SFmode)
> +    break;
> +  case BFmode:
> +    vmode = V8BFmode;
> +    break;
> +  case SFmode:
>      vmode = V4SFmode;
> -  else if (mode == DFmode)
> +    break;
> +  case DFmode:
>      vmode = V2DFmode;
> -  else
> +    break;
> +  default:
>      gcc_unreachable ();
> +    break;
> +  }
>
>    temp = gen_reg_rtx (vmode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index d4ccc24be6e..b5768a65e52 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -16353,6 +16353,9 @@ ix86_build_const_vector (machine_mode mode, bool 
> vect, rtx value)
>      case E_V8DFmode:
>      case E_V4DFmode:
>      case E_V2DFmode:
> +    case E_V32BFmode:
> +    case E_V16BFmode:
> +    case E_V8BFmode:
>        n_elt = GET_MODE_NUNITS (mode);
>        v = rtvec_alloc (n_elt);
>        scalar_mode = GET_MODE_INNER (mode);
> @@ -16389,6 +16392,9 @@ ix86_build_signbit_mask (machine_mode mode, bool 
> vect, bool invert)
>      case E_V8HFmode:
>      case E_V16HFmode:
>      case E_V32HFmode:
> +    case E_V32BFmode:
> +    case E_V16BFmode:
> +    case E_V8BFmode:
>        vec_mode = mode;
>        imode = HImode;
>        break;
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 0be2dcd8891..1703bbb4250 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -351,7 +351,9 @@
>
>  ;; 128-, 256- and 512-bit float vector modes for bitwise operations
>  (define_mode_iterator VFB
> -  [(V32HF "TARGET_AVX512F && TARGET_EVEX512")
> +  [(V32BF "TARGET_AVX512F && TARGET_EVEX512")
> +   (V16BF "TARGET_AVX") (V8BF "TARGET_SSE2")
> +   (V32HF "TARGET_AVX512F && TARGET_EVEX512")
>     (V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
>     (V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
>     (V8DF "TARGET_AVX512F && TARGET_EVEX512")
> @@ -364,7 +366,8 @@
>
>  ;; 128- and 256-bit float vector modes for bitwise operations
>  (define_mode_iterator VFB_128_256
> -  [(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
> +  [(V16BF "TARGET_AVX") (V8BF "TARGET_SSE2")
> +   (V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
>     (V8SF "TARGET_AVX") V4SF
>     (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
>
> @@ -422,7 +425,10 @@
>
>  ;; All 512bit vector float modes for bitwise operations
>  (define_mode_iterator VFB_512
> -  [(V32HF "TARGET_EVEX512") (V16SF "TARGET_EVEX512") (V8DF 
> "TARGET_EVEX512")])
> +  [(V32BF "TARGET_EVEX512")
> +   (V32HF "TARGET_EVEX512")
> +   (V16SF "TARGET_EVEX512")
> +   (V8DF "TARGET_EVEX512")])
>
>  (define_mode_iterator V4SF_V8HF
>    [V4SF V8HF])
> @@ -929,6 +935,8 @@
>  (define_mode_attr sse
>    [(SF "sse") (DF "sse2") (HF "avx512fp16")
>     (V4SF "sse") (V2DF "sse2")
> +   (V32BF "avx512bf16") (V16BF "avx512bf16")
> +   (V8BF "avx512bf16")
>     (V32HF "avx512fp16") (V16HF "avx512fp16")
>     (V8HF "avx512fp16")
>     (V16SF "avx512f") (V8SF "avx")
> @@ -1058,7 +1066,8 @@
>  (define_mode_attr sseintvecmode2
>    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
>     (V8SF "OI") (V4SF "TI")
> -   (V16HF "OI") (V8HF "TI")])
> +   (V16HF "OI") (V8HF "TI")
> +   (V16BF "OI") (V8BF "TI")])
>
>  (define_mode_attr sseintvecmodelower
>    [(V32HF "v32hi") (V32BF "v32hi") (V16SF "v16si") (V8DF "v8di")
> @@ -4939,7 +4948,7 @@
>             (match_operand:VFB_128_256 1 "register_operand" "0,x,v,v"))
>           (match_operand:VFB_128_256 2 "vector_operand" "xBm,xjm,vm,vm")))]
>    "TARGET_SSE && <mask_avx512vl_condition>
> -   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
> +   && (!<mask_applied> || <ssescalarsize> != 16)"
>  {
>    char buf[128];
>    const char *ops;
> @@ -4961,6 +4970,8 @@
>
>    switch (get_attr_mode (insn))
>      {
> +    case MODE_V16BF:
> +    case MODE_V8BF:
>      case MODE_V16HF:
>      case MODE_V8HF:
>      case MODE_V8SF:
> @@ -5011,7 +5022,7 @@
>           (not:VFB_512
>             (match_operand:VFB_512 1 "register_operand" "v"))
>           (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
> -  "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
> +  "TARGET_AVX512F && (!<mask_applied> || <ssescalarsize> != 16)"
>  {
>    char buf[128];
>    const char *ops;
> @@ -5022,7 +5033,7 @@
>
>    /* Since there are no vandnp[sd] without AVX512DQ nor vandnph,
>       use vp<logic>[dq].  */
> -  if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode)
> +  if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode || <MODE>mode == V32BFmode)
>      {
>        suffix = GET_MODE_INNER (<MODE>mode) == DFmode ? "q" : "d";
>        ops = "p";
> @@ -5047,7 +5058,7 @@
>           (match_operand:VFB_128_256 1 "vector_operand")
>           (match_operand:VFB_128_256 2 "vector_operand")))]
>    "TARGET_SSE && <mask_avx512vl_condition>
> -   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
> +   && (!<mask_applied> || <ssescalarsize> != 16)"
>    "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
>
>  (define_expand "<code><mode>3<mask_name>"
> @@ -5055,7 +5066,7 @@
>         (any_logic:VFB_512
>           (match_operand:VFB_512 1 "nonimmediate_operand")
>           (match_operand:VFB_512 2 "nonimmediate_operand")))]
> -  "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
> +  "TARGET_AVX512F && (!<mask_applied> || <ssescalarsize> != 16)"
>    "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
>
>  (define_insn "*<code><mode>3<mask_name>"
> @@ -5064,7 +5075,7 @@
>           (match_operand:VFB_128_256 1 "vector_operand" "%0,x,v,v")
>           (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
>    "TARGET_SSE && <mask_avx512vl_condition>
> -   && (!<mask_applied> || <ssescalarmode>mode != HFmode)
> +   && (!<mask_applied> || <ssescalarsize> != 16)
>     && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
>  {
>    char buf[128];
> @@ -5087,6 +5098,8 @@
>
>    switch (get_attr_mode (insn))
>      {
> +    case MODE_V16BF:
> +    case MODE_V8BF:
>      case MODE_V16HF:
>      case MODE_V8HF:
>      case MODE_V8SF:
> @@ -5132,7 +5145,7 @@
>           (match_operand:VFB_512 1 "nonimmediate_operand" "%v")
>           (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
>    "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))
> -   && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
> +   && (!<mask_applied> || <ssescalarsize> != 16)"
>  {
>    char buf[128];
>    const char *ops;
> @@ -5143,7 +5156,7 @@
>
>    /* Since there are no v<logic>p[sd] without AVX512DQ nor v<logic>ph,
>       use vp<logic>[dq].  */
> -  if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode)
> +  if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode || <MODE>mode == V32BFmode)
>      {
>        suffix = GET_MODE_INNER (<MODE>mode) == DFmode ? "q" : "d";
>        ops = "p";
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c 
> b/gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c
> new file mode 100755
> index 00000000000..a3ee0b164f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c
> @@ -0,0 +1,85 @@
> +/* { dg-do run { target avx2 } } */
> +/* { dg-options "-O1 -mavx512bf16 -fdump-tree-vect-details 
> -fdump-tree-optimized" } */
> +
> +extern void abort (void);
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512BF16
> +#include "avx512-check.h"
> +
> +__bf16 b_128[8], r_abs_128[8], r_neg_128[8];
> +__bf16 b_256[16], r_abs_256[16], r_neg_256[16];
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +abs_128 (void)
> +{
> +  for (int i = 0; i < 8; i++)
> +    r_abs_128[i] = __builtin_fabsf16(b_128[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +neg_128 (void)
> +{
> +  for (int i = 0; i < 8; i++)
> +    r_neg_128[i] = -b_128[i];
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +abs_256 (void)
> +{
> +  for (int i = 0; i < 16; i++)
> +    r_abs_256[i] = __builtin_fabsf16(b_256[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +neg_256 (void)
> +{
> +  for (int i = 0; i < 16; i++)
> +    r_neg_256[i] = -b_256[i];
> +}
> +
> +void
> +check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
> +{
> +  for (int i = 0; i < len; i++)
> +    {
> +      __bf16 expected_abs = __builtin_fabsf16(b[i]);
> +      __bf16 expected_neg = -b[i];
> +      if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
> +        abort ();
> +    }
> +}
> +
> +static void
> +__attribute__ ((noinline, noclone))
> +do_test (void)
> +{
> +  /* Initialize test values */
> +  float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f,
> +                      -9.0f, 1.0f, -2.0f, 3.0f,
> +                      -4.0f, -5.0f, 6.0f, 7.0f,
> +                      -8.0f, -9.0f, 10.0f, 11.0f};
> +
> +  for (int i = 0; i < 8; i++)
> +    b_128[i] = (__bf16)float_b[i];
> +
> +  for (int i = 0; i < 16; i++)
> +    b_256[i] = (__bf16)float_b[i];
> +
> +  abs_128 ();
> +  neg_128 ();
> +  check_absneg_results (b_128, r_abs_128, r_neg_128, 8);
> +
> +  abs_256 ();
> +  neg_256 ();
> +  check_absneg_results (b_256, r_abs_256, r_neg_256, 16);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 
> 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 
> 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { 
> target { ! ia32 } } } } */
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c
> new file mode 100755
> index 00000000000..01c7ad77204
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c
> @@ -0,0 +1,66 @@
> +/* { dg-do run { target avx512f } } */
> +/* { dg-options "-O1 -mavx512bf16 -fdump-tree-vect-details 
> -fdump-tree-optimized" } */
> +
> +extern void abort (void);
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512BF16
> +#include "avx512-check.h"
> +
> +__bf16 b_512[32], r_abs_512[32], r_neg_512[32];
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf,
> +target("prefer-vector-width=512")))
> +abs_512 (void)
> +{
> +  for (int i = 0; i < 32; i++)
> +    r_abs_512[i] = __builtin_fabsf16(b_512[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf,
> +target("prefer-vector-width=512")))
> +neg_512 (void)
> +{
> +  for (int i = 0; i < 32; i++)
> +    r_neg_512[i] = -b_512[i];
> +}
> +
> +void
> +check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
> +{
> +  for (int i = 0; i < len; i++)
> +    {
> +      __bf16 expected_abs = __builtin_fabsf16(b[i]);
> +      __bf16 expected_neg = -b[i];
> +      if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
> +        abort ();
> +    }
> +}
> +
> +static void
> +__attribute__ ((noinline, noclone))
> +do_test (void)
> +{
> +  /* Initialize test values */
> +  float float_b[32] = {-1.2f, 3.4f, -5.6f, 7.8f,
> +                      -9.0f, 1.0f, -2.0f, 3.0f,
> +                      -4.0f, -5.0f, 6.0f, 7.0f,
> +                      -8.0f, -9.0f, 10.0f, 11.0f,
> +                      -1.2f, 3.4f, -5.6f, 7.8f,
> +                      -9.0f, 1.0f, -2.0f, 3.0f,
> +                      -4.0f, -5.0f, 6.0f, 7.0f,
> +                      -8.0f, -9.0f, 10.0f, 11.0f};
> +
> +  for (int i = 0; i < 32; i++)
> +    b_512[i] = (__bf16)float_b[i];
> +
> +  abs_512 ();
> +  neg_512 ();
> +  check_absneg_results (b_512, r_abs_512, r_neg_512, 32);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 
> 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 1 "optimized" { 
> target { ! ia32 } } } } */
> \ No newline at end of file
> --
> 2.31.1
>


-- 
BR,
Hongtao

Reply via email to