On Sat, Oct 20, 2018 at 8:46 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Many AVX512 vector operations can broadcast from a scalar memory source.
> This patch enables memory broadcast for FMSUB operations.  In order to
> support AVX512 memory broadcast for FMSUB, FMSUB builtin functions are
> also added, instead of passing the negated value to FMA builtin functions.
>
> gcc/
>
>         PR target/72782
>         * config/i386/avx512fintrin.h (_mm512_fmsub_round_pd): Use
>         __builtin_ia32_vfmsubpd512_mask.
>         (_mm512_mask_fmsub_round_pd): Likewise.
>         (_mm512_fmsub_pd): Likewise.
>         (_mm512_mask_fmsub_pd): Likewise.
>         (_mm512_maskz_fmsub_round_pd): Use
>         __builtin_ia32_vfmsubpd512_maskz.
>         (_mm512_maskz_fmsub_pd): Likewise.
>         (_mm512_fmsub_round_ps): Use __builtin_ia32_vfmsubps512_mask.
>         (_mm512_mask_fmsub_round_ps): Likewise.
>         (_mm512_fmsub_ps): Likewise.
>         (_mm512_mask_fmsub_ps): Likewise.
>         (_mm512_maskz_fmsub_round_ps): Use
>         __builtin_ia32_vfmsubps512_maskz.
>         (_mm512_maskz_fmsub_ps): Likewise.
>         * config/i386/avx512vlintrin.h (_mm256_mask_fmsub_pd): Use
>         __builtin_ia32_vfmsubpd256_mask.
>         (_mm256_maskz_fmsub_pd): Use __builtin_ia32_vfmsubpd256_maskz.
>         (_mm_mask_fmsub_pd): Use __builtin_ia32_vfmsubpd128_mask.
>         (_mm_maskz_fmsub_pd): Use __builtin_ia32_vfmsubpd128_maskz.
>         (_mm256_mask_fmsub_ps): Use __builtin_ia32_vfmsubps256_mask.
>         (_mm256_maskz_fmsub_ps): Use __builtin_ia32_vfmsubps256_maskz.
>         (_mm_mask_fmsub_ps): Use __builtin_ia32_vfmsubps128_mask.
>         (_mm_maskz_fmsub_ps): Use __builtin_ia32_vfmsubps128_maskz.
>         * config/i386/fmaintrin.h (_mm_fmsub_pd): Use
>         __builtin_ia32_vfmsubpd.
>         (_mm256_fmsub_pd): Use __builtin_ia32_vfmsubpd256.
>         (_mm_fmsub_ps): Use __builtin_ia32_vfmsubps.
>         (_mm256_fmsub_ps): Use __builtin_ia32_vfmsubps256.
>         (_mm_fmsub_sd): Use __builtin_ia32_vfmsubsd3.
>         (_mm_fmsub_ss): Use __builtin_ia32_vfmsubss3.
>         * config/i386/i386-builtin.def: Add
>         __builtin_ia32_vfmsubpd256_mask,
>         __builtin_ia32_vfmsubpd256_maskz,
>         __builtin_ia32_vfmsubpd128_mask,
>         __builtin_ia32_vfmsubpd128_maskz,
>         __builtin_ia32_vfmsubps256_mask,
>         __builtin_ia32_vfmsubps256_maskz,
>         __builtin_ia32_vfmsubps128_mask,
>         __builtin_ia32_vfmsubps128_maskz,
>         __builtin_ia32_vfmsubpd512_mask,
>         __builtin_ia32_vfmsubpd512_maskz,
>         __builtin_ia32_vfmsubps512_mask,
>         __builtin_ia32_vfmsubps512_maskz, __builtin_ia32_vfmsubss3,
>         __builtin_ia32_vfmsubsd3, __builtin_ia32_vfmsubps,
>         __builtin_ia32_vfmsubpd, __builtin_ia32_vfmsubps256 and
>         __builtin_ia32_vfmsubpd256.
>         * config/i386/sse.md (fma4i_fmsub_<mode>): New.
>         (<avx512>_fmsub_<mode>_maskz<round_expand_name>): Likewise.
>         (*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_1):
>         Likewise.
>         (*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_2):
>         Likewise.
>         (*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_3):
>         Likewise.
>         (fmai_vmfmsub_<mode><round_name>): Likewise.
>
> gcc/testsuite/
>
>         PR target/72782
>         * gcc.target/i386/avx512f-fmsub-df-zmm-1.c: New test.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-1.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-2.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-3.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-4.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-5.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-6.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Likewise.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-8.c: Likewise.
>         * gcc.target/i386/avx512vl-fmsub-sf-xmm-1.c: Likewise.
>         * gcc.target/i386/avx512vl-fmsub-sf-ymm-1.c: Likewise.

LGTM.

Thanks,
Uros.

> ---
>  gcc/config/i386/avx512fintrin.h               | 60 +++++++--------
>  gcc/config/i386/avx512vlintrin.h              | 32 ++++----
>  gcc/config/i386/fmaintrin.h                   | 24 +++---
>  gcc/config/i386/i386-builtin.def              | 18 +++++
>  gcc/config/i386/sse.md                        | 77 +++++++++++++++++++
>  .../gcc.target/i386/avx512f-fmsub-df-zmm-1.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-1.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-2.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-3.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-4.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-5.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-6.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c  | 12 +++
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-8.c  | 12 +++
>  .../gcc.target/i386/avx512vl-fmsub-sf-xmm-1.c | 12 +++
>  .../gcc.target/i386/avx512vl-fmsub-sf-ymm-1.c | 12 +++
>  16 files changed, 285 insertions(+), 58 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-df-zmm-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-xmm-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-ymm-1.c
>
> diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
> index 8473cd0d26c..c0c8fa1efd0 100644
> --- a/gcc/config/i386/avx512fintrin.h
> +++ b/gcc/config/i386/avx512fintrin.h
> @@ -3355,9 +3355,9 @@ extern __inline __m512d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
>  {
> -  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
> +  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
>                                                     (__v8df) __B,
> -                                                   -(__v8df) __C,
> +                                                   (__v8df) __C,
>                                                     (__mmask8) -1, __R);
>  }
>
> @@ -3366,9 +3366,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
>                             __m512d __C, const int __R)
>  {
> -  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
> +  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
>                                                     (__v8df) __B,
> -                                                   -(__v8df) __C,
> +                                                   (__v8df) __C,
>                                                     (__mmask8) __U, __R);
>  }
>
> @@ -3388,9 +3388,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
>                              __m512d __C, const int __R)
>  {
> -  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
> +  return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A,
>                                                      (__v8df) __B,
> -                                                    -(__v8df) __C,
> +                                                    (__v8df) __C,
>                                                      (__mmask8) __U, __R);
>  }
>
> @@ -3398,9 +3398,9 @@ extern __inline __m512
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
>  {
> -  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
> +  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
>                                                    (__v16sf) __B,
> -                                                  -(__v16sf) __C,
> +                                                  (__v16sf) __C,
>                                                    (__mmask16) -1, __R);
>  }
>
> @@ -3409,9 +3409,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
>                             __m512 __C, const int __R)
>  {
> -  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
> +  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
>                                                    (__v16sf) __B,
> -                                                  -(__v16sf) __C,
> +                                                  (__v16sf) __C,
>                                                    (__mmask16) __U, __R);
>  }
>
> @@ -3431,9 +3431,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
>                              __m512 __C, const int __R)
>  {
> -  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
> +  return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A,
>                                                     (__v16sf) __B,
> -                                                   -(__v16sf) __C,
> +                                                   (__v16sf) __C,
>                                                     (__mmask16) __U, __R);
>  }
>
> @@ -3806,28 +3806,28 @@ _mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 
> __A, __m512 __B,
>      (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R)
>
>  #define _mm512_fmsub_round_pd(A, B, C, R)            \
> -    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), -1, R)
> +    (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R)
>
>  #define _mm512_mask_fmsub_round_pd(A, U, B, C, R)    \
> -    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), U, R)
> +    (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, U, R)
>
>  #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R)   \
>      (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R)
>
>  #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R)   \
> -    (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, -(C), U, R)
> +    (__m512d)__builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R)
>
>  #define _mm512_fmsub_round_ps(A, B, C, R)            \
> -    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), -1, R)
> +    (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, -1, R)
>
>  #define _mm512_mask_fmsub_round_ps(A, U, B, C, R)    \
> -    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), U, R)
> +    (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, U, R)
>
>  #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R)   \
>      (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R)
>
>  #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R)   \
> -    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, -(C), U, R)
> +    (__m512)__builtin_ia32_vfmsubps512_maskz(A, B, C, U, R)
>
>  #define _mm512_fmaddsub_round_pd(A, B, C, R)            \
>      (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R)
> @@ -12416,9 +12416,9 @@ extern __inline __m512d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C)
>  {
> -  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
> +  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
>                                                     (__v8df) __B,
> -                                                   -(__v8df) __C,
> +                                                   (__v8df) __C,
>                                                     (__mmask8) -1,
>                                                     _MM_FROUND_CUR_DIRECTION);
>  }
> @@ -12427,9 +12427,9 @@ extern __inline __m512d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
>  {
> -  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
> +  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
>                                                     (__v8df) __B,
> -                                                   -(__v8df) __C,
> +                                                   (__v8df) __C,
>                                                     (__mmask8) __U,
>                                                     _MM_FROUND_CUR_DIRECTION);
>  }
> @@ -12449,9 +12449,9 @@ extern __inline __m512d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
>  {
> -  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
> +  return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A,
>                                                      (__v8df) __B,
> -                                                    -(__v8df) __C,
> +                                                    (__v8df) __C,
>                                                      (__mmask8) __U,
>                                                      _MM_FROUND_CUR_DIRECTION);
>  }
> @@ -12460,9 +12460,9 @@ extern __inline __m512
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C)
>  {
> -  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
> +  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
>                                                    (__v16sf) __B,
> -                                                  -(__v16sf) __C,
> +                                                  (__v16sf) __C,
>                                                    (__mmask16) -1,
>                                                    _MM_FROUND_CUR_DIRECTION);
>  }
> @@ -12471,9 +12471,9 @@ extern __inline __m512
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
>  {
> -  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
> +  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
>                                                    (__v16sf) __B,
> -                                                  -(__v16sf) __C,
> +                                                  (__v16sf) __C,
>                                                    (__mmask16) __U,
>                                                    _MM_FROUND_CUR_DIRECTION);
>  }
> @@ -12493,9 +12493,9 @@ extern __inline __m512
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
>  {
> -  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
> +  return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A,
>                                                     (__v16sf) __B,
> -                                                   -(__v16sf) __C,
> +                                                   (__v16sf) __C,
>                                                     (__mmask16) __U,
>                                                     _MM_FROUND_CUR_DIRECTION);
>  }
> diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h
> index 68b5537845b..1e2e6da29bd 100644
> --- a/gcc/config/i386/avx512vlintrin.h
> +++ b/gcc/config/i386/avx512vlintrin.h
> @@ -4117,9 +4117,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
>                       __m256d __C)
>  {
> -  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
> +  return (__m256d) __builtin_ia32_vfmsubpd256_mask ((__v4df) __A,
>                                                     (__v4df) __B,
> -                                                   -(__v4df) __C,
> +                                                   (__v4df) __C,
>                                                     (__mmask8) __U);
>  }
>
> @@ -4139,9 +4139,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
>                        __m256d __C)
>  {
> -  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
> +  return (__m256d) __builtin_ia32_vfmsubpd256_maskz ((__v4df) __A,
>                                                      (__v4df) __B,
> -                                                    -(__v4df) __C,
> +                                                    (__v4df) __C,
>                                                      (__mmask8) __U);
>  }
>
> @@ -4149,9 +4149,9 @@ extern __inline __m128d
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
>  {
> -  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
> +  return (__m128d) __builtin_ia32_vfmsubpd128_mask ((__v2df) __A,
>                                                     (__v2df) __B,
> -                                                   -(__v2df) __C,
> +                                                   (__v2df) __C,
>                                                     (__mmask8) __U);
>  }
>
> @@ -4171,9 +4171,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
>                     __m128d __C)
>  {
> -  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
> +  return (__m128d) __builtin_ia32_vfmsubpd128_maskz ((__v2df) __A,
>                                                      (__v2df) __B,
> -                                                    -(__v2df) __C,
> +                                                    (__v2df) __C,
>                                                      (__mmask8) __U);
>  }
>
> @@ -4181,9 +4181,9 @@ extern __inline __m256
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
>  {
> -  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
> +  return (__m256) __builtin_ia32_vfmsubps256_mask ((__v8sf) __A,
>                                                    (__v8sf) __B,
> -                                                  -(__v8sf) __C,
> +                                                  (__v8sf) __C,
>                                                    (__mmask8) __U);
>  }
>
> @@ -4203,9 +4203,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
>                        __m256 __C)
>  {
> -  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
> +  return (__m256) __builtin_ia32_vfmsubps256_maskz ((__v8sf) __A,
>                                                     (__v8sf) __B,
> -                                                   -(__v8sf) __C,
> +                                                   (__v8sf) __C,
>                                                     (__mmask8) __U);
>  }
>
> @@ -4213,9 +4213,9 @@ extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
>  {
> -  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
> +  return (__m128) __builtin_ia32_vfmsubps128_mask ((__v4sf) __A,
>                                                    (__v4sf) __B,
> -                                                  -(__v4sf) __C,
> +                                                  (__v4sf) __C,
>                                                    (__mmask8) __U);
>  }
>
> @@ -4233,9 +4233,9 @@ extern __inline __m128
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
>  {
> -  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
> +  return (__m128) __builtin_ia32_vfmsubps128_maskz ((__v4sf) __A,
>                                                     (__v4sf) __B,
> -                                                   -(__v4sf) __C,
> +                                                   (__v4sf) __C,
>                                                     (__mmask8) __U);
>  }
>
> diff --git a/gcc/config/i386/fmaintrin.h b/gcc/config/i386/fmaintrin.h
> index 660d3453590..2eddd896579 100644
> --- a/gcc/config/i386/fmaintrin.h
> +++ b/gcc/config/i386/fmaintrin.h
> @@ -86,48 +86,48 @@ extern __inline __m128d
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_fmsub_pd (__m128d __A, __m128d __B, __m128d __C)
>  {
> -  return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B,
> -                                           -(__v2df)__C);
> +  return (__m128d)__builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B,
> +                                           (__v2df)__C);
>  }
>
>  extern __inline __m256d
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
>  {
> -  return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B,
> -                                              -(__v4df)__C);
> +  return (__m256d)__builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B,
> +                                              (__v4df)__C);
>  }
>
>  extern __inline __m128
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_fmsub_ps (__m128 __A, __m128 __B, __m128 __C)
>  {
> -  return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B,
> -                                          -(__v4sf)__C);
> +  return (__m128)__builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B,
> +                                          (__v4sf)__C);
>  }
>
>  extern __inline __m256
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C)
>  {
> -  return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
> -                                             -(__v8sf)__C);
> +  return (__m256)__builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B,
> +                                             (__v8sf)__C);
>  }
>
>  extern __inline __m128d
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_fmsub_sd (__m128d __A, __m128d __B, __m128d __C)
>  {
> -  return (__m128d)__builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B,
> -                                            -(__v2df)__C);
> +  return (__m128d)__builtin_ia32_vfmsubsd3 ((__v2df)__A, (__v2df)__B,
> +                                            (__v2df)__C);
>  }
>
>  extern __inline __m128
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_fmsub_ss (__m128 __A, __m128 __B, __m128 __C)
>  {
> -  return (__m128)__builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B,
> -                                           -(__v4sf)__C);
> +  return (__m128)__builtin_ia32_vfmsubss3 ((__v4sf)__A, (__v4sf)__B,
> +                                           (__v4sf)__C);
>  }
>
>  extern __inline __m128d
> diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
> index dc4c70c7ea3..f5b5e56a01c 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -1903,10 +1903,18 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 
> CODE_FOR_avx512vl_fmadd_v8sf_maskz, "__builtin_
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmadd_v4sf_mask, 
> "__builtin_ia32_vfmaddps128_mask", IX86_BUILTIN_VFMADDPS128_MASK, UNKNOWN, 
> (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmadd_v4sf_mask3, 
> "__builtin_ia32_vfmaddps128_mask3", IX86_BUILTIN_VFMADDPS128_MASK3, UNKNOWN, 
> (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmadd_v4sf_maskz, 
> "__builtin_ia32_vfmaddps128_maskz", IX86_BUILTIN_VFMADDPS128_MASKZ, UNKNOWN, 
> (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v4df_mask, 
> "__builtin_ia32_vfmsubpd256_mask", IX86_BUILTIN_VFMSUBPD256_MASK, UNKNOWN, 
> (int) V4DF_FTYPE_V4DF_V4DF_V4DF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v4df_mask3, 
> "__builtin_ia32_vfmsubpd256_mask3", IX86_BUILTIN_VFMSUBPD256_MASK3, UNKNOWN, 
> (int) V4DF_FTYPE_V4DF_V4DF_V4DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v4df_maskz, 
> "__builtin_ia32_vfmsubpd256_maskz", IX86_BUILTIN_VFMSUBPD256_MASKZ, UNKNOWN, 
> (int) V4DF_FTYPE_V4DF_V4DF_V4DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v2df_mask, 
> "__builtin_ia32_vfmsubpd128_mask", IX86_BUILTIN_VFMSUBPD128_MASK, UNKNOWN, 
> (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v2df_mask3, 
> "__builtin_ia32_vfmsubpd128_mask3", IX86_BUILTIN_VFMSUBPD128_MASK3, UNKNOWN, 
> (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v2df_maskz, 
> "__builtin_ia32_vfmsubpd128_maskz", IX86_BUILTIN_VFMSUBPD128_MASKZ, UNKNOWN, 
> (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v8sf_mask, 
> "__builtin_ia32_vfmsubps256_mask", IX86_BUILTIN_VFMSUBPS256_MASK, UNKNOWN, 
> (int) V8SF_FTYPE_V8SF_V8SF_V8SF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v8sf_mask3, 
> "__builtin_ia32_vfmsubps256_mask3", IX86_BUILTIN_VFMSUBPS256_MASK3, UNKNOWN, 
> (int) V8SF_FTYPE_V8SF_V8SF_V8SF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v8sf_maskz, 
> "__builtin_ia32_vfmsubps256_maskz", IX86_BUILTIN_VFMSUBPS256_MASKZ, UNKNOWN, 
> (int) V8SF_FTYPE_V8SF_V8SF_V8SF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v4sf_mask, 
> "__builtin_ia32_vfmsubps128_mask", IX86_BUILTIN_VFMSUBPS128_MASK, UNKNOWN, 
> (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v4sf_mask3, 
> "__builtin_ia32_vfmsubps128_mask3", IX86_BUILTIN_VFMSUBPS128_MASK3, UNKNOWN, 
> (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fmsub_v4sf_maskz, 
> "__builtin_ia32_vfmsubps128_maskz", IX86_BUILTIN_VFMSUBPS128_MASKZ, UNKNOWN, 
> (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fnmadd_v4df_mask, 
> "__builtin_ia32_vfnmaddpd256_mask", IX86_BUILTIN_VFNMADDPD256_MASK, UNKNOWN, 
> (int) V4DF_FTYPE_V4DF_V4DF_V4DF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fnmadd_v2df_mask, 
> "__builtin_ia32_vfnmaddpd128_mask", IX86_BUILTIN_VFNMADDPD128_MASK, UNKNOWN, 
> (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_fnmadd_v8sf_mask, 
> "__builtin_ia32_vfnmaddps256_mask", IX86_BUILTIN_VFNMADDPS256_MASK, UNKNOWN, 
> (int) V8SF_FTYPE_V8SF_V8SF_V8SF_UQI)
> @@ -2768,8 +2776,12 @@ BDESC (OPTION_MASK_ISA_AVX512F, 
> CODE_FOR_avx512f_fmaddsub_v16sf_mask3_round, "__
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_maskz_round, 
> "__builtin_ia32_vfmaddsubps512_maskz", IX86_BUILTIN_VFMADDSUBPS512_MASKZ, 
> UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsubadd_v8df_mask3_round, 
> "__builtin_ia32_vfmsubaddpd512_mask3", IX86_BUILTIN_VFMSUBADDPD512_MASK3, 
> UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsubadd_v16sf_mask3_round, 
> "__builtin_ia32_vfmsubaddps512_mask3", IX86_BUILTIN_VFMSUBADDPS512_MASK3, 
> UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT)
> +BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v8df_mask_round, 
> "__builtin_ia32_vfmsubpd512_mask", IX86_BUILTIN_VFMSUBPD512_MASK, UNKNOWN, 
> (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v8df_mask3_round, 
> "__builtin_ia32_vfmsubpd512_mask3", IX86_BUILTIN_VFMSUBPD512_MASK3, UNKNOWN, 
> (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT)
> +BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v8df_maskz_round, 
> "__builtin_ia32_vfmsubpd512_maskz", IX86_BUILTIN_VFMSUBPD512_MASKZ, UNKNOWN, 
> (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT)
> +BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v16sf_mask_round, 
> "__builtin_ia32_vfmsubps512_mask", IX86_BUILTIN_VFMSUBPS512_MASK, UNKNOWN, 
> (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v16sf_mask3_round, 
> "__builtin_ia32_vfmsubps512_mask3", IX86_BUILTIN_VFMSUBPS512_MASK3, UNKNOWN, 
> (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT)
> +BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v16sf_maskz_round, 
> "__builtin_ia32_vfmsubps512_maskz", IX86_BUILTIN_VFMSUBPS512_MASKZ, UNKNOWN, 
> (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmadd_v8df_mask_round, 
> "__builtin_ia32_vfnmaddpd512_mask", IX86_BUILTIN_VFNMADDPD512_MASK, UNKNOWN, 
> (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmadd_v16sf_mask_round, 
> "__builtin_ia32_vfnmaddps512_mask", IX86_BUILTIN_VFNMADDPS512_MASK, UNKNOWN, 
> (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT)
>  BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v8df_mask_round, 
> "__builtin_ia32_vfnmsubpd512_mask", IX86_BUILTIN_VFNMSUBPD512_MASK, UNKNOWN, 
> (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT)
> @@ -2855,11 +2867,17 @@ BDESC_FIRST (multi_arg, MULTI_ARG,
>  BDESC (OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df, 
> "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD, UNKNOWN, 
> (int)MULTI_ARG_3_DF)
>  BDESC (OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf, 
> "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3, UNKNOWN, 
> (int)MULTI_ARG_3_SF)
>  BDESC (OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df, 
> "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3, UNKNOWN, 
> (int)MULTI_ARG_3_DF)
> +BDESC (OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmsub_v4sf, 
> "__builtin_ia32_vfmsubss3", IX86_BUILTIN_VFMSUBSS3, UNKNOWN, 
> (int)MULTI_ARG_3_SF)
> +BDESC (OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmsub_v2df, 
> "__builtin_ia32_vfmsubsd3", IX86_BUILTIN_VFMSUBSD3, UNKNOWN, 
> (int)MULTI_ARG_3_DF)
>
>  BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmadd_v4sf, "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS, 
> UNKNOWN, (int)MULTI_ARG_3_SF)
>  BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmadd_v2df, "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD, 
> UNKNOWN, (int)MULTI_ARG_3_DF)
>  BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmadd_v8sf, "__builtin_ia32_vfmaddps256", 
> IX86_BUILTIN_VFMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2)
>  BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmadd_v4df, "__builtin_ia32_vfmaddpd256", 
> IX86_BUILTIN_VFMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2)
> +BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmsub_v4sf, "__builtin_ia32_vfmsubps", IX86_BUILTIN_VFMSUBPS, 
> UNKNOWN, (int)MULTI_ARG_3_SF)
> +BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmsub_v2df, "__builtin_ia32_vfmsubpd", IX86_BUILTIN_VFMSUBPD, 
> UNKNOWN, (int)MULTI_ARG_3_DF)
> +BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmsub_v8sf, "__builtin_ia32_vfmsubps256", 
> IX86_BUILTIN_VFMSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2)
> +BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, 
> CODE_FOR_fma4i_fmsub_v4df, "__builtin_ia32_vfmsubpd256", 
> IX86_BUILTIN_VFMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2)
>
>  BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf, 
> "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS, UNKNOWN, 
> (int)MULTI_ARG_3_SF)
>  BDESC (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df, 
> "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD, UNKNOWN, 
> (int)MULTI_ARG_3_DF)
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 635a6902d33..0e9d3541ccc 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -3760,6 +3760,14 @@
>           (match_operand:FMAMODE_AVX512 2 "nonimmediate_operand")
>           (match_operand:FMAMODE_AVX512 3 "nonimmediate_operand")))])
>
> +(define_expand "fma4i_fmsub_<mode>"
> +  [(set (match_operand:FMAMODE_AVX512 0 "register_operand")
> +       (fma:FMAMODE_AVX512
> +         (match_operand:FMAMODE_AVX512 1 "nonimmediate_operand")
> +         (match_operand:FMAMODE_AVX512 2 "nonimmediate_operand")
> +         (neg:FMAMODE_AVX512
> +           (match_operand:FMAMODE_AVX512 3 "nonimmediate_operand"))))])
> +
>  (define_expand "<avx512>_fmadd_<mode>_maskz<round_expand_name>"
>    [(match_operand:VF_AVX512VL 0 "register_operand")
>     (match_operand:VF_AVX512VL 1 "<round_expand_nimm_predicate>")
> @@ -3898,6 +3906,20 @@
>     (set_attr "type" "ssemuladd")
>     (set_attr "mode" "<MODE>")])
>
> +(define_expand "<avx512>_fmsub_<mode>_maskz<round_expand_name>"
> +  [(match_operand:VF_AVX512VL 0 "register_operand")
> +   (match_operand:VF_AVX512VL 1 "<round_expand_nimm_predicate>")
> +   (match_operand:VF_AVX512VL 2 "<round_expand_nimm_predicate>")
> +   (match_operand:VF_AVX512VL 3 "<round_expand_nimm_predicate>")
> +   (match_operand:<avx512fmaskmode> 4 "register_operand")]
> +  "TARGET_AVX512F && <round_mode512bit_condition>"
> +{
> +  emit_insn (gen_fma_fmsub_<mode>_maskz_1<round_expand_name> (
> +    operands[0], operands[1], operands[2], operands[3],
> +    CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>));
> +  DONE;
> +})
> +
>  (define_insn "<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name><round_name>"
>    [(set (match_operand:VF_SF_AVX512VL 0 "register_operand" "=v,v,v")
>         (fma:VF_SF_AVX512VL
> @@ -3913,6 +3935,49 @@
>    [(set_attr "type" "ssemuladd")
>     (set_attr "mode" "<MODE>")])
>
> +(define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_1"
> +  [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v")
> +       (fma:VF_AVX512
> +         (match_operand:VF_AVX512 1 "register_operand" "0,v")
> +         (match_operand:VF_AVX512 2 "register_operand" "v,0")
> +         (neg:VF_AVX512
> +           (vec_duplicate:VF_AVX512
> +             (match_operand:<ssescalarmode> 3 "memory_operand" "m,m")))))]
> +  "TARGET_AVX512F && <sd_mask_mode512bit_condition>"
> +  "vfmsub213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}"
> +  [(set_attr "type" "ssemuladd")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_2"
> +  [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v")
> +       (fma:VF_AVX512
> +         (vec_duplicate:VF_AVX512
> +           (match_operand:<ssescalarmode> 1 "memory_operand" "m,m"))
> +         (match_operand:VF_AVX512 2 "register_operand" "0,v")
> +         (neg:VF_AVX512
> +           (match_operand:VF_AVX512 3 "register_operand" "v,0"))))]
> +  "TARGET_AVX512F && <sd_mask_mode512bit_condition>"
> +  "@
> +   vfmsub132<ssemodesuffix>\t{%1<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %1<avx512bcst>}
> +   vfmsub231<ssemodesuffix>\t{%1<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %1<avx512bcst>}"
> +  [(set_attr "type" "ssemuladd")
> +   (set_attr "mode" "<MODE>")])
> +
> +(define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_3"
> +  [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v")
> +       (fma:VF_AVX512
> +         (match_operand:VF_AVX512 1 "register_operand" "0,v")
> +         (vec_duplicate:VF_AVX512
> +           (match_operand:<ssescalarmode> 2 "memory_operand" "m,m"))
> +         (neg:VF_AVX512
> +           (match_operand:VF_AVX512 3 "nonimmediate_operand" "v,0"))))]
> +  "TARGET_AVX512F && <sd_mask_mode512bit_condition>"
> +  "@
> +   vfmsub132<ssemodesuffix>\t{%2<avx512bcst>, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<avx512bcst>}
> +   vfmsub231<ssemodesuffix>\t{%2<avx512bcst>, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<avx512bcst>}"
> +  [(set_attr "type" "ssemuladd")
> +   (set_attr "mode" "<MODE>")])
> +
>  (define_insn "<avx512>_fmsub_<mode>_mask<round_name>"
>    [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
>         (vec_merge:VF_AVX512VL
> @@ -4261,6 +4326,18 @@
>           (const_int 1)))]
>    "TARGET_FMA")
>
> +(define_expand "fmai_vmfmsub_<mode><round_name>"
> +  [(set (match_operand:VF_128 0 "register_operand")
> +       (vec_merge:VF_128
> +         (fma:VF_128
> +           (match_operand:VF_128 1 "<round_nimm_predicate>")
> +           (match_operand:VF_128 2 "<round_nimm_predicate>")
> +           (neg:VF_128
> +             (match_operand:VF_128 3 "<round_nimm_predicate>")))
> +         (match_dup 1)
> +         (const_int 1)))]
> +  "TARGET_FMA")
> +
>  (define_insn "*fmai_fmadd_<mode>"
>    [(set (match_operand:VF_128 0 "register_operand" "=v,v")
>          (vec_merge:VF_128
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-df-zmm-1.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-df-zmm-1.c
> new file mode 100644
> index 00000000000..840888a2d81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-df-zmm-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...pd\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastsd\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512d
> +#define vec 512
> +#define op fmsub
> +#define suffix pd
> +#define SCALAR double
> +
> +#include "avx512-fma-1.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-1.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-1.c
> new file mode 100644
> index 00000000000..0cb675b7628
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-1.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-2.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-2.c
> new file mode 100644
> index 00000000000..10212d471b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-2.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-3.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-3.c
> new file mode 100644
> index 00000000000..feb34077085
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-3.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-4.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-4.c
> new file mode 100644
> index 00000000000..4305fffe628
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-4.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-4.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-5.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-5.c
> new file mode 100644
> index 00000000000..d57251f83be
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-5.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-5.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-6.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-6.c
> new file mode 100644
> index 00000000000..b26a9ee7eeb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-6.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%zmm\[0-9\]+" } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-6.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
> new file mode 100644
> index 00000000000..cc705af8ea5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-7.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-8.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-8.c
> new file mode 100644
> index 00000000000..2b929fa11e8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-8.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+%zmm\[0-9\]+, %zmm\[0-9\]+, %zmm0" 1 } } */
> +
> +#define type __m512
> +#define vec 512
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-8.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-xmm-1.c 
> b/gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-xmm-1.c
> new file mode 100644
> index 00000000000..70efbcc98f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-xmm-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mfma -mavx512vl -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %xmm\[0-9\]+, %xmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%xmm\[0-9\]+" } } */
> +
> +#define type __m128
> +#define vec
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-1.h"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-ymm-1.c 
> b/gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-ymm-1.c
> new file mode 100644
> index 00000000000..a7c1b370b74
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-fmsub-sf-ymm-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mfma -mavx512vl -O2" } */
> +/* { dg-final { scan-assembler-times "vfmsub...ps\[ \\t\]+\\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %ymm\[0-9\]+, %ymm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vbroadcastss\[^\n\]*%ymm\[0-9\]+" } } */
> +
> +#define type __m256
> +#define vec 256
> +#define op fmsub
> +#define suffix ps
> +#define SCALAR float
> +
> +#include "avx512-fma-1.h"
> --
> 2.17.2
>

Reply via email to