From: konglin1 <lingling.k...@intel.com>

gcc/ChangeLog:

	* config/i386/avx10_2-512bf16intrin.h: Add new intrinsics.
	* config/i386/avx10_2bf16intrin.h: Ditto.
	* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE
	for new types.
	* config/i386/i386-builtin.def (BDESC): Add new builtins.
	* config/i386/i386-expand.cc (ix86_expand_args_builtin): Handle
	new types.
	* config/i386/sse.md (avx10_2_rsqrtpbf16_<mode><mask_name>):
	New define_insn.
	(avx10_2_sqrtnepbf16_<mode><mask_name>): Ditto.
	(avx10_2_rcppbf16_<mode><mask_name>): Ditto.
	(avx10_2_getexppbf16_<mode><mask_name>): Ditto.
	(BF16IMMOP): New iterator.
	(bf16immop): Ditto.
	(avx10_2_<bf16immop>pbf16_<mode><mask_name>): New define_insn.
	(avx10_2_fpclasspbf16_<mode><mask_scalar_merge_name>): Ditto.
	(avx10_2_cmppbf16_<mode><mask_scalar_merge_name>): Ditto.
	(avx10_2_comsbf16_v8bf): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx10-helper.h: Add helper functions.
	* gcc.target/i386/avx10_2-512-bf16-1.c: Add new tests.
	* gcc.target/i386/avx10_2-bf16-1.c: Ditto.
	* gcc.target/i386/avx-1.c: Add macros.
	* gcc.target/i386/sse-13.c: Ditto.
	* gcc.target/i386/sse-14.c: Ditto.
	* gcc.target/i386/sse-22.c: Ditto.
	* gcc.target/i386/sse-23.c: Ditto.
	* gcc.target/i386/avx10_2-512-vcmppbf16-2.c: New test.
	* gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vgetexppbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vrcppbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vreducenepbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vcmppbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vcomsbf16-1.c: Ditto.
	* gcc.target/i386/avx10_2-vcomsbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vfpclasspbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vgetexppbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vgetmantpbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vrcppbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vreducenepbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vrndscalenepbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vrsqrtpbf16-2.c: Ditto.
	* gcc.target/i386/avx10_2-vsqrtnepbf16-2.c: Ditto.
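For reviewers, a minimal usage sketch of the new 512-bit unary intrinsics (not part of the patch; it assumes a compiler with this patch applied and is compiled with -O2 -mavx10.2-512):

#include <immintrin.h>

void
example (__m512bh *out1, __m512bh *out2, __m512bh x, __mmask32 m)
{
  /* Square root of all 32 bf16 lanes, round-to-nearest-even.  */
  __m512bh s = _mm512_sqrtne_pbh (x);
  /* Reciprocal square root approximation; lanes not selected by m
     keep the corresponding lane of s (merge-masking).  */
  *out1 = _mm512_mask_rsqrt_pbh (s, m, x);
  /* Reciprocal approximation with zero-masking, then vgetexppbf16
     extracts floor(log2(|lane|)) of each lane as a bf16 value.  */
  *out2 = _mm512_getexp_pbh (_mm512_maskz_rcp_pbh (m, x));
}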
Co-authored-by: Levy Hsu <ad...@levyhsu.com> --- gcc/config/i386/avx10_2-512bf16intrin.h | 317 +++++++++ gcc/config/i386/avx10_2bf16intrin.h | 650 ++++++++++++++++++ gcc/config/i386/i386-builtin-types.def | 10 + gcc/config/i386/i386-builtin.def | 33 + gcc/config/i386/i386-expand.cc | 16 + gcc/config/i386/sse.md | 92 +++ gcc/testsuite/gcc.target/i386/avx-1.c | 19 + gcc/testsuite/gcc.target/i386/avx10-check.h | 4 +- gcc/testsuite/gcc.target/i386/avx10-helper.h | 28 + .../gcc.target/i386/avx10_2-512-bf16-1.c | 58 ++ .../gcc.target/i386/avx10_2-512-vcmppbf16-2.c | 36 + .../i386/avx10_2-512-vfpclasspbf16-2.c | 44 ++ .../i386/avx10_2-512-vgetexppbf16-2.c | 47 ++ .../i386/avx10_2-512-vgetmantpbf16-2.c | 50 ++ .../gcc.target/i386/avx10_2-512-vrcppbf16-2.c | 45 ++ .../i386/avx10_2-512-vreducenepbf16-2.c | 50 ++ .../i386/avx10_2-512-vrndscalenepbf16-2.c | 46 ++ .../i386/avx10_2-512-vrsqrtpbf16-2.c | 47 ++ .../i386/avx10_2-512-vscalefpbf16-2.c | 2 +- .../i386/avx10_2-512-vsqrtnepbf16-2.c | 47 ++ .../gcc.target/i386/avx10_2-bf16-1.c | 114 +++ .../gcc.target/i386/avx10_2-vcmppbf16-2.c | 16 + .../gcc.target/i386/avx10_2-vcomsbf16-1.c | 19 + .../gcc.target/i386/avx10_2-vcomsbf16-2.c | 58 ++ .../gcc.target/i386/avx10_2-vfpclasspbf16-2.c | 16 + .../gcc.target/i386/avx10_2-vgetexppbf16-2.c | 16 + .../gcc.target/i386/avx10_2-vgetmantpbf16-2.c | 16 + .../gcc.target/i386/avx10_2-vrcppbf16-2.c | 16 + .../i386/avx10_2-vreducenepbf16-2.c | 16 + .../i386/avx10_2-vrndscalenepbf16-2.c | 16 + .../gcc.target/i386/avx10_2-vrsqrtpbf16-2.c | 16 + .../gcc.target/i386/avx10_2-vsqrtnepbf16-2.c | 16 + gcc/testsuite/gcc.target/i386/sse-13.c | 19 + gcc/testsuite/gcc.target/i386/sse-14.c | 43 ++ gcc/testsuite/gcc.target/i386/sse-22.c | 43 ++ gcc/testsuite/gcc.target/i386/sse-23.c | 19 + 36 files changed, 2097 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vcmppbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vgetexppbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vrcppbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vreducenepbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vcmppbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vfpclasspbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vgetexppbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vgetmantpbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vrcppbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vreducenepbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vrndscalenepbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vrsqrtpbf16-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx10_2-vsqrtnepbf16-2.c diff --git a/gcc/config/i386/avx10_2-512bf16intrin.h b/gcc/config/i386/avx10_2-512bf16intrin.h index b409ea17adb..4e7f8eba146 100644 --- a/gcc/config/i386/avx10_2-512bf16intrin.h +++ 
b/gcc/config/i386/avx10_2-512bf16intrin.h @@ -356,6 +356,323 @@ _mm512_maskz_fnmsubne_pbh (__mmask32 __U, __m512bh __A, __builtin_ia32_fnmsubnepbf16512_maskz (__A, __B, __C, __U); } +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt_pbh (__m512bh __A) +{ + return (__m512bh) + __builtin_ia32_rsqrtpbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + (__mmask32) -1); + +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt_pbh (__m512bh __W, __mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_rsqrtpbf16512_mask (__A, __W, __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt_pbh (__mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_rsqrtpbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrtne_pbh (__m512bh __A) +{ + return (__m512bh) + __builtin_ia32_sqrtnepbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrtne_pbh (__m512bh __W, __mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_sqrtnepbf16512_mask (__A, __W, __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrtne_pbh (__mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_sqrtnepbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp_pbh (__m512bh __A) +{ + return (__m512bh) + __builtin_ia32_rcppbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp_pbh (__m512bh __W, __mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_rcppbf16512_mask (__A, __W, __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp_pbh (__mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_rcppbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_pbh (__m512bh __A) +{ + return (__m512bh) + __builtin_ia32_getexppbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_pbh (__m512bh __W, __mmask32 __U, __m512bh __A) +{ + return (__m512bh) __builtin_ia32_getexppbf16512_mask (__A, __W, __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_pbh (__mmask32 __U, __m512bh __A) +{ + return (__m512bh) + __builtin_ia32_getexppbf16512_mask (__A, + (__v32bf) _mm512_setzero_si512 (), + __U); +} + +/* Intrinsics vrndscalepbf16. 
*/ +#ifdef __OPTIMIZE__ +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscalene_pbh (__m512bh __A, int B) +{ + return (__m512bh) + __builtin_ia32_rndscalenepbf16512_mask (__A, B, + (__v32bf) _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscalene_pbh (__m512bh __W, __mmask32 __U, __m512bh __A, int B) +{ + return (__m512bh) + __builtin_ia32_rndscalenepbf16512_mask (__A, B, __W, __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscalene_pbh (__mmask32 __U, __m512bh __A, int B) +{ + return (__m512bh) + __builtin_ia32_rndscalenepbf16512_mask (__A, B, + (__v32bf) _mm512_setzero_si512 (), + __U); +} + +#else +#define _mm512_roundscalene_pbh(A, B) \ + (__builtin_ia32_rndscalenepbf16512_mask ((A), (B), \ + (__v32bf) _mm512_setzero_si512 (), \ + (__mmask32) -1)) + +#define _mm512_mask_roundscalene_pbh(A, B, C, D) \ + (__builtin_ia32_rndscalenepbf16512_mask ((C), (D), (A), (B))) + +#define _mm512_maskz_roundscalene_pbh(A, B, C) \ + (__builtin_ia32_rndscalenepbf16512_mask ((B), (C), \ + (__v32bf) _mm512_setzero_si512 (), \ + (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreducepbf16. */ +#ifdef __OPTIMIZE__ +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reducene_pbh (__m512bh __A, int B) +{ + return (__m512bh) + __builtin_ia32_reducenepbf16512_mask (__A, B, + (__v32bf) _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reducene_pbh (__m512bh __W, __mmask32 __U, + __m512bh __A, int B) +{ + return (__m512bh) + __builtin_ia32_reducenepbf16512_mask (__A, B, __W, __U); +} + +extern __inline__ __m512bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reducene_pbh (__mmask32 __U, __m512bh __A, int B) +{ + return (__m512bh) + __builtin_ia32_reducenepbf16512_mask (__A, B, + (__v32bf) _mm512_setzero_si512 (), + __U); +} + +#else +#define _mm512_reducene_pbh(A, B) \ + (__builtin_ia32_reducenepbf16512_mask ((A), (B), \ + (__v32bf) _mm512_setzero_si512 (), \ + (__mmask32) -1)) + +#define _mm512_mask_reducene_pbh(A, B, C, D) \ + (__builtin_ia32_reducenepbf16512_mask ((C), (D), (A), (B))) + +#define _mm512_maskz_reducene_pbh(A, B, C) \ + (__builtin_ia32_reducenepbf16512_mask ((B), (C), \ + (__v32bf) _mm512_setzero_si512 (), \ + (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetmantpbf16. 
*/
+#ifdef __OPTIMIZE__
+extern __inline__ __m512bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_pbh (__m512bh __A, _MM_MANTISSA_NORM_ENUM __B,
+		    _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512bh)
+    __builtin_ia32_getmantpbf16512_mask (__A, (int) (__C << 2) | __B,
+					 (__v32bf) _mm512_setzero_si512 (),
+					 (__mmask32) -1);
+}
+
+extern __inline__ __m512bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_pbh (__m512bh __W, __mmask32 __U, __m512bh __A,
+			 _MM_MANTISSA_NORM_ENUM __B,
+			 _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512bh)
+    __builtin_ia32_getmantpbf16512_mask (__A, (int) (__C << 2) | __B,
+					 __W, __U);
+}
+
+extern __inline__ __m512bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_pbh (__mmask32 __U, __m512bh __A,
+			  _MM_MANTISSA_NORM_ENUM __B,
+			  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512bh)
+    __builtin_ia32_getmantpbf16512_mask (__A, (int) (__C << 2) | __B,
+					 (__v32bf) _mm512_setzero_si512 (),
+					 __U);
+}
+
+#else
+#define _mm512_getmant_pbh(A, B, C) \
+  (__builtin_ia32_getmantpbf16512_mask ((A), (int)(((C)<<2) | (B)), \
+					(__v32bf) _mm512_setzero_si512 (), \
+					(__mmask32) -1))
+
+#define _mm512_mask_getmant_pbh(A, B, C, D, E) \
+  (__builtin_ia32_getmantpbf16512_mask ((C), (int)(((E)<<2) | (D)), (A), (B)))
+
+#define _mm512_maskz_getmant_pbh(A, B, C, D) \
+  (__builtin_ia32_getmantpbf16512_mask ((B), (int)(((D)<<2) | (C)), \
+					(__v32bf) _mm512_setzero_si512 (), \
+					(A)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclasspbf16.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_pbh_mask (__mmask32 __U, __m512bh __A,
+			      const int __imm)
+{
+  return (__mmask32)
+    __builtin_ia32_fpclasspbf16512_mask (__A, __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_pbh_mask (__m512bh __A, const int __imm)
+{
+  return (__mmask32)
+    __builtin_ia32_fpclasspbf16512_mask (__A, __imm,
+					 (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_fpclass_pbh_mask(U, X, C) \
+  ((__mmask32) __builtin_ia32_fpclasspbf16512_mask ( \
+     (__v32bf) (__m512bh) (X), (int) (C), (__mmask32) (U)))
+
+#define _mm512_fpclass_pbh_mask(X, C) \
+  ((__mmask32) __builtin_ia32_fpclasspbf16512_mask ( \
+     (__v32bf) (__m512bh) (X), (int) (C), (__mmask32) (-1)))
+#endif /* __OPTIMIZE__ */
+
+
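(Reviewer note, not part of the patch: in both the inline and macro forms of getmant above, the two selector enums are packed into a single immediate as (sign << 2) | interval, the same encoding the existing vgetmantps/vgetmantph intrinsics use. A compile-only sketch with -O2 -mavx10.2-512, using the enum values already defined in avx512fintrin.h:)

#include <immintrin.h>

__m512bh
mant_example (__m512bh x)
{
  /* _MM_MANT_NORM_p75_1p5 == 3 and _MM_MANT_SIGN_src == 0, so the
     builtin receives (0 << 2) | 3 == 3 as its immediate.  */
  return _mm512_getmant_pbh (x, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_src);
}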
+/* Intrinsics vcmppbf16.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_pbh_mask (__mmask32 __U, __m512bh __A, __m512bh __B,
+			  const int __imm)
+{
+  return (__mmask32)
+    __builtin_ia32_cmppbf16512_mask (__A, __B, __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_pbh_mask (__m512bh __A, __m512bh __B, const int __imm)
+{
+  return (__mmask32)
+    __builtin_ia32_cmppbf16512_mask (__A, __B, __imm,
+				     (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_cmp_pbh_mask(A, B, C, D) \
+  ((__mmask32) __builtin_ia32_cmppbf16512_mask ((B), (C), (D), (A)))
+
+#define _mm512_cmp_pbh_mask(A, B, C) \
+  ((__mmask32) __builtin_ia32_cmppbf16512_mask ((A), (B), (C), \
+						(__mmask32) (-1)))
+
+#endif /* __OPTIMIZE__ */
+
 #ifdef __DISABLE_AVX10_2_512__
 #undef __DISABLE_AVX10_2_512__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx10_2bf16intrin.h b/gcc/config/i386/avx10_2bf16intrin.h
index e16f1b66481..f36fb8ee8b3 100644
--- a/gcc/config/i386/avx10_2bf16intrin.h
+++ b/gcc/config/i386/avx10_2bf16intrin.h
@@ -677,6 +677,656 @@ _mm_maskz_fnmsubne_pbh (__mmask8 __U, __m128bh __A,
   __builtin_ia32_fnmsubnepbf16128_maskz (__A, __B, __C, __U);
 }
 
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_pbh (__m256bh __A)
+{
+  return (__m256bh)
+    __builtin_ia32_rsqrtpbf16256_mask (__A,
+				       (__v16bf) _mm256_setzero_si256 (),
+				       (__mmask16) -1);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt_pbh (__m256bh __W, __mmask16 __U, __m256bh __A)
+{
+  return (__m256bh)
+    __builtin_ia32_rsqrtpbf16256_mask (__A, __W, __U);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt_pbh (__mmask16 __U, __m256bh __A)
+{
+  return (__m256bh)
+    __builtin_ia32_rsqrtpbf16256_mask (__A,
+				       (__v16bf) _mm256_setzero_si256 (),
+				       __U);
+}
+
+extern __inline__ __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_pbh (__m128bh __A)
+{
+  return (__m128bh)
+    __builtin_ia32_rsqrtpbf16128_mask (__A,
+				       (__v8bf) _mm_setzero_si128 (),
+				       (__mmask8) -1);
+}
+
+extern __inline__ __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_pbh (__m128bh __W, __mmask8 __U, __m128bh __A)
+{
+  return (__m128bh)
+    __builtin_ia32_rsqrtpbf16128_mask (__A, __W, __U);
+}
+
+extern __inline__ __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_pbh (__mmask8 __U, __m128bh __A)
+{
+  return (__m128bh)
+    __builtin_ia32_rsqrtpbf16128_mask (__A,
+				       (__v8bf) _mm_setzero_si128 (),
+				       __U);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrtne_pbh (__m256bh __A)
+{
+  return (__m256bh)
+    __builtin_ia32_sqrtnepbf16256_mask (__A,
+					(__v16bf) _mm256_setzero_si256 (),
+					(__mmask16) -1);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrtne_pbh (__m256bh __W, __mmask16 __U, __m256bh __A)
+{
+  return (__m256bh)
+    __builtin_ia32_sqrtnepbf16256_mask (__A, __W, __U);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrtne_pbh (__mmask16 __U, __m256bh __A)
+{
+  return (__m256bh)
+    __builtin_ia32_sqrtnepbf16256_mask (__A,
+					(__v16bf) _mm256_setzero_si256 (),
+					__U);
+}
+
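(Not part of the patch: a quick illustration of the merge- vs. zero-masking convention the wrappers above follow, compilable with -O2 -mavx10.2:)

#include <immintrin.h>

void
masked_rsqrt (__m128bh *out1, __m128bh *out2,
	      __m128bh src, __m128bh x, __mmask8 m)
{
  /* Merge-masking: lanes selected by m get 1/sqrt(x); the rest
     keep the corresponding lane of src.  */
  *out1 = _mm_mask_rsqrt_pbh (src, m, x);
  /* Zero-masking: unselected lanes become 0.0 instead.  */
  *out2 = _mm_maskz_rsqrt_pbh (m, x);
}

+extern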
__inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrtne_pbh (__m128bh __A) +{ + return (__m128bh) + __builtin_ia32_sqrtnepbf16128_mask (__A, + (__v8bf) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrtne_pbh (__m128bh __W, __mmask8 __U, __m128bh __A) +{ + return (__m128bh) + __builtin_ia32_sqrtnepbf16128_mask (__A, __W, __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrtne_pbh (__mmask8 __U, __m128bh __A) +{ + return (__m128bh) + __builtin_ia32_sqrtnepbf16128_mask (__A, + (__v8bf) _mm_setzero_si128 (), + __U); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_pbh (__m256bh __A) +{ + return (__m256bh) + __builtin_ia32_rcppbf16256_mask (__A, + (__v16bf) _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp_pbh (__m256bh __W, __mmask16 __U, __m256bh __A) +{ + return (__m256bh) + __builtin_ia32_rcppbf16256_mask (__A, __W, __U); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp_pbh (__mmask16 __U, __m256bh __A) +{ + return (__m256bh) + __builtin_ia32_rcppbf16256_mask (__A, + (__v16bf) _mm256_setzero_si256 (), + __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_pbh (__m128bh __A) +{ + return (__m128bh) + __builtin_ia32_rcppbf16128_mask (__A, + (__v8bf) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_pbh (__m128bh __W, __mmask8 __U, __m128bh __A) +{ + return (__m128bh) + __builtin_ia32_rcppbf16128_mask (__A, __W, __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_pbh (__mmask8 __U, __m128bh __A) +{ + return (__m128bh) + __builtin_ia32_rcppbf16128_mask (__A, + (__v8bf) _mm_setzero_si128 (), + __U); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_pbh (__m256bh __A) +{ + return (__m256bh) + __builtin_ia32_getexppbf16256_mask (__A, + (__v16bf) _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_pbh (__m256bh __W, __mmask16 __U, __m256bh __A) +{ + return (__m256bh) + __builtin_ia32_getexppbf16256_mask (__A, __W, __U); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_pbh (__mmask16 __U, __m256bh __A) +{ + return (__m256bh) + __builtin_ia32_getexppbf16256_mask (__A, + (__v16bf) _mm256_setzero_si256 (), + __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_pbh (__m128bh __A) +{ + return (__m128bh) + __builtin_ia32_getexppbf16128_mask (__A, + (__v8bf) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_pbh (__m128bh __W, __mmask8 __U, __m128bh __A) +{ + return (__m128bh) + __builtin_ia32_getexppbf16128_mask (__A, __W, __U); +} + +extern __inline__ __m128bh +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_pbh (__mmask8 __U, __m128bh __A) +{ + return (__m128bh) + __builtin_ia32_getexppbf16128_mask (__A, + (__v8bf) _mm_setzero_si128 (), + __U); +} + +/* Intrinsics vrndscalepbf16. */ +#ifdef __OPTIMIZE__ +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscalene_pbh (__m256bh __A, int B) +{ + return (__m256bh) + __builtin_ia32_rndscalenepbf16256_mask (__A, B, + (__v16bf) _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscalene_pbh (__m256bh __W, __mmask16 __U, + __m256bh __A, int B) +{ + return (__m256bh) + __builtin_ia32_rndscalenepbf16256_mask (__A, B, __W, __U); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscalene_pbh (__mmask16 __U, __m256bh __A, int B) +{ + return (__m256bh) + __builtin_ia32_rndscalenepbf16256_mask (__A, B, + (__v16bf) _mm256_setzero_si256 (), + __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscalene_pbh (__m128bh __A, int B) +{ + return (__m128bh) + __builtin_ia32_rndscalenepbf16128_mask (__A, B, + (__v8bf) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscalene_pbh (__m128bh __W, __mmask8 __U, + __m128bh __A, int B) +{ + return (__m128bh) + __builtin_ia32_rndscalenepbf16128_mask (__A, B, __W, __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscalene_pbh (__mmask8 __U, __m128bh __A, int B) +{ + return (__m128bh) + __builtin_ia32_rndscalenepbf16128_mask (__A, B, + (__v8bf) _mm_setzero_si128 (), + __U); +} + +#else +#define _mm256_roundscalene_pbh(A, B) \ + (__builtin_ia32_rndscalenepbf16256_mask ((A), (B), \ + (__v16bf) _mm256_setzero_si256 (), \ + (__mmask16) -1)) + +#define _mm256_mask_roundscalene_pbh(A, B, C, D) \ + (__builtin_ia32_rndscalenepbf16256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_roundscalene_pbh(A, B, C) \ + (__builtin_ia32_rndscalenepbf16256_mask ((B), (C), \ + (__v16bf) _mm256_setzero_si256 (), \ + (A))) + +#define _mm_roundscalene_pbh(A, B) \ + (__builtin_ia32_rndscalenepbf16128_mask ((A), (B), \ + (__v8bf) _mm_setzero_si128 (), \ + (__mmask8) -1)) + +#define _mm_mask_roundscalene_pbh(A, B, C, D) \ + (__builtin_ia32_rndscalenepbf16128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_roundscalene_pbh(A, B, C) \ + (__builtin_ia32_rndscalenepbf16128_mask ((B), (C), \ + (__v8bf) _mm_setzero_si128 (), \ + (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreducepbf16. 
*/ +#ifdef __OPTIMIZE__ +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reducene_pbh (__m256bh __A, int B) +{ + return (__m256bh) + __builtin_ia32_reducenepbf16256_mask (__A, B, + (__v16bf) _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reducene_pbh (__m256bh __W, __mmask16 __U, + __m256bh __A, int B) +{ + return (__m256bh) + __builtin_ia32_reducenepbf16256_mask (__A, B, __W, __U); +} + +extern __inline__ __m256bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reducene_pbh (__mmask16 __U, __m256bh __A, int B) +{ + return (__m256bh) + __builtin_ia32_reducenepbf16256_mask (__A, B, + (__v16bf) _mm256_setzero_si256 (), + __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reducene_pbh (__m128bh __A, int B) +{ + return (__m128bh) + __builtin_ia32_reducenepbf16128_mask (__A, B, + (__v8bf) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reducene_pbh (__m128bh __W, __mmask8 __U, + __m128bh __A, int B) +{ + return (__m128bh) + __builtin_ia32_reducenepbf16128_mask (__A, B, __W, __U); +} + +extern __inline__ __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reducene_pbh (__mmask8 __U, __m128bh __A, int B) +{ + return (__m128bh) + __builtin_ia32_reducenepbf16128_mask (__A, B, + (__v8bf) _mm_setzero_si128 (), + __U); +} + +#else +#define _mm256_reducene_pbh(A, B) \ + (__builtin_ia32_reducenepbf16256_mask ((A), (B), \ + (__v16bf) _mm256_setzero_si256 (), \ + (__mmask16) -1)) + +#define _mm256_mask_reducene_pbh(A, B, C, D) \ + (__builtin_ia32_reducenepbf16256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_reducene_pbh(A, B, C) \ + (__builtin_ia32_reducenepbf16256_mask ((B), (C), \ + (__v16bf) _mm256_setzero_si256 (), \ + (A))) + +#define _mm_reducene_pbh(A, B) \ + (__builtin_ia32_reducenepbf16128_mask ((A), (B), \ + (__v8bf) _mm_setzero_si128 (), \ + (__mmask8) -1)) + +#define _mm_mask_reducene_pbh(A, B, C, D) \ + (__builtin_ia32_reducenepbf16128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_reducene_pbh(A, B, C) \ + (__builtin_ia32_reducenepbf16128_mask ((B), (C), \ + (__v8bf) _mm_setzero_si128 (), \ + (A))) + +#endif /* __OPTIMIZE__ */ + + +/* Intrinsics vgetmantpbf16. 
*/
+#ifdef __OPTIMIZE__
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_pbh (__m256bh __A, _MM_MANTISSA_NORM_ENUM __B,
+		    _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256bh)
+    __builtin_ia32_getmantpbf16256_mask (__A, (int) (__C << 2) | __B,
+					 (__v16bf) _mm256_setzero_si256 (),
+					 (__mmask16) -1);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_pbh (__m256bh __W, __mmask16 __U, __m256bh __A,
+			 _MM_MANTISSA_NORM_ENUM __B,
+			 _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256bh)
+    __builtin_ia32_getmantpbf16256_mask (__A, (int) (__C << 2) | __B,
+					 __W, __U);
+}
+
+extern __inline__ __m256bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_pbh (__mmask16 __U, __m256bh __A,
+			  _MM_MANTISSA_NORM_ENUM __B,
+			  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256bh)
+    __builtin_ia32_getmantpbf16256_mask (__A, (int) (__C << 2) | __B,
+					 (__v16bf) _mm256_setzero_si256 (),
+					 __U);
+}
+
+extern __inline__ __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_pbh (__m128bh __A, _MM_MANTISSA_NORM_ENUM __B,
+		 _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128bh)
+    __builtin_ia32_getmantpbf16128_mask (__A, (int) (__C << 2) | __B,
+					 (__v8bf) _mm_setzero_si128 (),
+					 (__mmask8) -1);
+}
+
+extern __inline__ __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_pbh (__m128bh __W, __mmask8 __U, __m128bh __A,
+		      _MM_MANTISSA_NORM_ENUM __B,
+		      _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128bh)
+    __builtin_ia32_getmantpbf16128_mask (__A, (int) (__C << 2) | __B,
+					 __W, __U);
+}
+
+extern __inline__ __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_pbh (__mmask8 __U, __m128bh __A,
+		       _MM_MANTISSA_NORM_ENUM __B,
+		       _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128bh)
+    __builtin_ia32_getmantpbf16128_mask (__A, (int) (__C << 2) | __B,
+					 (__v8bf) _mm_setzero_si128 (),
+					 __U);
+}
+
+#else
+#define _mm256_getmant_pbh(A, B, C) \
+  (__builtin_ia32_getmantpbf16256_mask ((A), (int)(((C)<<2) | (B)), \
+					(__v16bf) _mm256_setzero_si256 (), \
+					(__mmask16) (-1)))
+
+#define _mm256_mask_getmant_pbh(A, B, C, D, E) \
+  (__builtin_ia32_getmantpbf16256_mask ((C), (int)(((E)<<2) | (D)), (A), (B)))
+
+#define _mm256_maskz_getmant_pbh(A, B, C, D) \
+  (__builtin_ia32_getmantpbf16256_mask ((B), (int)(((D)<<2) | (C)), \
+					(__v16bf) _mm256_setzero_si256 (), \
+					(A)))
+
+#define _mm_getmant_pbh(A, B, C) \
+  (__builtin_ia32_getmantpbf16128_mask ((A), (int)(((C)<<2) | (B)), \
+					(__v8bf) _mm_setzero_si128 (), \
+					(__mmask8) (-1)))
+
+#define _mm_mask_getmant_pbh(A, B, C, D, E) \
+  (__builtin_ia32_getmantpbf16128_mask ((C), (int)(((E)<<2) | (D)), (A), (B)))
+
+#define _mm_maskz_getmant_pbh(A, B, C, D) \
+  (__builtin_ia32_getmantpbf16128_mask ((B), (int)(((D)<<2) | (C)), \
+					(__v8bf) _mm_setzero_si128 (), (A)))
+
+#endif /* __OPTIMIZE__ */
+
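(Reviewer note, not part of the patch: the #ifdef __OPTIMIZE__ split above exists because these builtins require literal immediates; at -O0 the always_inline wrappers do not constant-fold an int parameter into the builtin call, so the macro forms substitute the constant textually instead. Usage is identical either way; a sketch with -mavx10.2:)

#include <immintrin.h>

__m256bh
mantissas (__m256bh x)
{
  /* The selectors must be compile-time constants in both forms.
     _MM_MANT_NORM_1_2 normalizes |mantissa| into [1, 2) and
     _MM_MANT_SIGN_src keeps the sign of the source lane.  */
  return _mm256_getmant_pbh (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
}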
+/* Intrinsics vfpclasspbf16.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_pbh_mask (__mmask16 __U, __m256bh __A,
+			      const int __imm)
+{
+  return (__mmask16)
+    __builtin_ia32_fpclasspbf16256_mask (__A, __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_pbh_mask (__m256bh __A, const int __imm)
+{
+  return (__mmask16)
+    __builtin_ia32_fpclasspbf16256_mask (__A, __imm, (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_pbh_mask (__mmask8 __U, __m128bh __A, const int __imm)
+{
+  return (__mmask8)
+    __builtin_ia32_fpclasspbf16128_mask (__A, __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_pbh_mask (__m128bh __A, const int __imm)
+{
+  return (__mmask8)
+    __builtin_ia32_fpclasspbf16128_mask (__A, __imm, (__mmask8) -1);
+}
+
+#else
+#define _mm256_mask_fpclass_pbh_mask(U, A, B) \
+  ((__mmask16) __builtin_ia32_fpclasspbf16256_mask ((A), (B), (U)))
+
+#define _mm256_fpclass_pbh_mask(A, B) \
+  ((__mmask16) __builtin_ia32_fpclasspbf16256_mask ((A), (B), \
+						    (__mmask16) (-1)))
+
+#define _mm_mask_fpclass_pbh_mask(U, A, B) \
+  ((__mmask8) __builtin_ia32_fpclasspbf16128_mask ((A), (B), (U)))
+
+#define _mm_fpclass_pbh_mask(A, B) \
+  ((__mmask8) __builtin_ia32_fpclasspbf16128_mask ((A), (B), \
+						   (__mmask8) (-1)))
+
+#endif /* __OPTIMIZE__ */
+
+
+/* Intrinsics vcmppbf16.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_pbh_mask (__mmask16 __U, __m256bh __A,
+			  __m256bh __B, const int __imm)
+{
+  return (__mmask16)
+    __builtin_ia32_cmppbf16256_mask (__A, __B, __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pbh_mask (__m256bh __A, __m256bh __B, const int __imm)
+{
+  return (__mmask16)
+    __builtin_ia32_cmppbf16256_mask (__A, __B, __imm, (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_pbh_mask (__mmask8 __U, __m128bh __A,
+		       __m128bh __B, const int __imm)
+{
+  return (__mmask8)
+    __builtin_ia32_cmppbf16128_mask (__A, __B, __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pbh_mask (__m128bh __A, __m128bh __B, const int __imm)
+{
+  return (__mmask8)
+    __builtin_ia32_cmppbf16128_mask (__A, __B, __imm, (__mmask8) -1);
+}
+
+#else
+#define _mm256_mask_cmp_pbh_mask(A, B, C, D) \
+  ((__mmask16) __builtin_ia32_cmppbf16256_mask ((B), (C), (D), (A)))
+
+#define _mm256_cmp_pbh_mask(A, B, C) \
+  ((__mmask16) __builtin_ia32_cmppbf16256_mask ((A), (B), (C), \
+						(__mmask16) (-1)))
+
+#define _mm_mask_cmp_pbh_mask(A, B, C, D) \
+  ((__mmask8) __builtin_ia32_cmppbf16128_mask ((B), (C), (D), (A)))
+
+#define _mm_cmp_pbh_mask(A, B, C) \
+  ((__mmask8) __builtin_ia32_cmppbf16128_mask ((A), (B), (C), \
+						(__mmask8) (-1)))
+
+#endif /* __OPTIMIZE__ */
+
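(Not part of the patch: a sketch combining the class-test and compare masks above, compiled with -O2 -mavx10.2. It assumes the fpclass immediate keeps the vfpclassph/vfpclassps bit assignments — 0x01 QNaN, 0x08 +Inf, 0x10 -Inf, 0x80 SNaN, and so on — and that the compare predicate uses the usual _CMP_* encodings, which is what the patch's tests exercise:)

#include <immintrin.h>

__mmask16
lt_ignoring_specials (__m256bh a, __m256bh b)
{
  /* 0x99 == SNaN | -Inf | +Inf | QNaN.  */
  __mmask16 special = _mm256_fpclass_pbh_mask (a, 0x99);
  /* Ordered, signaling less-than.  */
  __mmask16 lt = _mm256_cmp_pbh_mask (a, b, _CMP_LT_OS);
  return lt & ~special;
}

+/* Intrinsics vcomsbf16.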
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_sbh (__m128bh __A, __m128bh __B) +{ + return __builtin_ia32_vcomsbf16eq (__A, __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_sbh (__m128bh __A, __m128bh __B) +{ + return __builtin_ia32_vcomsbf16lt (__A, __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_sbh (__m128bh __A, __m128bh __B) +{ + return __builtin_ia32_vcomsbf16le (__A, __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_sbh (__m128bh __A, __m128bh __B) +{ + return __builtin_ia32_vcomsbf16gt (__A, __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_sbh (__m128bh __A, __m128bh __B) +{ + return __builtin_ia32_vcomsbf16ge (__A, __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_sbh (__m128bh __A, __m128bh __B) +{ + return __builtin_ia32_vcomsbf16neq (__A, __B); +} + #ifdef __DISABLE_AVX10_2_256__ #undef __DISABLE_AVX10_2_256__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index f3838424fd4..e6f53589e70 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1483,3 +1483,13 @@ DEF_FUNCTION_TYPE (V8BF, V8BF, V8BF, UQI) DEF_FUNCTION_TYPE (V32BF, V32BF, V32BF, V32BF, USI) DEF_FUNCTION_TYPE (V16BF, V16BF, V16BF, V16BF, UHI) DEF_FUNCTION_TYPE (V8BF, V8BF, V8BF, V8BF, UQI) +DEF_FUNCTION_TYPE (V32BF, V32BF, INT, V32BF, USI) +DEF_FUNCTION_TYPE (V16BF, V16BF, INT, V16BF, UHI) +DEF_FUNCTION_TYPE (V8BF, V8BF, INT, V8BF, UQI) +DEF_FUNCTION_TYPE (QI, V8BF, INT, UQI) +DEF_FUNCTION_TYPE (HI, V16BF, INT, UHI) +DEF_FUNCTION_TYPE (SI, V32BF, INT, USI) +DEF_FUNCTION_TYPE (USI, V32BF, V32BF, INT, USI) +DEF_FUNCTION_TYPE (UHI, V16BF, V16BF, INT, UHI) +DEF_FUNCTION_TYPE (UQI, V8BF, V8BF, INT, UQI) +DEF_FUNCTION_TYPE (INT, V8BF, V8BF) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 3f3bc768348..25b8169c1ef 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -3237,6 +3237,39 @@ BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_fnmsubnepbf16_v16bf_mas BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_fnmsubnepbf16_v8bf_mask, "__builtin_ia32_fnmsubnepbf16128_mask", IX86_BUILTIN_FNMSUBNEPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_V8BF_UQI) BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_fnmsubnepbf16_v8bf_mask3, "__builtin_ia32_fnmsubnepbf16128_mask3", IX86_BUILTIN_FNMSUBNEPBF16128_MASK3, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_V8BF_UQI) BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_fnmsubnepbf16_v8bf_maskz, "__builtin_ia32_fnmsubnepbf16128_maskz", IX86_BUILTIN_FNMSUBNEPBF16128_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_rsqrtpbf16_v32bf_mask, "__builtin_ia32_rsqrtpbf16512_mask", IX86_BUILTIN_RSQRTPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_rsqrtpbf16_v16bf_mask, "__builtin_ia32_rsqrtpbf16256_mask", IX86_BUILTIN_RSQRTPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_rsqrtpbf16_v8bf_mask, "__builtin_ia32_rsqrtpbf16128_mask", 
IX86_BUILTIN_RSQRTPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_sqrtnepbf16_v32bf_mask, "__builtin_ia32_sqrtnepbf16512_mask", IX86_BUILTIN_SQRTNEPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_sqrtnepbf16_v16bf_mask, "__builtin_ia32_sqrtnepbf16256_mask", IX86_BUILTIN_SQRTNEPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_sqrtnepbf16_v8bf_mask, "__builtin_ia32_sqrtnepbf16128_mask", IX86_BUILTIN_SQRTNEPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_rcppbf16_v32bf_mask, "__builtin_ia32_rcppbf16512_mask", IX86_BUILTIN_RCPPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_rcppbf16_v16bf_mask, "__builtin_ia32_rcppbf16256_mask", IX86_BUILTIN_RCPPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_rcppbf16_v8bf_mask, "__builtin_ia32_rcppbf16128_mask", IX86_BUILTIN_RCPPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_getexppbf16_v32bf_mask, "__builtin_ia32_getexppbf16512_mask", IX86_BUILTIN_GETEXPPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_getexppbf16_v16bf_mask, "__builtin_ia32_getexppbf16256_mask", IX86_BUILTIN_GETEXPPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_getexppbf16_v8bf_mask, "__builtin_ia32_getexppbf16128_mask", IX86_BUILTIN_GETEXPPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_rndscalenepbf16_v32bf_mask, "__builtin_ia32_rndscalenepbf16512_mask", IX86_BUILTIN_RNDSCALENEPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_INT_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_rndscalenepbf16_v16bf_mask, "__builtin_ia32_rndscalenepbf16256_mask", IX86_BUILTIN_RNDSCALENEPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_INT_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_rndscalenepbf16_v8bf_mask, "__builtin_ia32_rndscalenepbf16128_mask", IX86_BUILTIN_RNDSCALENEPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_INT_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_reducenepbf16_v32bf_mask, "__builtin_ia32_reducenepbf16512_mask", IX86_BUILTIN_REDUCENEPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_INT_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_reducenepbf16_v16bf_mask, "__builtin_ia32_reducenepbf16256_mask", IX86_BUILTIN_REDUCENEPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_INT_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_reducenepbf16_v8bf_mask, "__builtin_ia32_reducenepbf16128_mask", IX86_BUILTIN_REDUCENEPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_INT_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_getmantpbf16_v32bf_mask, "__builtin_ia32_getmantpbf16512_mask", IX86_BUILTIN_GETMANTPBF16512_MASK, UNKNOWN, (int) V32BF_FTYPE_V32BF_INT_V32BF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_getmantpbf16_v16bf_mask, "__builtin_ia32_getmantpbf16256_mask", IX86_BUILTIN_GETMANTPBF16256_MASK, UNKNOWN, (int) V16BF_FTYPE_V16BF_INT_V16BF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, 
CODE_FOR_avx10_2_getmantpbf16_v8bf_mask, "__builtin_ia32_getmantpbf16128_mask", IX86_BUILTIN_GETMANTPBF16128_MASK, UNKNOWN, (int) V8BF_FTYPE_V8BF_INT_V8BF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_fpclasspbf16_v32bf_mask, "__builtin_ia32_fpclasspbf16512_mask", IX86_BUILTIN_FPCLASSPBF16512_MASK, UNKNOWN, (int) SI_FTYPE_V32BF_INT_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_fpclasspbf16_v16bf_mask, "__builtin_ia32_fpclasspbf16256_mask", IX86_BUILTIN_FPCLASSPBF16256_MASK, UNKNOWN, (int) HI_FTYPE_V16BF_INT_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_fpclasspbf16_v8bf_mask, "__builtin_ia32_fpclasspbf16128_mask", IX86_BUILTIN_FPCLASSPBF16128_MASK, UNKNOWN, (int) QI_FTYPE_V8BF_INT_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_cmppbf16_v32bf_mask, "__builtin_ia32_cmppbf16512_mask", IX86_BUILTIN_CMPPBF16512_MASK, UNKNOWN, (int) USI_FTYPE_V32BF_V32BF_INT_USI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_cmppbf16_v16bf_mask, "__builtin_ia32_cmppbf16256_mask", IX86_BUILTIN_CMPPBF16256_MASK, UNKNOWN, (int) UHI_FTYPE_V16BF_V16BF_INT_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_cmppbf16_v8bf_mask, "__builtin_ia32_cmppbf16128_mask", IX86_BUILTIN_CMPPBF16128_MASK, UNKNOWN, (int) UQI_FTYPE_V8BF_V8BF_INT_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_comsbf16_v8bf, "__builtin_ia32_vcomsbf16eq", IX86_BUILTIN_VCOMSBF16EQ, EQ, (int) INT_FTYPE_V8BF_V8BF) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_comsbf16_v8bf, "__builtin_ia32_vcomsbf16gt", IX86_BUILTIN_VCOMSBF16GT, GT, (int) INT_FTYPE_V8BF_V8BF) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_comsbf16_v8bf, "__builtin_ia32_vcomsbf16ge", IX86_BUILTIN_VCOMSBF16GE, GE, (int) INT_FTYPE_V8BF_V8BF) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_comsbf16_v8bf, "__builtin_ia32_vcomsbf16le", IX86_BUILTIN_VCOMSBF16LE, LE, (int) INT_FTYPE_V8BF_V8BF) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_comsbf16_v8bf, "__builtin_ia32_vcomsbf16lt", IX86_BUILTIN_VCOMSBF16LT, LT, (int) INT_FTYPE_V8BF_V8BF) +BDESC (0, OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_comsbf16_v8bf, "__builtin_ia32_vcomsbf16neq", IX86_BUILTIN_VCOMSBF16NE, NE, (int) INT_FTYPE_V8BF_V8BF) /* Builtins with rounding support. 
*/ BDESC_END (ARGS, ROUND_ARGS) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index dff9e09809e..7ea41924b98 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -11712,6 +11712,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case QI_FTYPE_V8HF_INT_UQI: case HI_FTYPE_V16HF_INT_UHI: case SI_FTYPE_V32HF_INT_USI: + case QI_FTYPE_V8BF_INT_UQI: + case HI_FTYPE_V16BF_INT_UHI: + case SI_FTYPE_V32BF_INT_USI: case V4SI_FTYPE_V4SI_V4SI_UHI: case V8SI_FTYPE_V8SI_V8SI_UHI: nargs = 3; @@ -11825,9 +11828,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, case USI_FTYPE_V32QI_V32QI_INT_USI: case UHI_FTYPE_V16QI_V16QI_INT_UHI: case USI_FTYPE_V32HI_V32HI_INT_USI: + case USI_FTYPE_V32BF_V32BF_INT_USI: case USI_FTYPE_V32HF_V32HF_INT_USI: case UHI_FTYPE_V16HI_V16HI_INT_UHI: + case UHI_FTYPE_V16BF_V16BF_INT_UHI: case UQI_FTYPE_V8HI_V8HI_INT_UQI: + case UQI_FTYPE_V8BF_V8BF_INT_UQI: nargs = 4; mask_pos = 1; nargs_constant = 1; @@ -11864,6 +11870,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V32HI_FTYPE_V32HI_INT_V32HI_USI: case V16HI_FTYPE_V16HI_INT_V16HI_UHI: case V8HI_FTYPE_V8HI_INT_V8HI_UQI: + case V32BF_FTYPE_V32BF_INT_V32BF_USI: + case V16BF_FTYPE_V16BF_INT_V16BF_UHI: + case V8BF_FTYPE_V8BF_INT_V8BF_UQI: case V4DI_FTYPE_V4DI_INT_V4DI_UQI: case V2DI_FTYPE_V2DI_INT_V2DI_UQI: case V8SI_FTYPE_V8SI_INT_V8SI_UQI: @@ -15662,6 +15671,13 @@ rdseed_step: case IX86_BUILTIN_RDPID: return ix86_expand_special_args_builtin (bdesc_args + i, exp, target); + case IX86_BUILTIN_VCOMSBF16EQ: + case IX86_BUILTIN_VCOMSBF16NE: + case IX86_BUILTIN_VCOMSBF16GT: + case IX86_BUILTIN_VCOMSBF16GE: + case IX86_BUILTIN_VCOMSBF16LT: + case IX86_BUILTIN_VCOMSBF16LE: + return ix86_expand_sse_comi (bdesc_args + i, exp, target); case IX86_BUILTIN_FABSQ: case IX86_BUILTIN_COPYSIGNQ: if (!TARGET_SSE) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 50274f01a01..d7d99c6359f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -230,6 +230,11 @@ UNSPEC_VCVTNEPH2HF8S UNSPEC_VCVTHF82PH UNSPEC_VSCALEFPBF16 + UNSPEC_VRNDSCALENEPBF16 + UNSPEC_VREDUCENEPBF16 + UNSPEC_VGETMANTPBF16 + UNSPEC_VFPCLASSPBF16 + UNSPEC_VCOMSBF16 ]) (define_c_enum "unspecv" [ @@ -835,6 +840,7 @@ (define_mode_attr vecmemsuffix [(V32HF "{z}") (V16HF "{y}") (V8HF "{x}") + (V32BF "{z}") (V16BF "{y}") (V8BF "{x}") (V16SF "{z}") (V8SF "{y}") (V4SF "{x}") (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")]) @@ -32105,3 +32111,89 @@ [(set_attr "prefix" "evex") (set_attr "type" "ssemuladd") (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx10_2_rsqrtpbf16_<mode><mask_name>" + [(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v") + (unspec:VBF_AVX10_2 + [(match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "vm")] + UNSPEC_RSQRT))] + "TARGET_AVX10_2_256" + "vrsqrtpbf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "prefix" "evex")]) + +(define_insn "avx10_2_sqrtnepbf16_<mode><mask_name>" + [(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v") + (sqrt:VBF_AVX10_2 + (match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX10_2_256" + "vsqrtnepbf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "prefix" "evex")]) + +(define_insn "avx10_2_rcppbf16_<mode><mask_name>" + [(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v") + (unspec:VBF_AVX10_2 + [(match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP))] + "TARGET_AVX10_2_256" + "vrcppbf16\t{%1, 
%0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "prefix" "evex")]) + +(define_insn "avx10_2_getexppbf16_<mode><mask_name>" + [(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v") + (unspec:VBF_AVX10_2 + [(match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "vm")] + UNSPEC_GETEXP))] + "TARGET_AVX10_2_256" + "vgetexppbf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "prefix" "evex")]) + +(define_int_iterator BF16IMMOP + [UNSPEC_VRNDSCALENEPBF16 + UNSPEC_VREDUCENEPBF16 + UNSPEC_VGETMANTPBF16]) + +(define_int_attr bf16immop + [(UNSPEC_VRNDSCALENEPBF16 "rndscalene") + (UNSPEC_VREDUCENEPBF16 "reducene") + (UNSPEC_VGETMANTPBF16 "getmant")]) + +(define_insn "avx10_2_<bf16immop>pbf16_<mode><mask_name>" + [(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v") + (unspec:VBF_AVX10_2 + [(match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "vm") + (match_operand:SI 2 "const_0_to_255_operand")] + BF16IMMOP))] + "TARGET_AVX10_2_256" + "v<bf16immop>pbf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" + [(set_attr "prefix" "evex")]) + +(define_insn "avx10_2_fpclasspbf16_<mode><mask_scalar_merge_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "vm") + (match_operand 2 "const_0_to_255_operand")] + UNSPEC_VFPCLASSPBF16))] + "TARGET_AVX10_2_256" + "vfpclasspbf16<vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" + [(set_attr "prefix" "evex")]) + +(define_insn "avx10_2_cmppbf16_<mode><mask_scalar_merge_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VBF_AVX10_2 1 "register_operand" "v") + (match_operand:VBF_AVX10_2 2 "nonimmediate_operand" "vm") + (match_operand 3 "const_0_to_31_operand" "n")] + UNSPEC_PCMP))] + "TARGET_AVX10_2_256" + "vcmppbf16\t{%3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %3}" + [(set_attr "prefix" "evex")]) + +(define_insn "avx10_2_comsbf16_v8bf" + [(set (reg:CCFP FLAGS_REG) + (unspec:CCFP + [(match_operand:V8BF 0 "register_operand" "v") + (match_operand:V8BF 1 "nonimmediate_operand" "vm")] + UNSPEC_VCOMSBF16))] + "TARGET_AVX10_2_256" + "vcomsbf16\t{%1, %0|%0, %1}" + [(set_attr "prefix" "evex")]) diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c index 4a47e313096..df4cfdfff8d 100644 --- a/gcc/testsuite/gcc.target/i386/avx-1.c +++ b/gcc/testsuite/gcc.target/i386/avx-1.c @@ -1016,6 +1016,25 @@ /* avx10_2-512convertintrin.h */ #define __builtin_ia32_vcvt2ps2phx512_mask_round(A, B, C, D, E) __builtin_ia32_vcvt2ps2phx512_mask_round(A, B, C, D, 8) +/* avx10_2-512bf16intrin.h */ +#define __builtin_ia32_rndscalenepbf16512_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16512_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16512_mask(A, B, C, D) __builtin_ia32_reducenepbf16512_mask(A, 123, C, D) +#define __builtin_ia32_getmantpbf16512_mask(A, B, C, D) __builtin_ia32_getmantpbf16512_mask(A, 1, C, D) +#define __builtin_ia32_fpclasspbf16512_mask(A, B, C) __builtin_ia32_fpclasspbf16512_mask(A, 1, C) +#define __builtin_ia32_cmppbf16512_mask(A, B, C, D) __builtin_ia32_cmppbf16512_mask(A, B, 1, D) + +/* avx10_2bf16intrin.h */ +#define __builtin_ia32_rndscalenepbf16256_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16256_mask(A, 123, C, D) +#define __builtin_ia32_rndscalenepbf16128_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16128_mask(A, 123, C, D) +#define 
__builtin_ia32_reducenepbf16256_mask(A, B, C, D) __builtin_ia32_reducenepbf16256_mask(A, 123, C, D)
+#define __builtin_ia32_reducenepbf16128_mask(A, B, C, D) __builtin_ia32_reducenepbf16128_mask(A, 123, C, D)
+#define __builtin_ia32_getmantpbf16256_mask(A, B, C, D) __builtin_ia32_getmantpbf16256_mask(A, 1, C, D)
+#define __builtin_ia32_getmantpbf16128_mask(A, B, C, D) __builtin_ia32_getmantpbf16128_mask(A, 1, C, D)
+#define __builtin_ia32_fpclasspbf16256_mask(A, B, C) __builtin_ia32_fpclasspbf16256_mask(A, 1, C)
+#define __builtin_ia32_fpclasspbf16128_mask(A, B, C) __builtin_ia32_fpclasspbf16128_mask(A, 1, C)
+#define __builtin_ia32_cmppbf16256_mask(A, B, C, D) __builtin_ia32_cmppbf16256_mask(A, B, 1, D)
+#define __builtin_ia32_cmppbf16128_mask(A, B, C, D) __builtin_ia32_cmppbf16128_mask(A, B, 1, D)
+
 #include <wmmintrin.h>
 #include <immintrin.h>
 #include <mm3dnow.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx10-check.h b/gcc/testsuite/gcc.target/i386/avx10-check.h
index 76c32d7acaa..87fa818f048 100644
--- a/gcc/testsuite/gcc.target/i386/avx10-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx10-check.h
@@ -5,7 +5,7 @@
 #ifndef DO_TEST
 #define DO_TEST do_test
-#if defined(AVX10_512BIT)
+#if defined(AVX10_512BIT) || defined(AVX10_SCALAR)
 static void test_512 (void);
 #else
 static void test_256 (void);
@@ -16,7 +16,7 @@ __attribute__ ((noinline))
 static void
 do_test (void)
 {
-#if defined(AVX10_512BIT)
+#if defined(AVX10_512BIT) || defined(AVX10_SCALAR)
   test_512 ();
 #else
   test_256 ();
diff --git a/gcc/testsuite/gcc.target/i386/avx10-helper.h b/gcc/testsuite/gcc.target/i386/avx10-helper.h
index 9ff1dd72e92..4d092e27447 100644
--- a/gcc/testsuite/gcc.target/i386/avx10-helper.h
+++ b/gcc/testsuite/gcc.target/i386/avx10-helper.h
@@ -53,6 +53,33 @@ scalef (float x, float y)
   return _mm_cvtss_f32 (out);
 }
 
+float NOINLINE
+getexp (float val)
+{
+  __m128 px = _mm_load_ss (&val);
+  __m128 mx = _mm_broadcastss_ps (px);
+  __m128 out = _mm_getexp_ps (mx);
+  return _mm_cvtss_f32 (out);
+}
+
+float NOINLINE
+rndscale (float val)
+{
+  __m128 px = _mm_load_ss (&val);
+  __m128 mx = _mm_broadcastss_ps (px);
+  __m128 out = _mm_roundscale_ps (mx, 0x10);
+  return _mm_cvtss_f32 (out);
+}
+
+float NOINLINE
+getmant (float val)
+{
+  __m128 px = _mm_load_ss (&val);
+  __m128 mx = _mm_broadcastss_ps (px);
+  __m128 out = _mm_getmant_ps (mx, 0, 0);
+  return _mm_cvtss_f32 (out);
+}
+
 #endif /* AVX10_HELPER_INCLUDED */
 
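(Not part of the patch: the helpers above compute reference values by widening bf16 to fp32, applying the fp32 intrinsic, and narrowing with the convert_fp32_to_bf16/convert_bf16_to_fp32 helpers already in avx10-helper.h. For context, a standalone sketch of that bf16<->fp32 mapping — truncation shown here; the real conversion helper may round to nearest even:)

#include <stdint.h>
#include <string.h>

/* bf16 is the high 16 bits of an IEEE binary32 value.  */
static unsigned short
fp32_to_bf16_trunc (float f)
{
  uint32_t u;
  memcpy (&u, &f, sizeof (u));	/* bit-copy, no aliasing issues */
  return (unsigned short) (u >> 16);
}

static float
bf16_to_fp32 (unsigned short h)
{
  uint32_t u = (uint32_t) h << 16;
  float f;
  memcpy (&f, &u, sizeof (f));
  return f;
}

 /* Intrinsic being tested.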
It has different deffinitions, diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf16-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf16-1.c index 78839fb1297..6d111a10b41 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-512-bf16-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-bf16-1.c @@ -37,9 +37,36 @@ /* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vfnmsub231nepbf16\[ \\t\]+%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { 
dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfpclasspbf16z\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n^k\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfpclasspbf16z\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcmppbf16\[ \\t\]+\\\$1\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%k\[0-9\](?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcmppbf16\[ \\t\]+\\\$2\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%k\[0-9\]\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ #include <immintrin.h> +#define IMM 123 + volatile __m512bh res, x1, x2; volatile __mmask32 m32; @@ -84,4 +111,35 @@ avx10_2_512_test (void) res = _mm512_mask_fnmsubne_pbh (res, m32, x1, x2); res = _mm512_mask3_fnmsubne_pbh (res, x1, x2, m32); res = _mm512_maskz_fnmsubne_pbh (m32,res, x1, x2); + + res = _mm512_rsqrt_pbh (x1); + res = _mm512_mask_rsqrt_pbh (res, m32, x1); + res = _mm512_maskz_rsqrt_pbh (m32, x1); + res = _mm512_sqrtne_pbh (x1); + res = _mm512_mask_sqrtne_pbh (res, m32, x1); + res = _mm512_maskz_sqrtne_pbh (m32, x1); + res = _mm512_rcp_pbh (x1); + res = _mm512_mask_rcp_pbh (res, m32, x1); + res = _mm512_maskz_rcp_pbh (m32, x1); + res = _mm512_getexp_pbh (x1); + res = _mm512_mask_getexp_pbh (res, m32, x1); + res = _mm512_maskz_getexp_pbh (m32, x1); + + res = _mm512_roundscalene_pbh (x1, IMM); + res = _mm512_mask_roundscalene_pbh (res, m32, x1, IMM); + res = _mm512_maskz_roundscalene_pbh (m32, x1, IMM); + res = _mm512_reducene_pbh (x1, IMM); + res = _mm512_mask_reducene_pbh (res, m32, x1, IMM); + res = _mm512_maskz_reducene_pbh (m32, x1, IMM); + res = _mm512_getmant_pbh (x1, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_src); + res = _mm512_mask_getmant_pbh (res, m32, x1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + res = _mm512_maskz_getmant_pbh (m32, x1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + + m32 = _mm512_fpclass_pbh_mask (x1, 13); + m32 = _mm512_mask_fpclass_pbh_mask (2, x1, 13); + + m32 = _mm512_cmp_pbh_mask (x1, x2, 1); + m32 = _mm512_mask_cmp_pbh_mask (m32, x1, x2, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vcmppbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vcmppbf16-2.c new file mode 100644 index 00000000000..a352890e9bc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vcmppbf16-2.c @@ -0,0 +1,36 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + __mmask32 res1, res2, exp = 0; + UNION_TYPE (AVX512F_LEN, bf16_uw) src1, src2; + MASK_TYPE mask = MASK_VALUE; + + for (i = 0; i < SIZE_RES; i++) + { + float x = 0.5; + float y = 0.25; + src2.a[i] = convert_fp32_to_bf16 (y); + src1.a[i] = convert_fp32_to_bf16 (x); + if (src1.a[i] == src2.a[i]) + exp |= 1 << i; + } + + res1 = INTRINSIC (_cmp_pbh_mask) (src1.x, src2.x, 0); + res2 = INTRINSIC (_mask_cmp_pbh_mask) (mask, src1.x, src2.x, 0); + + if (exp != res1 || exp != res2) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c new file mode 100644 index 00000000000..1b25a070eff --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/avx10_2-512-vfpclasspbf16-2.c @@ -0,0 +1,44 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + MASK_TYPE res1 = 0, res2 = 0; + __mmask16 exp = 0; + UNION_TYPE (AVX512F_LEN, bf16_uw) src1; + UNION_TYPE (AVX512F_LEN, ) src2; + MASK_TYPE mask = MASK_VALUE; + + for (i = 0; i < SIZE_RES / 2; i++) + { + src1.a[i] = 0; + src2.a[i] = (uint32_t) (src1.a[i]) << 16; + } + + for (i = SIZE_RES / 2; i < SIZE_RES; i++) + src1.a[i] = 0; + + src1.a[0] = 0x7FC0; + src2.a[0] = convert_bf16_to_fp32 (src1.a[0]); + + _mm_setcsr (0x9FC0); + exp = INTRINSIC (_fpclass_ps_mask) (src2.x, 0x01); + + _mm_setcsr (0x1f80); + res1 = INTRINSIC (_fpclass_pbh_mask) (src1.x, 0x01); + res2 = INTRINSIC (_mask_fpclass_pbh_mask) (mask, src1.x, 1); + + if (exp != res1 || exp != res2) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vgetexppbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vgetexppbf16-2.c new file mode 100644 index 00000000000..def6d93ccad --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vgetexppbf16-2.c @@ -0,0 +1,47 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + float f, s; + f = 28 * i + 1; + src1.a[i] = convert_fp32_to_bf16 (f); + s = convert_bf16_to_fp32 (src1.a[i]); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16 (getexp (s)); + } + + res1.x = INTRINSIC (_getexp_pbh) (src1.x); + res2.x = INTRINSIC (_mask_getexp_pbh) (res2.x, mask, src1.x); + res3.x = INTRINSIC (_maskz_getexp_pbh) (mask, src1.x); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c new file mode 100644 index 00000000000..898cf5ccf38 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vgetmantpbf16-2.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 5.0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + src1.a[i] = 0.5; + float x = convert_bf16_to_fp32 (src1.a[i]); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16 (getmant (x)); + } + + res1.x = 
INTRINSIC (_getmant_pbh) (src1.x, _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_src); + res2.x = INTRINSIC (_mask_getmant_pbh) (res2.x, mask, src1.x, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_src); + res3.x = INTRINSIC (_maskz_getmant_pbh) (mask, src1.x, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_src); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vrcppbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vrcppbf16-2.c new file mode 100644 index 00000000000..0bca27d504f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vrcppbf16-2.c @@ -0,0 +1,45 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + float s1 = 2.0; + src1.a[i] = convert_fp32_to_bf16 (s1); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16 (1.0 / s1); + } + + res1.x = INTRINSIC (_rcp_pbh) (src1.x); + res2.x = INTRINSIC (_mask_rcp_pbh) (res2.x, mask, src1.x); + res3.x = INTRINSIC (_maskz_rcp_pbh) (mask, src1.x); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vreducenepbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vreducenepbf16-2.c new file mode 100644 index 00000000000..c3e2b36864e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vreducenepbf16-2.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 5.0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + float s = (float) (SIZE_RES - 1) / (float) i; + src1.a[i] = convert_fp32_to_bf16 (s); + float x = convert_bf16_to_fp32 (src1.a[i]); + __m128 px = _mm_load_ss (&x); + __m128 mx = _mm_broadcastss_ps (px); + __m128 out = _mm_reduce_ps (mx, 0x10); + float res = _mm_cvtss_f32 (out); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16_ne (res); + } + + res1.x = INTRINSIC (_reducene_pbh) (src1.x, 0x10); + res2.x = INTRINSIC (_mask_reducene_pbh) (res2.x, mask, src1.x, 0x10); + res3.x = INTRINSIC (_maskz_reducene_pbh) (mask, src1.x, 0x10); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + 
if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c new file mode 100644 index 00000000000..5b0e6a89120 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vrndscalenepbf16-2.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 5.0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + float s = (float) (SIZE_RES - 1) / (float) i; + src1.a[i] = convert_fp32_to_bf16 (s); + float x = convert_bf16_to_fp32 (src1.a[i]); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16_ne (rndscale (x)); + } + + res1.x = INTRINSIC (_roundscalene_pbh) (src1.x, 0x10); + res2.x = INTRINSIC (_mask_roundscalene_pbh) (res2.x, mask, src1.x, 0x10); + res3.x = INTRINSIC (_maskz_roundscalene_pbh) (mask, src1.x, 0x10); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c new file mode 100644 index 00000000000..a879efce3f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vrsqrtpbf16-2.c @@ -0,0 +1,47 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#include <math.h> +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + float s1 = 2.0; + float rs = 1.0 / sqrtf (s1); + src1.a[i] = convert_fp32_to_bf16 (s1); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16 (rs); + } + + res1.x = INTRINSIC (_rsqrt_pbh) (src1.x); + res2.x = INTRINSIC (_mask_rsqrt_pbh) (res2.x, mask, src1.x); + res3.x = INTRINSIC (_maskz_rsqrt_pbh) (mask, src1.x); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vscalefpbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vscalefpbf16-2.c index 867f77ad3a7..78df474240d 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-512-vscalefpbf16-2.c +++ 
b/gcc/testsuite/gcc.target/i386/avx10_2-512-vscalefpbf16-2.c @@ -31,7 +31,7 @@ TEST (void) xx = convert_bf16_to_fp32 (src1.a[i]); yy = convert_bf16_to_fp32 (src2.a[i]); res = scalef (xx, yy); - res_ref[i] = res_ref2[i] = convert_fp32_to_bf16_ne(res); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16 (res); } res1.x = INTRINSIC (_scalef_pbh) (src1.x, src2.x); diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c new file mode 100644 index 00000000000..987c9b1abe9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-vsqrtnepbf16-2.c @@ -0,0 +1,47 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2-512" } */ +/* { dg-require-effective-target avx10_2_512 } */ + +#ifndef AVX10_2 +#define AVX10_2 +#define AVX10_2_512 +#define AVX10_512BIT +#endif +#include "avx10-helper.h" +#include <math.h> +#define SIZE_RES (AVX512F_LEN / 16) + +void +TEST (void) +{ + int i; + UNION_TYPE (AVX512F_LEN, bf16_uw) res1, res2, res3, src1; + MASK_TYPE mask = MASK_VALUE; + unsigned short res_ref[SIZE_RES], res_ref2[SIZE_RES]; + + for (i = 0; i < SIZE_RES; i++) + { + res1.a[i] = 0; + res2.a[i] = DEFAULT_VALUE; + res3.a[i] = DEFAULT_VALUE; + float s1 = i + 1.0; + float rs = sqrtf (s1); + src1.a[i] = convert_fp32_to_bf16_ne (s1); + res_ref[i] = res_ref2[i] = convert_fp32_to_bf16_ne (rs); + } + + res1.x = INTRINSIC (_sqrtne_pbh) (src1.x); + res2.x = INTRINSIC (_mask_sqrtne_pbh) (res2.x, mask, src1.x); + res3.x = INTRINSIC (_maskz_sqrtne_pbh) (mask, src1.x); + + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res1, res_ref)) + abort (); + + MASK_MERGE (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res2, res_ref2)) + abort (); + + MASK_ZERO (bf16_uw) (res_ref2, mask, SIZE_RES); + if (UNION_CHECK (AVX512F_LEN, bf16_uw) (res3, res_ref2)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-bf16-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-bf16-1.c index 831c8f849ef..56cec6df1d6 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-bf16-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-bf16-1.c @@ -74,9 +74,60 @@ /* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vfnmsub231nepbf16\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vfnmsub132nepbf16\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrtpbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times 
"vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetexppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrndscalenepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducenepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vgetmantpbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfpclasspbf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n^k\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfpclasspbf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfpclasspbf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n^k\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfpclasspbf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcmppbf16\[ \\t\]+\\\$1\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%k\[0-9\](?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcmppbf16\[ \\t\]+\\\$2\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%k\[0-9\]\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcmppbf16\[ \\t\]+\\\$1\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%k\[0-9\](?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcmppbf16\[ \\t\]+\\\$2\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%k\[0-9\]\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ #include <immintrin.h> +#define IMM 123 volatile __m256bh res, x1, x2; volatile __m128bh res1, x3, x4; volatile __mmask16 m16; @@ -169,4 +220,67 @@ avx10_2_test (void) res1 = _mm_mask_fnmsubne_pbh (res1, m8, x3, x4); res1 = _mm_mask3_fnmsubne_pbh (res1, x3, x4, m8); res1 = _mm_maskz_fnmsubne_pbh (m8,res1, x3, x4); + + res = _mm256_rsqrt_pbh (x1); + res = _mm256_mask_rsqrt_pbh (res, m16, x1); + res = _mm256_maskz_rsqrt_pbh (m16, x1); + res1 = _mm_rsqrt_pbh (x3); + res1 = _mm_mask_rsqrt_pbh (res1, m8, x3); + res1 = _mm_maskz_rsqrt_pbh (m8, x3); + + res = _mm256_sqrtne_pbh (x1); + res = _mm256_mask_sqrtne_pbh (res, m16, x1); + res = _mm256_maskz_sqrtne_pbh (m16, x1); + res1 = _mm_sqrtne_pbh (x3); + res1 = _mm_mask_sqrtne_pbh (res1, m8, x3); + res1 = _mm_maskz_sqrtne_pbh (m8, x3); + + res = _mm256_rcp_pbh (x1); + res = _mm256_mask_rcp_pbh (res, m16, x1); + res = _mm256_maskz_rcp_pbh (m16, x1); + res1 = _mm_rcp_pbh (x3); + res1 = _mm_mask_rcp_pbh (res1, m8, x3); + res1 = _mm_maskz_rcp_pbh (m8, x3); + + res = _mm256_getexp_pbh (x1); + res = 
_mm256_mask_getexp_pbh (res, m16, x1); + res = _mm256_maskz_getexp_pbh (m16, x1); + res1 = _mm_getexp_pbh (x3); + res1 = _mm_mask_getexp_pbh (res1, m8, x3); + res1 = _mm_maskz_getexp_pbh (m8, x3); + + res = _mm256_roundscalene_pbh (x1, IMM); + res = _mm256_mask_roundscalene_pbh (res, m16, x1, IMM); + res = _mm256_maskz_roundscalene_pbh (m16, x1, IMM); + res1 = _mm_roundscalene_pbh (x3, IMM); + res1 = _mm_mask_roundscalene_pbh (res1, m8, x3, IMM); + res1 = _mm_maskz_roundscalene_pbh (m8, x3, IMM); + + res = _mm256_reducene_pbh (x1, IMM); + res = _mm256_mask_reducene_pbh (res, m16, x1, IMM); + res = _mm256_maskz_reducene_pbh (m16, x1, IMM); + res1 = _mm_reducene_pbh (x3, IMM); + res1 = _mm_mask_reducene_pbh (res1, m8, x3, IMM); + res1 = _mm_maskz_reducene_pbh (m8, x3, IMM); + + res = _mm256_getmant_pbh (x1, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_src); + res = _mm256_mask_getmant_pbh (res, m16, x1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + res = _mm256_maskz_getmant_pbh (m16, x1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + res1 = _mm_getmant_pbh (x3, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_src); + res1 = _mm_mask_getmant_pbh (res1, m8, x3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + res1 = _mm_maskz_getmant_pbh (m8, x3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + + m16 = _mm256_fpclass_pbh_mask (x1, 13); + m16 = _mm256_mask_fpclass_pbh_mask (2, x1, 13); + m8 = _mm_fpclass_pbh_mask (x3, 13); + m8 = _mm_mask_fpclass_pbh_mask (2, x3, 13); + + m16 = _mm256_cmp_pbh_mask (x1, x2, 1); + m16 = _mm256_mask_cmp_pbh_mask (m16, x1, x2, 2); + m8 = _mm_cmp_pbh_mask (x3, x4, 1); + m8 = _mm_mask_cmp_pbh_mask (m8, x3, x4, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vcmppbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vcmppbf16-2.c new file mode 100644 index 00000000000..fa8be3e8e8b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vcmppbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vcmppbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vcmppbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-1.c new file mode 100644 index 00000000000..e603aad27bd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-1.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx10.2 -O2" } */ +/* { dg-final { scan-assembler-times "vcomsbf16\[ \\t\]+\[^{}\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 6 } } */ +/* { dg-final { scan-assembler-times "jp" 2 } } */ +#include <immintrin.h> + +volatile __m128bh x1, x2; +volatile int res; + +void extern +avx10_2_vcom_test (void) +{ + res = _mm_comeq_sbh (x1, x2); + res = _mm_comlt_sbh (x1, x2); + res = _mm_comle_sbh (x1, x2); + res = _mm_comgt_sbh (x1, x2); + res = _mm_comge_sbh (x1, x2); + res = _mm_comneq_sbh (x1, x2); +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-2.c new file mode 100644 index 00000000000..c4f0c822678 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vcomsbf16-2.c @@ -0,0 +1,58 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX10_SCALAR +#include "avx10-helper.h" +#define SIZE_RES (128 / 16) + +#define 
CMP(PRED, IMM) \ + exp = _mm_comi_round_ss (__A, __B, IMM, _MM_FROUND_NO_EXC); \ + res1 = _mm_com##PRED##_sbh (src1.x, src2.x); \ + if (exp != res1) \ + abort (); + +void +TEST (void) +{ + int i; + int res1, exp; + UNION_TYPE (128, bf16_uw) src1, src2; + + struct + { + float x1; + float x2; + } + inputs[] = + { + { 4.3, 2.18 }, + { -4.3, 3.18 }, + { __builtin_nanf (""), -5.8 }, + { -4.8, __builtin_nansf ("") }, + { 3.8, __builtin_nansf ("") }, + { 4.2, 4.2 }, + { __builtin_nanf (""), __builtin_nansf ("") }, + }; + + for (i = 0; i < sizeof (inputs) / sizeof (inputs[0]); i++) + { + float x = inputs[i].x1; + float y = inputs[i].x2; + + __m128 __A = _mm_load_ss (&x); + __m128 __B = _mm_load_ss (&y); + for (int n = 0; n < SIZE_RES; n++) + { + src2.a[n] = convert_fp32_to_bf16(y); + src1.a[n] = convert_fp32_to_bf16(x); + } + CMP (eq, _CMP_EQ_OQ); + CMP (ge, _CMP_GE_OS); + CMP (gt, _CMP_GT_OS); + CMP (lt, _CMP_LT_OS); + CMP (le, _CMP_LE_OS); + CMP (neq, _CMP_NEQ_UQ); + } +} diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vfpclasspbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vfpclasspbf16-2.c new file mode 100644 index 00000000000..2aa57496c1f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vfpclasspbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vfpclasspbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vfpclasspbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vgetexppbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vgetexppbf16-2.c new file mode 100644 index 00000000000..804a32a4525 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vgetexppbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vgetexppbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vgetexppbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vgetmantpbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vgetmantpbf16-2.c new file mode 100644 index 00000000000..53e0a5e0588 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vgetmantpbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vgetmantpbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vgetmantpbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vrcppbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vrcppbf16-2.c new file mode 100644 index 00000000000..332010aba57 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vrcppbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vrcppbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 
+#include "avx10_2-512-vrcppbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vreducenepbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vreducenepbf16-2.c new file mode 100644 index 00000000000..809baf7c284 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vreducenepbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vreducenepbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vreducenepbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vrndscalenepbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vrndscalenepbf16-2.c new file mode 100644 index 00000000000..ee6e71da3ba --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vrndscalenepbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vrndscalenepbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vrndscalenepbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vrsqrtpbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vrsqrtpbf16-2.c new file mode 100644 index 00000000000..80c8ba38815 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vrsqrtpbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vrsqrtpbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vrsqrtpbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vsqrtnepbf16-2.c b/gcc/testsuite/gcc.target/i386/avx10_2-vsqrtnepbf16-2.c new file mode 100644 index 00000000000..c6d6ca4c7bd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx10_2-vsqrtnepbf16-2.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx10.2" } */ +/* { dg-require-effective-target avx10_2 } */ + +#define AVX10_2 +#define AVX512VL +#define AVX512F_LEN 256 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vsqrtnepbf16-2.c" + +#undef AVX512F_LEN +#undef AVX512F_LEN_HALF + +#define AVX512F_LEN 128 +#define AVX512F_LEN_HALF 128 +#include "avx10_2-512-vsqrtnepbf16-2.c" diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index a5ba3decc97..e92d04af3f5 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -1024,4 +1024,23 @@ /* avx10_2-512convertintrin.h */ #define __builtin_ia32_vcvt2ps2phx512_mask_round(A, B, C, D, E) __builtin_ia32_vcvt2ps2phx512_mask_round(A, B, C, D, 8) +/* avx10_2-512bf16intrin.h */ +#define __builtin_ia32_rndscalenepbf16512_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16512_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16512_mask(A, B, C, D) __builtin_ia32_reducenepbf16512_mask(A, 123, C, D) +#define __builtin_ia32_getmantpbf16512_mask(A, B, C, D) __builtin_ia32_getmantpbf16512_mask(A, 1, C, D) +#define __builtin_ia32_fpclasspbf16512_mask(A, B, C) __builtin_ia32_fpclasspbf16512_mask(A, 1, C) +#define 
__builtin_ia32_cmppbf16512_mask(A, B, C, D) __builtin_ia32_cmppbf16512_mask(A, B, 1, D) + +/* avx10_2bf16intrin.h */ +#define __builtin_ia32_rndscalenepbf16256_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16256_mask(A, 123, C, D) +#define __builtin_ia32_rndscalenepbf16128_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16128_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16256_mask(A, B, C, D) __builtin_ia32_reducenepbf16256_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16128_mask(A, B, C, D) __builtin_ia32_reducenepbf16128_mask(A, 123, C, D) +#define __builtin_ia32_getmantpbf16256_mask(A, B, C, D) __builtin_ia32_getmantpbf16256_mask(A, 1, C, D) +#define __builtin_ia32_getmantpbf16128_mask(A, B, C, D) __builtin_ia32_getmantpbf16128_mask(A, 1, C, D) +#define __builtin_ia32_fpclasspbf16256_mask(A, B, C) __builtin_ia32_fpclasspbf16256_mask(A, 1, C) +#define __builtin_ia32_fpclasspbf16128_mask(A, B, C) __builtin_ia32_fpclasspbf16128_mask(A, 1, C) +#define __builtin_ia32_cmppbf16256_mask(A, B, C, D) __builtin_ia32_cmppbf16256_mask(A, B, 1, D) +#define __builtin_ia32_cmppbf16128_mask(A, B, C, D) __builtin_ia32_cmppbf16128_mask(A, B, 1, D) + #include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index 9253e5eb905..49a82d8a2d5 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -1388,3 +1388,46 @@ test_2 (_mm256_cvtx_round2ps_ph, __m256h, __m256, __m256, 4) /* avx10_2-512convertintrin.h */ test_2 (_mm512_cvtx_round2ps_ph, __m512h, __m512, __m512, 4) + +/* avx10_2-512bf16intrin.h */ +test_1 (_mm512_roundscalene_pbh, __m512bh, __m512bh, 123) +test_2 (_mm512_maskz_roundscalene_pbh, __m512bh, __mmask32, __m512bh, 123) +test_3 (_mm512_mask_roundscalene_pbh, __m512bh, __m512bh, __mmask32, __m512bh, 123) +test_1 (_mm512_reducene_pbh, __m512bh, __m512bh, 123) +test_2 (_mm512_maskz_reducene_pbh, __m512bh, __mmask32, __m512bh, 123) +test_3 (_mm512_mask_reducene_pbh, __m512bh, __m512bh, __mmask32, __m512bh, 123) +test_1x (_mm512_getmant_pbh, __m512bh, __m512bh, 1, 1) +test_2x (_mm512_maskz_getmant_pbh, __m512bh, __mmask32,__m512bh, 1, 1) +test_3x (_mm512_mask_getmant_pbh, __m512bh, __m512bh, __mmask32,__m512bh, 1, 1) +test_1 (_mm512_fpclass_pbh_mask, __mmask32, __m512bh, 13) +test_2 (_mm512_mask_fpclass_pbh_mask, __mmask32, __mmask32, __m512bh, 13) +test_2 (_mm512_cmp_pbh_mask, __mmask32, __m512bh, __m512bh, 1) +test_3 (_mm512_mask_cmp_pbh_mask, __mmask32, __mmask32,__m512bh, __m512bh, 1) + +/* avx10_2bf16intrin.h */ +test_1 (_mm256_roundscalene_pbh, __m256bh, __m256bh, 123) +test_1 (_mm_roundscalene_pbh, __m128bh, __m128bh, 123) +test_2 (_mm256_maskz_roundscalene_pbh, __m256bh, __mmask16, __m256bh, 123) +test_2 (_mm_maskz_roundscalene_pbh, __m128bh, __mmask8, __m128bh, 123) +test_3 (_mm256_mask_roundscalene_pbh, __m256bh, __m256bh, __mmask16, __m256bh, 123) +test_3 (_mm_mask_roundscalene_pbh, __m128bh, __m128bh, __mmask8, __m128bh, 123) +test_1 (_mm256_reducene_pbh, __m256bh, __m256bh, 123) +test_1 (_mm_reducene_pbh, __m128bh, __m128bh, 123) +test_2 (_mm256_maskz_reducene_pbh, __m256bh, __mmask16, __m256bh, 123) +test_2 (_mm_maskz_reducene_pbh, __m128bh, __mmask8, __m128bh, 123) +test_3 (_mm256_mask_reducene_pbh, __m256bh, __m256bh, __mmask16, __m256bh, 123) +test_3 (_mm_mask_reducene_pbh, __m128bh, __m128bh, __mmask8, __m128bh, 123) +test_1x (_mm256_getmant_pbh, __m256bh, __m256bh, 1, 1) +test_1x (_mm_getmant_pbh, __m128bh, __m128bh, 1, 1) +test_2x (_mm256_maskz_getmant_pbh, __m256bh, 
__mmask16,__m256bh, 1, 1) +test_2x (_mm_maskz_getmant_pbh, __m128bh, __mmask8, __m128bh, 1, 1) +test_3x (_mm256_mask_getmant_pbh, __m256bh, __m256bh, __mmask16,__m256bh, 1, 1) +test_3x (_mm_mask_getmant_pbh, __m128bh, __m128bh, __mmask8, __m128bh, 1, 1) +test_1 (_mm256_fpclass_pbh_mask, __mmask16, __m256bh, 13) +test_1 (_mm_fpclass_pbh_mask, __mmask8, __m128bh, 13) +test_2 (_mm256_mask_fpclass_pbh_mask, __mmask16, __mmask16, __m256bh, 13) +test_2 (_mm_mask_fpclass_pbh_mask, __mmask8, __mmask8, __m128bh, 13) +test_2 (_mm256_cmp_pbh_mask, __mmask16, __m256bh, __m256bh, 1) +test_2 (_mm_cmp_pbh_mask, __mmask8, __m128bh, __m128bh, 1) +test_3 (_mm256_mask_cmp_pbh_mask, __mmask16, __mmask16, __m256bh, __m256bh, 1) +test_3 (_mm_mask_cmp_pbh_mask, __mmask8, __mmask8, __m128bh, __m128bh, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c index d57bbc41a49..193057a4719 100644 --- a/gcc/testsuite/gcc.target/i386/sse-22.c +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -1427,3 +1427,46 @@ test_2 (_mm256_cvtx_round2ps_ph, __m256h, __m256, __m256, 4) /* avx10_2-512convertintrin.h */ test_2 (_mm512_cvtx_round2ps_ph, __m512h, __m512, __m512, 4) + +/* avx10_2-512bf16intrin.h */ +test_1 (_mm512_roundscalene_pbh, __m512bh, __m512bh, 123) +test_2 (_mm512_maskz_roundscalene_pbh, __m512bh, __mmask32, __m512bh, 123) +test_3 (_mm512_mask_roundscalene_pbh, __m512bh, __m512bh, __mmask32, __m512bh, 123) +test_1 (_mm512_reducene_pbh, __m512bh, __m512bh, 123) +test_2 (_mm512_maskz_reducene_pbh, __m512bh, __mmask32, __m512bh, 123) +test_3 (_mm512_mask_reducene_pbh, __m512bh, __m512bh, __mmask32, __m512bh, 123) +test_1x (_mm512_getmant_pbh, __m512bh, __m512bh, 1, 1) +test_2x (_mm512_maskz_getmant_pbh, __m512bh, __mmask32,__m512bh, 1, 1) +test_3x (_mm512_mask_getmant_pbh, __m512bh, __m512bh, __mmask32,__m512bh, 1, 1) +test_1 (_mm512_fpclass_pbh_mask, __mmask32, __m512bh, 13) +test_2 (_mm512_mask_fpclass_pbh_mask, __mmask32, __mmask32, __m512bh, 13) +test_2 (_mm512_cmp_pbh_mask, __mmask32, __m512bh, __m512bh, 1) +test_3 (_mm512_mask_cmp_pbh_mask, __mmask32, __mmask32,__m512bh, __m512bh, 1) + +/* avx10_2bf16intrin.h */ +test_1 (_mm256_roundscalene_pbh, __m256bh, __m256bh, 123) +test_1 (_mm_roundscalene_pbh, __m128bh, __m128bh, 123) +test_2 (_mm256_maskz_roundscalene_pbh, __m256bh, __mmask16, __m256bh, 123) +test_2 (_mm_maskz_roundscalene_pbh, __m128bh, __mmask8, __m128bh, 123) +test_3 (_mm256_mask_roundscalene_pbh, __m256bh, __m256bh, __mmask16, __m256bh, 123) +test_3 (_mm_mask_roundscalene_pbh, __m128bh, __m128bh, __mmask8, __m128bh, 123) +test_1 (_mm256_reducene_pbh, __m256bh, __m256bh, 123) +test_1 (_mm_reducene_pbh, __m128bh, __m128bh, 123) +test_2 (_mm256_maskz_reducene_pbh, __m256bh, __mmask16, __m256bh, 123) +test_2 (_mm_maskz_reducene_pbh, __m128bh, __mmask8, __m128bh, 123) +test_3 (_mm256_mask_reducene_pbh, __m256bh, __m256bh, __mmask16, __m256bh, 123) +test_3 (_mm_mask_reducene_pbh, __m128bh, __m128bh, __mmask8, __m128bh, 123) +test_1x (_mm256_getmant_pbh, __m256bh, __m256bh, 1, 1) +test_1x (_mm_getmant_pbh, __m128bh, __m128bh, 1, 1) +test_2x (_mm256_maskz_getmant_pbh, __m256bh, __mmask16,__m256bh, 1, 1) +test_2x (_mm_maskz_getmant_pbh, __m128bh, __mmask8, __m128bh, 1, 1) +test_3x (_mm256_mask_getmant_pbh, __m256bh, __m256bh, __mmask16,__m256bh, 1, 1) +test_3x (_mm_mask_getmant_pbh, __m128bh, __m128bh, __mmask8, __m128bh, 1, 1) +test_1 (_mm256_fpclass_pbh_mask, __mmask16, __m256bh, 13) +test_1 (_mm_fpclass_pbh_mask, __mmask8, __m128bh, 13) +test_2 
(_mm256_mask_fpclass_pbh_mask, __mmask16, __mmask16, __m256bh, 13) +test_2 (_mm_mask_fpclass_pbh_mask, __mmask8, __mmask8, __m128bh, 13) +test_2 (_mm256_cmp_pbh_mask, __mmask16, __m256bh, __m256bh, 1) +test_2 (_mm_cmp_pbh_mask, __mmask8, __m128bh, __m128bh, 1) +test_3 (_mm256_mask_cmp_pbh_mask, __mmask16, __mmask16, __m256bh, __m256bh, 1) +test_3 (_mm_mask_cmp_pbh_mask, __mmask8, __mmask8, __m128bh, __m128bh, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index 438974cb0c6..a33eb9945dd 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -998,6 +998,25 @@ /* avx10_2-512convertintrin.h */ #define __builtin_ia32_vcvt2ps2phx512_mask_round(A, B, C, D, E) __builtin_ia32_vcvt2ps2phx512_mask_round(A, B, C, D, 8) +/* avx10_2-512bf16intrin.h */ +#define __builtin_ia32_rndscalenepbf16512_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16512_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16512_mask(A, B, C, D) __builtin_ia32_reducenepbf16512_mask(A, 123, C, D) +#define __builtin_ia32_getmantpbf16512_mask(A, B, C, D) __builtin_ia32_getmantpbf16512_mask(A, 1, C, D) +#define __builtin_ia32_fpclasspbf16512_mask(A, B, C) __builtin_ia32_fpclasspbf16512_mask(A, 1, C) +#define __builtin_ia32_cmppbf16512_mask(A, B, C, D) __builtin_ia32_cmppbf16512_mask(A, B, 1, D) + +/* avx10_2bf16intrin.h */ +#define __builtin_ia32_rndscalenepbf16256_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16256_mask(A, 123, C, D) +#define __builtin_ia32_rndscalenepbf16128_mask(A, B, C, D) __builtin_ia32_rndscalenepbf16128_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16256_mask(A, B, C, D) __builtin_ia32_reducenepbf16256_mask(A, 123, C, D) +#define __builtin_ia32_reducenepbf16128_mask(A, B, C, D) __builtin_ia32_reducenepbf16128_mask(A, 123, C, D) +#define __builtin_ia32_getmantpbf16256_mask(A, B, C, D) __builtin_ia32_getmantpbf16256_mask(A, 1, C, D) +#define __builtin_ia32_getmantpbf16128_mask(A, B, C, D) __builtin_ia32_getmantpbf16128_mask(A, 1, C, D) +#define __builtin_ia32_fpclasspbf16256_mask(A, B, C) __builtin_ia32_fpclasspbf16256_mask(A, 1, C) +#define __builtin_ia32_fpclasspbf16128_mask(A, B, C) __builtin_ia32_fpclasspbf16128_mask(A, 1, C) +#define __builtin_ia32_cmppbf16256_mask(A, B, C, D) __builtin_ia32_cmppbf16256_mask(A, B, 1, D) +#define __builtin_ia32_cmppbf16128_mask(A, B, C, D) __builtin_ia32_cmppbf16128_mask(A, B, 1, D) + #pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512") #include <x86intrin.h> -- 2.43.5
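
For reference: the convert_fp32_to_bf16 / convert_fp32_to_bf16_ne helpers used throughout the new run tests (and swapped in the vscalefpbf16 hunk above) live in gcc/testsuite/gcc.target/i386/avx10-helper.h and are not shown in this patch. The sketch below only illustrates the assumed difference between the two, namely plain truncation of the FP32 bit pattern versus round-to-nearest-even before truncation; the bodies here are illustrative guesses, not the testsuite definitions.

/* Hypothetical sketch, not the avx10-helper.h code.  NaN payloads and
   overflow to infinity get no special handling here.  */
#include <stdint.h>
#include <string.h>

static unsigned short
convert_fp32_to_bf16 (float fp)
{
  uint32_t bits;
  memcpy (&bits, &fp, sizeof (bits));
  return bits >> 16;		/* Truncate: keep the upper 16 bits.  */
}

static unsigned short
convert_fp32_to_bf16_ne (float fp)
{
  uint32_t bits;
  memcpy (&bits, &fp, sizeof (bits));
  uint32_t lsb = (bits >> 16) & 1;	/* LSB of the would-be result.  */
  bits += 0x7fff + lsb;			/* Round to nearest, ties to even.  */
  return bits >> 16;
}

Under this assumption the two helpers differ only for inputs whose low 16 bits are nonzero, e.g. 1.0f/3.0f truncates to 0x3EAA but rounds to 0x3EAB, which is why the expected-value side of each test must use the same variant as the instruction under test.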