Since BF8 and FP16 have the same number of exponent bits, converting BF8 to FP16 only requires placing each BF8 byte in the upper byte of a 16-bit element, leaving the extra fraction bits zero. We use a sequence of existing instructions rather than new instructions to do that. For convenience, intrinsics are also provided.
gcc/ChangeLog:

	* config/i386/avx10_2-512convertintrin.h (_mm512_cvtpbf8_ph): New.
	(_mm512_mask_cvtpbf8_ph): Ditto.
	(_mm512_maskz_cvtpbf8_ph): Ditto.
	* config/i386/avx10_2convertintrin.h (_mm_cvtpbf8_ph): Ditto.
	(_mm_mask_cvtpbf8_ph): Ditto.
	(_mm_maskz_cvtpbf8_ph): Ditto.
	(_mm256_cvtpbf8_ph): Ditto.
	(_mm256_mask_cvtpbf8_ph): Ditto.
	(_mm256_maskz_cvtpbf8_ph): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx10_2-512-convert-1.c: Add tests for new intrin.
	* gcc.target/i386/avx10_2-convert-1.c: Ditto.
---
 gcc/config/i386/avx10_2-512convertintrin.h    | 27 ++++++++++
 gcc/config/i386/avx10_2convertintrin.h        | 51 +++++++++++++++++++
 .../gcc.target/i386/avx10_2-512-convert-1.c   | 16 +++++-
 .../gcc.target/i386/avx10_2-convert-1.c       | 26 ++++++++--
 4 files changed, 115 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2-512convertintrin.h b/gcc/config/i386/avx10_2-512convertintrin.h
index 4ad339bbbf9..dfbdfc3e51b 100644
--- a/gcc/config/i386/avx10_2-512convertintrin.h
+++ b/gcc/config/i386/avx10_2-512convertintrin.h
@@ -540,6 +540,33 @@ _mm512_maskz_cvtnesph_phf8 (__mmask32 __U, __m512h __A)
 						     (__mmask32) __U);
 }
 
+/* Convert packed BF8 values to FP16: each source byte is widened to 16 bits
+   and shifted left by 8, so it becomes the upper byte of an FP16 element.
+   The mask has one bit per FP16 result element (32 for a 512-bit result).  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpbf8_ph (__m256i __A)
+{
+  return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_slli_epi16 (
+         (__m512i) _mm512_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpbf8_ph (__m512h __S, __mmask32 __U, __m256i __A)
+{
+  return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_mask_slli_epi16 (
+         (__m512i) __S, __U, (__m512i) _mm512_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpbf8_ph (__mmask32 __U, __m256i __A)
+{
+  return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_slli_epi16 (
+         (__m512i) _mm512_maskz_cvtepi8_epi16 (__U, __A), 8));
+}
+
 #ifdef __DISABLE_AVX10_2_512__
 #undef __DISABLE_AVX10_2_512__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx10_2convertintrin.h b/gcc/config/i386/avx10_2convertintrin.h
index ac62d1290a5..8d2c1a54147 100644
--- a/gcc/config/i386/avx10_2convertintrin.h
+++ b/gcc/config/i386/avx10_2convertintrin.h
@@ -970,6 +970,57 @@ _mm256_maskz_cvtnesph_phf8 (__mmask16 __U, __m256h __A)
 						     (__mmask16) __U);
 }
 
+/* Convert packed BF8 values to FP16: each source byte is widened to 16 bits
+   and shifted left by 8, so it becomes the upper byte of an FP16 element.
+   The mask has one bit per FP16 result element (8 for 128-bit, 16 for 256).  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpbf8_ph (__m128i __A)
+{
+  return (__m128h) _mm_castsi128_ph ((__m128i) _mm_slli_epi16 (
+         (__m128i) _mm_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpbf8_ph (__m128h __S, __mmask8 __U, __m128i __A)
+{
+  return (__m128h) _mm_castsi128_ph ((__m128i) _mm_mask_slli_epi16 (
+         (__m128i) __S, __U, (__m128i) _mm_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpbf8_ph (__mmask8 __U, __m128i __A)
+{
+  return (__m128h) _mm_castsi128_ph ((__m128i) _mm_slli_epi16 (
+         (__m128i) _mm_maskz_cvtepi8_epi16 (__U, __A), 8));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpbf8_ph (__m128i __A)
+{
+  return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_slli_epi16 (
+         (__m256i) _mm256_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpbf8_ph (__m256h __S, __mmask16 __U, __m128i __A)
+{
+  return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_mask_slli_epi16 (
+         (__m256i) __S, __U, (__m256i) _mm256_cvtepi8_epi16 (__A), 8));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpbf8_ph (__mmask16 __U, __m128i __A)
+{
+  return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_slli_epi16 (
+         (__m256i) _mm256_maskz_cvtepi8_epi16 (__U, __A), 8));
+}
+
 #ifdef __DISABLE_AVX10_2_256__
 #undef __DISABLE_AVX10_2_256__
 #pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c
index bbbff186d0a..f67138c237c 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c
@@ -45,13 +45,17 @@
 /* { dg-final { scan-assembler-times "vcvtneph2hf8s\[ \\t\]*%zmm\[0-9\]+,\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtneph2hf8s\[ \\t\]*%zmm\[0-9\]+,\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtneph2hf8s\[ \\t\]*%zmm\[0-9\]+,\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$8, %zmm\[0-9]\+, %zmm\[0-9]\+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$8, %zmm\[0-9]\+, %zmm\[0-9]\+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
-volatile __m256i x256i;
+volatile __m256i x256i, z1;
 volatile __m512i x512i;
 volatile __m512 x, a1, b1;
-volatile __m512h y, x512h;
+volatile __m512h y, x512h, z;
 volatile __mmask16 m16;
 volatile __mmask32 m32;
 volatile __mmask64 m64;
@@ -174,3 +178,11 @@ avx10_2_512_vcvtneph2hf8s_test (void)
   x256i = _mm512_mask_cvtnesph_phf8 (x256i, m32, x512h);
   x256i = _mm512_maskz_cvtnesph_phf8 (m32, x512h);
 }
+
+void extern
+avx10_2_512_cvtbf8_fp16_test (void)
+{
+  y = _mm512_cvtpbf8_ph (z1);
+  y = _mm512_mask_cvtpbf8_ph (z, m32, z1);
+  y = _mm512_maskz_cvtpbf8_ph (m32, z1);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c
index 015474f8cf3..9c3e85718f2 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c
@@ -87,14 +87,22 @@
 /* { dg-final { scan-assembler-times "vcvtneph2hf8sy\[ \\t\]*%ymm\[0-9\]+,\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtneph2hf8sy\[ \\t\]*%ymm\[0-9\]+,\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vcvtneph2hf8sy\[ \\t\]*%ymm\[0-9\]+,\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$8, %ymm\[0-9]\+, %ymm\[0-9]\+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$8, %ymm\[0-9]\+, %ymm\[0-9]\+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$8, %xmm\[0-9]\+, %xmm\[0-9]\+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpsllw\[ \t]\+\\\$8, %xmm\[0-9]\+, %xmm\[0-9]\+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovsxbw\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
 volatile __m128 x1,a1,b1;
 volatile __m256 x2,a2,b2;
-volatile __m128h y,x128h;
-volatile __m256h y2,x256h;
-volatile __m128i x128i;
+volatile __m128h y,x128h,z;
+volatile __m256h y2,x256h,z2;
+volatile __m128i x128i,z3;
 volatile __m256i x256i;
 volatile __mmask8 m8;
 volatile __mmask16 m16;
@@ -272,3 +280,15 @@ avx10_2_vcvtneph2hf8s_test (void)
   x128i = _mm256_mask_cvtnesph_phf8 (x128i, m16, x256h);
   x128i = _mm256_maskz_cvtnesph_phf8 (m16, x256h);
 }
+
+void extern
+avx10_2_cvtbf8_fp16_test (void)
+{
+  y = _mm_cvtpbf8_ph (z3);
+  y = _mm_mask_cvtpbf8_ph (z, m8, z3);
+  y = _mm_maskz_cvtpbf8_ph (m8, z3);
+
+  y2 = _mm256_cvtpbf8_ph (z3);
+  y2 = _mm256_mask_cvtpbf8_ph (z2, m16, z3);
+  y2 = _mm256_maskz_cvtpbf8_ph (m16, z3);
+}
-- 
2.43.5