Hi all, For the bf8 -> fp16 conversion, when the destination is 256 bits wide, the mask should be 16 bits since 16*16=256, not the 8 bits used by the current intrinsic. In the 512-bit intrinsic, the mask width is likewise half of what it should be (16 bits instead of 32, since 32*16=512). This patch fixes both of them.
Ok for trunk? Thx, Haochen gcc/ChangeLog: * config/i386/avx10_2-512convertintrin.h (_mm512_mask_cvtbf8_ph): Correct mask width. (_mm512_maskz_cvtbf8_ph): Ditto. * config/i386/avx10_2convertintrin.h (_mm256_mask_cvtbf8_ph): Ditto. (_mm256_maskz_cvtbf8_ph): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx10_2-512-convert-1.c: Change function call. * gcc.target/i386/avx10_2-convert-1.c: Ditto. --- gcc/config/i386/avx10_2-512convertintrin.h | 4 ++-- gcc/config/i386/avx10_2convertintrin.h | 4 ++-- gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c | 4 ++-- gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/avx10_2-512convertintrin.h b/gcc/config/i386/avx10_2-512convertintrin.h index 1079e0a2bda..a44481e0b4e 100644 --- a/gcc/config/i386/avx10_2-512convertintrin.h +++ b/gcc/config/i386/avx10_2-512convertintrin.h @@ -550,7 +550,7 @@ _mm512_cvtbf8_ph (__m256i __A) extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtbf8_ph (__m512h __S, __mmask16 __U, __m256i __A) +_mm512_mask_cvtbf8_ph (__m512h __S, __mmask32 __U, __m256i __A) { return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_mask_slli_epi16 ( (__m512i) __S, __U, (__m512i) _mm512_cvtepi8_epi16 (__A), 8)); @@ -558,7 +558,7 @@ _mm512_mask_cvtbf8_ph (__m512h __S, __mmask16 __U, __m256i __A) extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_cvtbf8_ph (__mmask16 __U, __m256i __A) +_mm512_maskz_cvtbf8_ph (__mmask32 __U, __m256i __A) { return (__m512h) _mm512_castsi512_ph ((__m512i) _mm512_slli_epi16 ( (__m512i) _mm512_maskz_cvtepi8_epi16 (__U, __A), 8)); diff --git a/gcc/config/i386/avx10_2convertintrin.h b/gcc/config/i386/avx10_2convertintrin.h index 3fc51b17435..7c9c238a3b4 100644 --- a/gcc/config/i386/avx10_2convertintrin.h +++ b/gcc/config/i386/avx10_2convertintrin.h @@ -1004,7 +1004,7 @@ _mm256_cvtbf8_ph (__m128i 
__A) extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtbf8_ph (__m256h __S, __mmask8 __U, __m128i __A) +_mm256_mask_cvtbf8_ph (__m256h __S, __mmask16 __U, __m128i __A) { return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_mask_slli_epi16 ( (__m256i) __S, __U, (__m256i) _mm256_cvtepi8_epi16 (__A), 8)); @@ -1012,7 +1012,7 @@ _mm256_mask_cvtbf8_ph (__m256h __S, __mmask8 __U, __m128i __A) extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtbf8_ph (__mmask8 __U, __m128i __A) +_mm256_maskz_cvtbf8_ph (__mmask16 __U, __m128i __A) { return (__m256h) _mm256_castsi256_ph ((__m256i) _mm256_slli_epi16 ( (__m256i) _mm256_maskz_cvtepi8_epi16 (__U, __A), 8)); diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c index bda74b5776b..c1e44efdb2f 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-512-convert-1.c @@ -183,6 +183,6 @@ void extern avx10_2_512_cvtbf8_fp16_test (void) { y = _mm512_cvtbf8_ph (z1); - y = _mm512_mask_cvtbf8_ph (z, m16, z1); - y = _mm512_maskz_cvtbf8_ph (m16, z1); + y = _mm512_mask_cvtbf8_ph (z, m32, z1); + y = _mm512_maskz_cvtbf8_ph (m32, z1); } diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c index 57b5fce7fb6..729496f7173 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-convert-1.c @@ -289,6 +289,6 @@ avx10_2_cvtbf8_fp16_test (void) y = _mm_maskz_cvtbf8_ph (m8, z3); y2 = _mm256_cvtbf8_ph (z3); - y2 = _mm256_mask_cvtbf8_ph (z2, m8, z3); - y2 = _mm256_maskz_cvtbf8_ph (m8, z3); + y2 = _mm256_mask_cvtbf8_ph (z2, m16, z3); + y2 = _mm256_maskz_cvtbf8_ph (m16, z3); } -- 2.31.1