On Thu, Jul 05, 2018 at 08:30:27PM +0300, Grazvydas Ignotas wrote: > gcc/ChangeLog: > > 2018-07-05 Grazvydas Ignotas <nota...@gmail.com> > > * config/i386/avx512bwintrin.h (_mm512_mask_cmp_epi8_mask, > _mm512_mask_cmp_epu8_mask): Fix mask arguments.
LGTM, but 1) I think it would be nice to add a runtime testcase that fails (on avx512bw hw) without this patch and succeeds with this patch (have some non-zero and zero bits in the high 32 bits of the mask and test that the result is correct 2) there are other functions that have this bug, e.g. _mm_mask_cmp_epi8_mask, _mm256_mask_cmp_epi8_mask, _mm_mask_cmp_epu8_mask, _mm256_mask_cmp_epu8_mask in avx512vlbwintrin.h Let's grep for all suspicious parts: echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask8.*__mmask\(16\|32\|64\)' _mm512_mask_bitshuffle_epi64_mask (__mmask8 __M, __m512i __A, __m512i __B) { return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, (__v64qi) __B, (__mmask64) __M); } _mm_mask_cmp_epi8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) __U); } _mm_mask_cmp_epu8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) __U); } echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask16.*__mmask\(8\|32\|64\)' _mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, (__v8di) __W, (__mmask8) __U); } _mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U); } _mm512_mask_cmpneq_epi64_mask (__mmask16 __M, __m512i __X, __m512i __Y) { return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, (__v8di) __Y, 4, (__mmask8) __M); } _mm256_mask_cmp_epi8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int __P) { return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, 
(__mmask32) __U); } _mm256_mask_cmp_epu8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int __P) { return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) __U); } _mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) __W, (__mmask8) __U); } _mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) _mm_setzero_ps (), (__mmask8) __U); } _mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) __W, (__mmask8) __U); } _mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U); } _mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) __W, (__mmask8) __U); } _mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) _mm_setzero_ps (), (__mmask8) __U); } _mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) __W, (__mmask8) __U); } _mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U); } _mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) { return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U); } _mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, (__v4sf) _mm_setzero_ps (), (__mmask8) __U); } echo `sed -n 
'/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask32.*__mmask\(8\|16\|64\)' _mm512_mask_cmp_epi8_mask (__mmask32 __U, __m512i __X, __m512i __Y, const int __P) { return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) __Y, __P, (__mmask64) __U); } _mm512_mask_cmp_epu8_mask (__mmask32 __U, __m512i __X, __m512i __Y, const int __P) { return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, (__v64qi) __Y, __P, (__mmask64) __U); } _mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, (__v16hi) __W, (__mmask16) __U); } _mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, (__v8hi) __W, (__mmask8) __U); } _mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, (__v16hi) __W, (__mmask16) __U); } _mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, (__v8hi) __W, (__mmask8) __U); } echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask64.*__mmask\(8\|16\|32\)' I'm not saying all these are bugs, but at least all of them need close inspection (check them up with https://software.intel.com/sites/landingpage/IntrinsicsGuide/ check what the builtins actually expect and if it is correct etc.). Are you willing to do that, or shall I do that? Jakub