On Thu, Jul 05, 2018 at 08:30:27PM +0300, Grazvydas Ignotas wrote:
> gcc/ChangeLog:
> 
> 2018-07-05  Grazvydas Ignotas  <nota...@gmail.com>
> 
>       * config/i386/avx512bwintrin.h: (_mm512_mask_cmp_epi8_mask,
>       _mm512_mask_cmp_epu8_mask): Fix mask arguments.

LGTM, but
1) I think it would be nice to add a runtime testcase that fails (on avx512bw 
hw)
   without this patch and succeeds with this patch (have some non-zero and
   zero bits in the high 32 bits of the mask and test that the result is
   correct).
2) there are other functions that have this bug, e.g.
   _mm_mask_cmp_epi8_mask, _mm256_mask_cmp_epi8_mask,
   _mm_mask_cmp_epu8_mask, _mm256_mask_cmp_epu8_mask in avx512vlbwintrin.h

Let's grep for all suspicious parts:
echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 
's/@@@/}\n/g' | grep '__mmask8.*__mmask\(16\|32\|64\)'
 _mm512_mask_bitshuffle_epi64_mask (__mmask8 __M, __m512i __A, __m512i __B) { 
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, 
(__v64qi) __B, (__mmask64) __M); }
 _mm_mask_cmp_epi8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P) 
{ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 
__P, (__mmask16) __U); }
 _mm_mask_cmp_epu8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P) 
{ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) 
__Y, __P, (__mmask16) __U); }
echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 
's/@@@/}\n/g' | grep '__mmask16.*__mmask\(8\|32\|64\)'
 _mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { 
return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, 
(__v8di) __W, (__mmask8) __U); }
 _mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B) { return 
(__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, (__v8di) 
_mm512_setzero_si512 (), (__mmask8) __U); }
 _mm512_mask_cmpneq_epi64_mask (__mmask16 __M, __m512i __X, __m512i __Y) { 
return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, (__v8di) __Y, 4, 
(__mmask8) __M); }
 _mm256_mask_cmp_epi8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int 
__P) { return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) 
__Y, __P, (__mmask32) __U); }
 _mm256_mask_cmp_epu8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int 
__P) { return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, 
(__v32qi) __Y, __P, (__mmask32) __U); }
 _mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return 
(__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
__W, (__mmask8) __U); }
 _mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) 
__builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
_mm_setzero_ps (), (__mmask8) __U); }
 _mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { 
return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, 
(__v8sf) __W, (__mmask8) __U); }
 _mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) { return (__m256) 
__builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) 
_mm256_setzero_ps (), (__mmask8) __U); }
 _mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return 
(__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
__W, (__mmask8) __U); }
 _mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) 
__builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) 
_mm_setzero_ps (), (__mmask8) __U); }
 _mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { 
return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, 
(__v8sf) __W, (__mmask8) __U); }
 _mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) { return (__m256) 
__builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) 
_mm256_setzero_ps (), (__mmask8) __U); }
 _mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) { return (__m256) 
__builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, (__v8sf) _mm256_setzero_ps (), 
(__mmask8) __U); }
 _mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) { return (__m128) 
__builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, (__v4sf) _mm_setzero_ps (), 
(__mmask8) __U); }
echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 
's/@@@/}\n/g' | grep '__mmask32.*__mmask\(8\|16\|64\)'
 _mm512_mask_cmp_epi8_mask (__mmask32 __U, __m512i __X, __m512i __Y, const int 
__P) { return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) 
__Y, __P, (__mmask64) __U); }
 _mm512_mask_cmp_epu8_mask (__mmask32 __U, __m512i __X, __m512i __Y, const int 
__P) { return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, 
(__v64qi) __Y, __P, (__mmask64) __U); }
 _mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) { return 
(__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, (__v16hi) __W, 
(__mmask16) __U); }
 _mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) { return 
(__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, (__v8hi) __W, 
(__mmask8) __U); }
 _mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) { return 
(__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, (__v16hi) __W, 
(__mmask16) __U); }
 _mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) { return 
(__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, (__v8hi) __W, 
(__mmask8) __U); }
echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 
's/@@@/}\n/g' | grep '__mmask64.*__mmask\(8\|16\|32\)'

I'm not saying all these are bugs, but at least all of them need close
inspection (check them against
https://software.intel.com/sites/landingpage/IntrinsicsGuide/
and verify what the builtins actually expect and whether it is correct, etc.).

Are you willing to do that, or shall I do that?

        Jakub

Reply via email to