On Thu, Jul 5, 2018 at 9:28 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> On Thu, Jul 05, 2018 at 08:30:27PM +0300, Grazvydas Ignotas wrote:
>> gcc/ChangeLog:
>>
>> 2018-07-05  Grazvydas Ignotas  <nota...@gmail.com>
>>
>> 	* config/i386/avx512bwintrin.h: (_mm512_mask_cmp_epi8_mask,
>> 	_mm512_mask_cmp_epu8_mask): Fix mask arguments.
>
> LGTM, but
> 1) I think it would be nice to add a runtime testcase that fails (on avx512bw
>    hw) without this patch and succeeds with this patch (have some non-zero
>    and zero bits in the high 32 bits of the mask and test that the result
>    is correct)

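Something like this standalone check would show it (untested sketch: needs
avx512bw hardware, built with -mavx512bw, and not in dg testsuite form):

#include <immintrin.h>
#include <stdlib.h>

int
main (void)
{
  __m512i a = _mm512_set1_epi8 (1);
  /* Non-zero and zero bits in the high 32 bits of the mask.  */
  __mmask64 m = 0xf0f0f0f012345678ULL;
  __mmask64 r = _mm512_mask_cmp_epi8_mask (m, a, a, _MM_CMPINT_EQ);

  /* All elements compare equal, so r must equal m; with the old
     __mmask32 parameter the high 32 bits are dropped and r comes back
     as (m & 0xffffffff).  */
  if (r != m)
    abort ();
  return 0;
}
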
Looks like the existing tests can already do it, though, if we correct an
apparent mistake (see attached patch).

> 2) there are other functions that have this bug, e.g.
>    _mm_mask_cmp_epi8_mask, _mm256_mask_cmp_epi8_mask,
>    _mm_mask_cmp_epu8_mask, _mm256_mask_cmp_epu8_mask in avx512vlbwintrin.h
>
> Let's grep for all suspicious parts:
> echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask8.*__mmask\(16\|32\|64\)'
> _mm512_mask_bitshuffle_epi64_mask (__mmask8 __M, __m512i __A, __m512i __B) { return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, (__v64qi) __B, (__mmask64) __M); }
> _mm_mask_cmp_epi8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) __U); }
> _mm_mask_cmp_epu8_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) __U); }
> echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask16.*__mmask\(8\|32\|64\)'
> _mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, (__v8di) __W, (__mmask8) __U); }
> _mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, (__v8di) _mm512_setzero_si512 (), (__mmask8) __U); }
> _mm512_mask_cmpneq_epi64_mask (__mmask16 __M, __m512i __X, __m512i __Y) { return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, (__v8di) __Y, 4, (__mmask8) __M); }
> _mm256_mask_cmp_epi8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int __P) { return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) __U); }
> _mm256_mask_cmp_epu8_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int __P) { return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) __U); }
> _mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) __W, (__mmask8) __U); }
> _mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) _mm_setzero_ps (), (__mmask8) __U); }
> _mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) __W, (__mmask8) __U); }
> _mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U); }
> _mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) __W, (__mmask8) __U); }
> _mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) _mm_setzero_ps (), (__mmask8) __U); }
> _mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) __W, (__mmask8) __U); }
> _mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U); }
> _mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) { return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U); }
> _mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, (__v4sf) _mm_setzero_ps (), (__mmask8) __U); }
> echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask32.*__mmask\(8\|16\|64\)'
> _mm512_mask_cmp_epi8_mask (__mmask32 __U, __m512i __X, __m512i __Y, const int __P) { return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) __Y, __P, (__mmask64) __U); }
> _mm512_mask_cmp_epu8_mask (__mmask32 __U, __m512i __X, __m512i __Y, const int __P) { return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, (__v64qi) __Y, __P, (__mmask64) __U); }
> _mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, (__v16hi) __W, (__mmask16) __U); }
> _mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, (__v8hi) __W, (__mmask8) __U); }
> _mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, (__v16hi) __W, (__mmask16) __U); }
> _mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, (__v8hi) __W, (__mmask8) __U); }
> echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' | grep '__mmask64.*__mmask\(8\|16\|32\)'
>
> I'm not saying all these are bugs, but at least all of them need close
> inspection (check them up with
> https://software.intel.com/sites/landingpage/IntrinsicsGuide/
> check what the builtins actually expect and if it is correct etc.).

I've noticed at least one mistake in this IntrinsicsGuide (the
_mm512_maskz_sub_epi8 operation pseudocode is not correct), so it's unclear
how much it can be trusted.

> Are you willing to do that, or shall I do that?

I think it would be more efficient if you took care of it. I won't have time
for at least a few days anyway.

Gražvydas

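P.S. The failure mode all of the above boils down to is plain integer
truncation at the wrapper boundary: the too-narrow parameter type chops the
caller's mask before the builtin ever sees it, and the cast back to the wide
mask type cannot recover the lost bits.  A plain C sketch of the mechanism
(hypothetical stand-in typedefs, no AVX needed):

#include <stdio.h>

typedef unsigned char  narrow_mask;	/* stands in for __mmask8 */
typedef unsigned short wide_mask;	/* stands in for __mmask16 */

/* Parameter declared one size too narrow, like the suspect intrinsics.  */
static wide_mask
wrapper (narrow_mask u)
{
  return (wide_mask) u;	/* the cast back cannot restore the high bits */
}

int
main (void)
{
  printf ("%#x\n", (unsigned) wrapper (0xFF00));	/* prints 0, not 0xff00 */
  return 0;
}
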
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpb-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpb-2.c
index b6f5677..21c4e3f 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpb-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpb-2.c
@@ -6,7 +6,7 @@
 #include "avx512f-helper.h"
 
 #include <math.h>
-#define SIZE (AVX512F_LEN / 16)
+#define SIZE (AVX512F_LEN / 8)
 #include "avx512f-mask-type.h"
 
 #if AVX512F_LEN == 512
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpub-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpub-2.c
index 8fdc9f2..a70d3fd 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpub-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-vpcmpub-2.c
@@ -6,7 +6,7 @@
 #include "avx512f-helper.h"
 
 #include <math.h>
-#define SIZE (AVX512F_LEN / 16)
+#define SIZE (AVX512F_LEN / 8)
 #include "avx512f-mask-type.h"
 
 #if AVX512F_LEN == 512
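
(For context on the attached patch: for byte-element compares the element
count is the vector width in bits divided by 8, so with AVX512F_LEN == 512
the corrected SIZE is 512 / 8 == 64 rather than the old 512 / 16 == 32, and,
assuming avx512f-mask-type.h picks MASK_TYPE from SIZE as its name suggests,
the existing avx512bw-vpcmpb-2.c / avx512bw-vpcmpub-2.c runtime tests then
run with a __mmask64 mask and exercise the high 32 mask bits that the
intrinsic fix is about.)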