Hello, I’ve noticed that _mm512_permutexvar_epi[64|32] intrinsics have wrong arguments order. As per [1] first argument is index. For vmpermps/vpermpd intrinsics are fine, but I’ve changed tests to call CALC with same arg order as intrinsic. here is the same problem (wrong argument order) with vrcp14s[d|s]. Also avx512er-vrcp28ss-2.c test called wrong intrinsic.
[1] http://software.intel.com/sites/landingpage/IntrinsicsGuide/ gcc/ * config/i386/avx512fintrin.h (_mm512_maskz_permutexvar_epi64): Swap arguments order in builtin. (_mm512_permutexvar_epi64): Ditto. (_mm512_mask_permutexvar_epi64): Ditto (_mm512_maskz_permutexvar_epi32): Ditto (_mm512_permutexvar_epi32): Ditto (_mm512_mask_permutexvar_epi32): Ditto * config/i386/sse.md (srcp14<mode>): Swap operands. gcc/testsuite/ * gcc.target/i386/avx512er-vrcp28ss-2.c: Call rigth intrinsic. * gcc.target/i386/avx512f-vpermd-2.c: Fix reference calculations. * gcc.target/i386/avx512f-vpermpd-2.c: Ditto. * gcc.target/i386/avx512f-vpermps-2.c: Ditto. * gcc.target/i386/avx512f-vpermq-var-2.c: Ditto. * gcc.target/i386/avx512f-vrcp14sd-2.c: Ditto. * gcc.target/i386/avx512f-vrcp14ss-2.c: Ditto. Is it ok for trunk? Or we should wait until 4.9 fork? -- Thanks, K --- gcc/config/i386/avx512fintrin.h | 24 +++++++++++----------- gcc/config/i386/sse.md | 6 +++--- .../gcc.target/i386/avx512er-vrcp28ss-2.c | 2 +- gcc/testsuite/gcc.target/i386/avx512f-vpermd-2.c | 2 +- gcc/testsuite/gcc.target/i386/avx512f-vpermpd-2.c | 4 ++-- gcc/testsuite/gcc.target/i386/avx512f-vpermps-2.c | 4 ++-- .../gcc.target/i386/avx512f-vpermq-var-2.c | 2 +- gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c | 4 ++-- gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c | 8 ++++---- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index d53a40d..b3a4f3a 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -6148,8 +6148,8 @@ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __X, - (__v8di) __Y, + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, (__v8di) _mm512_setzero_si512 (), __M); @@ -6159,8 +6159,8 @@ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __X, - (__v8di) __Y, + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, (__v8di) _mm512_setzero_si512 (), (__mmask8) -1); @@ -6171,8 +6171,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __X, - (__v8di) __Y, + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, (__v8di) __W, __M); } @@ -6181,8 +6181,8 @@ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __X, - (__v16si) __Y, + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, (__v16si) _mm512_setzero_si512 (), __M); @@ -6192,8 +6192,8 @@ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __X, - (__v16si) __Y, + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, (__v16si) _mm512_setzero_si512 (), (__mmask16) -1); @@ -6204,8 +6204,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __X, - (__v16si) __Y, + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, (__v16si) __W, __M); } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a04b289..d3b2dc5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1456,12 +1456,12 @@ [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 - [(match_operand:VF_128 1 "nonimmediate_operand" "vm")] + [(match_operand:VF_128 2 "nonimmediate_operand" "vm")] UNSPEC_RCP14) - (match_operand:VF_128 2 "register_operand" "v") + (match_operand:VF_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" - "vrcp14<ssescalarmodesuffix>\t{%1, %2, %0|%0, %2, %1}" + "vrcp14<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c index 499a977..a7be27c 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c @@ -22,7 +22,7 @@ avx512er_test (void) res_ref[0] = 1.0 / src.a[0]; - res.x = _mm_rsqrt28_round_ss (src.x, src.x, _MM_FROUND_NO_EXC); + res.x = _mm_rcp28_round_ss (src.x, src.x, _MM_FROUND_NO_EXC); if (checkVf (res.a, res_ref, 4)) abort (); diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpermd-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vpermd-2.c index db5fd09..1c494e3 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vpermd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vpermd-2.c @@ -11,7 +11,7 @@ #include "avx512f-mask-type.h" static void -CALC (int *src1, int *mask, int *dst) +CALC (int *mask, int *src1, int *dst) { int i; diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpermpd-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vpermpd-2.c index 3d168be..00d171b 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vpermpd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vpermpd-2.c @@ -10,7 +10,7 @@ #include "avx512f-mask-type.h" static void -CALC (double *s1, long long *mask, double *r) +CALC (long long *mask, double *s1, double *r) { int i; @@ -41,7 +41,7 @@ TEST (void) res2.x = INTRINSIC (_mask_permutexvar_pd) (res2.x, mask, src2.x, src1.x); res3.x = INTRINSIC (_maskz_permutexvar_pd) (mask, src2.x, src1.x); - CALC (src1.a, src2.a, res_ref); + CALC (src2.a, src1.a, res_ref); if (UNION_CHECK (AVX512F_LEN, d) (res1, res_ref)) abort (); diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpermps-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vpermps-2.c index 6182948..53081c4 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vpermps-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vpermps-2.c @@ -10,7 +10,7 @@ #include "avx512f-mask-type.h" static void -CALC (float *s1, int *mask, float *r) +CALC (int *mask, float *s1, float *r) { int i; @@ -41,7 +41,7 @@ TEST (void) res2.x = INTRINSIC (_mask_permutexvar_ps) (res2.x, mask, src2.x, src1.x); res3.x = INTRINSIC (_maskz_permutexvar_ps) (mask, src2.x, src1.x); - CALC (src1.a, src2.a, res_ref); + CALC (src2.a, src1.a, res_ref); if (UNION_CHECK (AVX512F_LEN, ) (res1, res_ref)) abort (); diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpermq-var-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vpermq-var-2.c index 2733e17..ff330a5 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vpermq-var-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vpermq-var-2.c @@ -11,7 +11,7 @@ #include "avx512f-mask-type.h" static void -CALC (long long *src1, long long *mask, long long *dst) +CALC (long long *mask, long long *src1, long long *dst) { int i; diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c index 0c9211a..f944600 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14sd-2.c @@ -8,8 +8,8 @@ static void compute_vrcp14sd (double *s1, double *s2, double *r) { - r[0] = 1.0 / s1[0]; - r[1] = s2[1]; + r[0] = 1.0 / s2[0]; + r[1] = s1[1]; } static void diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c index 3344dad..7aca591 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vrcp14ss-2.c @@ -8,10 +8,10 @@ static void compute_vrcp14ss (float *s1, float *s2, float *r) { - r[0] = 1.0 / s1[0]; - r[1] = s2[1]; - r[2] = s2[2]; - r[3] = s2[3]; + r[0] = 1.0 / s2[0]; + r[1] = s1[1]; + r[2] = s1[2]; + r[3] = s1[3]; } static void