Hello,

the way full masks are currently generated in avx2intrin.h is questionable: opaque for the inline functions, weird/wrong for the macros.
We may eventually want to add code so that the constant all-ones mask is generated with vxorpd+vcmpeqpd instead of being loaded from memory, but that looks like something that should be decided globally, not separately in each intrinsic that uses it.
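For illustration, here is a minimal user-level sketch of the two idioms, written with the same intent as the patch; the helper names old_full_mask/new_full_mask and the use of _mm_castsi128_pd (instead of the header-internal (__v2df) cast) are mine, not part of the patch:

  #include <immintrin.h>

  /* Old idiom (inline functions): compare a zeroed vector with itself.
     The all-ones result is not visible as a constant to the optimizer.  */
  static inline __m128d
  old_full_mask (void)
  {
    __m128d zero = _mm_setzero_pd ();
    return _mm_cmpeq_pd (zero, zero);
  }

  /* New idiom (this patch): an all-ones integer constant reinterpreted
     as the double mask vector, which the compiler does see as a constant.  */
  static inline __m128d
  new_full_mask (void)
  {
    return _mm_castsi128_pd (_mm_set1_epi64x (-1));
  }

(The macros previously used -1.0, e.g. _mm_set1_pd ((double)(long long int) -1), which is not an all-ones bit pattern; the gather instructions only test each element's most significant bit, so it still selects every lane, but it is a floating-point constant that has to be loaded from memory.)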
Bootstrap+regtest on x86_64-pc-linux-gnu (skylake).

2017-11-27  Marc Glisse  <marc.gli...@inria.fr>

	PR target/80885
	* config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask
	generation.
	(_mm256_i32gather_pd): Likewise.
	(_mm_i64gather_pd): Likewise.
	(_mm256_i64gather_pd): Likewise.
	(_mm_i32gather_ps): Likewise.
	(_mm256_i32gather_ps): Likewise.
	(_mm_i64gather_ps): Likewise.
	(_mm256_i64gather_ps): Likewise.

--
Marc Glisse
Index: gcc/config/i386/avx2intrin.h
===================================================================
--- gcc/config/i386/avx2intrin.h	(revision 255140)
+++ gcc/config/i386/avx2intrin.h	(working copy)
@@ -1241,22 +1241,21 @@ __attribute__ ((__gnu_inline__, __always
 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
 }
 
 #ifdef __OPTIMIZE__
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
 {
-  __v2df __zero = _mm_setzero_pd ();
-  __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
+  __v2df __mask = (__v2df)_mm_set1_epi64x (-1);
 
   return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
						__base,
						(__v4si)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1267,22 +1266,21 @@ _mm_mask_i32gather_pd (__m128d __src, do
						__base,
						(__v4si)__index,
						(__v2df)__mask,
						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
 {
-  __v4df __zero = _mm256_setzero_pd ();
-  __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
+  __v4df __mask = (__v4df)_mm256_set1_epi64x (-1);
 
   return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
						__base,
						(__v4si)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1294,21 +1292,21 @@ _mm256_mask_i32gather_pd (__m256d __src,
						(__v4si)__index,
						(__v4df)__mask,
						__scale);
 }
 
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
 {
   __v2df __src = _mm_setzero_pd ();
-  __v2df __mask = _mm_cmpeq_pd (__src, __src);
+  __v2df __mask = (__v2df)_mm_set1_epi64x (-1);
 
   return (__m128d) __builtin_ia32_gatherdiv2df (__src,
						__base,
						(__v2di)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1320,21 +1318,21 @@ _mm_mask_i64gather_pd (__m128d __src, do
						(__v2di)__index,
						(__v2df)__mask,
						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
 {
   __v4df __src = _mm256_setzero_pd ();
-  __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
+  __v4df __mask = (__v4df)_mm256_set1_epi64x (-1);
 
   return (__m256d) __builtin_ia32_gatherdiv4df (__src,
						__base,
						(__v4di)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1346,21 +1344,21 @@ _mm256_mask_i64gather_pd (__m256d __src,
						(__v4di)__index,
						(__v4df)__mask,
						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
 {
   __v4sf __src = _mm_setzero_ps ();
-  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
 
   return (__m128) __builtin_ia32_gathersiv4sf (__src,
						__base,
						(__v4si)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1372,21 +1370,21 @@ _mm_mask_i32gather_ps (__m128 __src, flo
						(__v4si)__index,
						(__v4sf)__mask,
						__scale);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
 {
   __v8sf __src = _mm256_setzero_ps ();
-  __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
+  __v8sf __mask = (__v8sf)_mm256_set1_epi64x (-1);
 
   return (__m256) __builtin_ia32_gathersiv8sf (__src,
						__base,
						(__v8si)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1398,21 +1396,21 @@ _mm256_mask_i32gather_ps (__m256 __src,
						(__v8si)__index,
						(__v8sf)__mask,
						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
 {
   __v4sf __src = _mm_setzero_ps ();
-  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
 
   return (__m128) __builtin_ia32_gatherdiv4sf (__src,
						__base,
						(__v2di)__index,
						__mask,
						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1424,21 +1422,21 @@ _mm_mask_i64gather_ps (__m128 __src, flo
						(__v2di)__index,
						(__v4sf)__mask,
						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
 {
   __v4sf __src = _mm_setzero_ps ();
-  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
 
   return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
						  __base,
						  (__v4di)__index,
						  __mask,
						  __scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1665,126 +1663,119 @@ _mm256_mask_i64gather_epi32 (__m128i __s
						__base,
						(__v4di)__index,
						(__v4si)__mask,
						__scale);
 }
 #else /* __OPTIMIZE__ */
 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \
   (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
-					 (__v2df)_mm_set1_pd( \
-					   (double)(long long int) -1), \
+					 (__v2df)_mm_set1_epi64x (-1), \
					 (int)SCALE)
 
 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
   (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK, \
					 (int)SCALE)
 
 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
   (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
-					 (__v4df)_mm256_set1_pd( \
-					   (double)(long long int) -1), \
+					 (__v4df)_mm256_set1_epi64x(-1),\
					 (int)SCALE)
 
 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
   (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)(__m256d)MASK, \
					 (int)SCALE)
 
 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \
   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
-					 (__v2df)_mm_set1_pd( \
-					   (double)(long long int) -1), \
+					 (__v2df)_mm_set1_epi64x (-1), \
					 (int)SCALE)
 
 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
					 (double const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK, \
					 (int)SCALE)
 
 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
-					 (__v4df)_mm256_set1_pd( \
-					   (double)(long long int) -1), \
+					 (__v4df)_mm256_set1_epi64x(-1),\
					 (int)SCALE)
 
 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
					 (double const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)(__m256d)MASK, \
					 (int)SCALE)
 
 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \
   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
					(float const *)BASE, \
					(__v4si)(__m128i)INDEX, \
-					_mm_set1_ps ((float)(int) -1), \
+					(__v4sf)_mm_set1_epi64x (-1), \
					(int)SCALE)
 
 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \
					(float const *)BASE, \
					(__v4si)(__m128i)INDEX, \
					(__v4sf)(__m128d)MASK, \
					(int)SCALE)
 
 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE, \
					(__v8si)(__m256i)INDEX, \
-					(__v8sf)_mm256_set1_ps ( \
-					  (float)(int) -1), \
+					(__v8sf)_mm256_set1_epi64x(-1),\
					(int)SCALE)
 
 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \
					(float const *)BASE, \
					(__v8si)(__m256i)INDEX, \
					(__v8sf)(__m256d)MASK, \
					(int)SCALE)
 
 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \
   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \
					(float const *)BASE, \
					(__v2di)(__m128i)INDEX, \
-					(__v4sf)_mm_set1_ps ( \
-					  (float)(int) -1), \
+					(__v4sf)_mm_set1_epi64x (-1), \
					(int)SCALE)
 
 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \
					(float const *)BASE, \
					(__v2di)(__m128i)INDEX, \
					(__v4sf)(__m128d)MASK, \
					(int)SCALE)
 
 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
					   (float const *)BASE, \
					   (__v4di)(__m256i)INDEX, \
-					   (__v4sf)_mm_set1_ps( \
-					     (float)(int) -1), \
+					   (__v4sf)_mm_set1_epi64x (-1),\
					   (int)SCALE)
 
 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \
					   (float const *)BASE, \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK, \
					   (int)SCALE)
 
 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \