Hello,

For the first patch, it is actually easier to repost the old patch with some new testcases. That doesn't mean it all has to go in at once; you can comment on the various pieces. If that's a problem, I'll split them up and repost them separately.

For simple +-*/ on double vectors, I mainly wonder whether __A + __B is good enough or whether (__m128d)((__v2df)__A + (__v2df)__B) would be better. Note that for integer vectors (a future patch) those casts will be necessary. Maybe I should write the integer patch for a single header, so you can compare.
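
To make the comparison concrete, here is a hypothetical sketch of one function from that future integer patch (not part of this submission): since __m128i is a vector of two long long, operating on 32-bit lanes has to go through __v4si, so the casts cannot be avoided there.

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  /* The casts select 32-bit lanes; a bare __A + __B would add 64-bit lanes.  */
  return (__m128i) ((__v4si)__A + (__v4si)__B);
}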

I implemented _mm_cvtsi128_si64 simply as __A[0], but the corresponding V4SI operation will need ((__v4si)__A)[0], and I don't know whether it is better to cast everywhere for uniformity or only where it is necessary. I doubt it matters much for the generated code.
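
For comparison, a sketch (again hypothetical, not in this patch) of the V4SI case: a bare __A[0] would extract 64 bits there, so the cast is mandatory.

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return ((__v4si)__A)[0];
}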

Since we are keeping the builtins for Ada, it would be possible to follow Ulrich's suggestion and keep the old version of the intrinsics, protected by a macro. I would not like that much...
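
For reference, that would look something like this (the macro name is made up for illustration):

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
#ifdef __X86_INTRINSICS_USE_BUILTINS /* hypothetical opt-in macro */
  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
#else
  return __A + __B;
#endif
}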

Something like _mm_load[hl]_pd is, in my opinion, roughly where the complexity limit should be. They would be nice to have, and I expect the compiler will almost always generate a sequence at least as good, but we are getting quite far from the user's code, so the likelihood of pessimizing something may increase. Also, an alternate implementation would be __A[i] = *__B, but IIRC the middle-end optimizers have a harder time handling this form. Those two intrinsics should probably not be included in the first batch, but I wanted to show them.
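
Spelled out, that alternate form for _mm_loadl_pd would be (same semantics as the braced-initializer version in the patch below):

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  /* Overwrite the low lane in place instead of building a new vector.  */
  __A[0] = *__B;
  return __A;
}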

All testcases fail without the patch. The third testcase almost works: there is a note in RTL saying that the result is 0.0, but it is not used. (I just noticed that all the test functions are still called myfma; I'll rename the ones not related to FMA to 'f'.)

If I omit the avx512fintrin.h part, I think it is very unlikely that this conflicts with Kirill's work in any way (but I can still wait, or use a branch).

Bootstrap+testsuite on x86_64-linux-gnu.

2014-10-13  Marc Glisse  <marc.gli...@inria.fr>

gcc/
        * config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
        _mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
        instead of builtins.
        * config/i386/avxintrin.h (_mm256_add_pd, _mm256_add_ps,
        _mm256_div_pd, _mm256_div_ps, _mm256_mul_pd, _mm256_mul_ps,
        _mm256_sub_pd, _mm256_sub_ps): Likewise.
        * config/i386/avx512fintrin.h (_mm512_add_pd, _mm512_add_ps,
        _mm512_sub_pd, _mm512_sub_ps, _mm512_mul_pd, _mm512_mul_ps,
        _mm512_div_pd, _mm512_div_ps): Likewise.
        * config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
        _mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
        _mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
        _mm_loadh_pd, _mm_loadl_pd): Likewise.
        (_mm_sqrt_sd): Fix comment.
gcc/testsuite/
        * gcc.target/i386/intrinsics_opt-1.c: New testcase.
        * gcc.target/i386/intrinsics_opt-2.c: Likewise.
        * gcc.target/i386/intrinsics_opt-3.c: Likewise.
        * gcc.target/i386/intrinsics_opt-4.c: Likewise.


--
Marc Glisse
Index: gcc/config/i386/avx512fintrin.h
===================================================================
--- gcc/config/i386/avx512fintrin.h     (revision 216116)
+++ gcc/config/i386/avx512fintrin.h     (working copy)
@@ -10742,26 +10742,21 @@ _mm512_maskz_sqrt_ps (__mmask16 __U, __m
                                                 (__v16sf)
                                                 _mm512_setzero_ps (),
                                                 (__mmask16) __U,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_add_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df)
-                                                _mm512_undefined_pd (),
-                                                (__mmask8) -1,
-                                                _MM_FROUND_CUR_DIRECTION);
+  return __A + __B;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
                                                 (__v8df) __B,
                                                 (__v8df) __W,
                                                 (__mmask8) __U,
@@ -10777,26 +10772,21 @@ _mm512_maskz_add_pd (__mmask8 __U, __m51
                                                 (__v8df)
                                                 _mm512_setzero_pd (),
                                                 (__mmask8) __U,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_add_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
-                                               (__v16sf) __B,
-                                               (__v16sf)
-                                               _mm512_undefined_ps (),
-                                               (__mmask16) -1,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return __A + __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
                                                (__v16sf) __B,
                                                (__v16sf) __W,
                                                (__mmask16) __U,
@@ -10812,26 +10802,21 @@ _mm512_maskz_add_ps (__mmask16 __U, __m5
                                                (__v16sf)
                                                _mm512_setzero_ps (),
                                                (__mmask16) __U,
                                                _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df)
-                                                _mm512_undefined_pd (),
-                                                (__mmask8) -1,
-                                                _MM_FROUND_CUR_DIRECTION);
+  return __A - __B;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
                                                 (__v8df) __B,
                                                 (__v8df) __W,
                                                 (__mmask8) __U,
@@ -10847,26 +10832,21 @@ _mm512_maskz_sub_pd (__mmask8 __U, __m51
                                                 (__v8df)
                                                 _mm512_setzero_pd (),
                                                 (__mmask8) __U,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
-                                               (__v16sf) __B,
-                                               (__v16sf)
-                                               _mm512_undefined_ps (),
-                                               (__mmask16) -1,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return __A - __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
                                                (__v16sf) __B,
                                                (__v16sf) __W,
                                                (__mmask16) __U,
@@ -10882,26 +10862,21 @@ _mm512_maskz_sub_ps (__mmask16 __U, __m5
                                                (__v16sf)
                                                _mm512_setzero_ps (),
                                                (__mmask16) __U,
                                                _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df)
-                                                _mm512_undefined_pd (),
-                                                (__mmask8) -1,
-                                                _MM_FROUND_CUR_DIRECTION);
+  return __A * __B;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
                                                 (__v8df) __B,
                                                 (__v8df) __W,
                                                 (__mmask8) __U,
@@ -10917,26 +10892,21 @@ _mm512_maskz_mul_pd (__mmask8 __U, __m51
                                                 (__v8df)
                                                 _mm512_setzero_pd (),
                                                 (__mmask8) __U,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
-                                               (__v16sf) __B,
-                                               (__v16sf)
-                                               _mm512_undefined_ps (),
-                                               (__mmask16) -1,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return __A * __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
                                                (__v16sf) __B,
                                                (__v16sf) __W,
                                                (__mmask16) __U,
@@ -10952,26 +10922,21 @@ _mm512_maskz_mul_ps (__mmask16 __U, __m5
                                                (__v16sf)
                                                _mm512_setzero_ps (),
                                                (__mmask16) __U,
                                                _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_pd (__m512d __M, __m512d __V)
 {
-  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
-                                                (__v8df) __V,
-                                                (__v8df)
-                                                _mm512_undefined_pd (),
-                                                (__mmask8) -1,
-                                                _MM_FROUND_CUR_DIRECTION);
+  return __M / __V;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
 {
   return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
                                                 (__v8df) __V,
                                                 (__v8df) __W,
                                                 (__mmask8) __U,
@@ -10987,26 +10952,21 @@ _mm512_maskz_div_pd (__mmask8 __U, __m51
                                                 (__v8df)
                                                 _mm512_setzero_pd (),
                                                 (__mmask8) __U,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
-                                               (__v16sf) __B,
-                                               (__v16sf)
-                                               _mm512_undefined_ps (),
-                                               (__mmask16) -1,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return __A / __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
                                                (__v16sf) __B,
                                                (__v16sf) __W,
                                                (__mmask16) __U,
Index: gcc/config/i386/avxintrin.h
===================================================================
--- gcc/config/i386/avxintrin.h (revision 216116)
+++ gcc/config/i386/avxintrin.h (working copy)
@@ -117,27 +117,27 @@ typedef double __m256d __attribute__ ((_
 /* Greater-than-or-equal (ordered, non-signaling)  */
 #define _CMP_GE_OQ     0x1d
 /* Greater-than (ordered, non-signaling)  */
 #define _CMP_GT_OQ     0x1e
 /* True (unordered, signaling)  */
 #define _CMP_TRUE_US   0x1f
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
+  return __A + __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_addsub_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_addsub_ps (__m256 __A, __m256 __B)
@@ -211,27 +211,27 @@ extern __inline __m256 __attribute__((__
 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
 {
   return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_div_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
+  return __A / __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_div_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A / __B;
 }
 
 /* Dot product instructions with mask-defined summing and zeroing parts
    of result.  */
 
 #ifdef __OPTIMIZE__
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
 {
   return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
@@ -288,27 +288,27 @@ _mm256_min_pd (__m256d __A, __m256d __B)
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_min_ps (__m256 __A, __m256 __B)
 {
   return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
+  return __A * __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_or_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_or_ps (__m256 __A, __m256 __B)
@@ -336,27 +336,27 @@ _mm256_shuffle_ps (__m256 __A, __m256 __
                                      (__v4df)(__m256d)(B), (int)(N)))
 
 #define _mm256_shuffle_ps(A, B, N)                                     \
   ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),             \
                                      (__v8sf)(__m256)(B), (int)(N)))
 #endif
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
+  return __A - __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_xor_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_xor_ps (__m256 __A, __m256 __B)
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h (revision 216116)
+++ gcc/config/i386/emmintrin.h (working copy)
@@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_pd (double *__P, __m128d __A)
 {
   __builtin_ia32_storeupd (__P, __A);
 }
 
 /* Stores the lower DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsd_f64 (__m128d __A)
 {
-  return __builtin_ia32_vec_ext_v2df (__A, 0);
+  return __A[0];
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
   _mm_store_sd (__P, __A);
 }
 
 /* Stores the upper DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeh_pd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+  *__P = __A[1];
 }
 
 /* Store the lower DPFP value across two words.
    The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store1_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
 }
 
@@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu
 _mm_cvtsi128_si32 (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
 }
 
 #ifdef __x86_64__
 /* Intel intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64 (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 
 /* Microsoft intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (B[0]), A[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
   __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_epi64 (__m128i *__P, __m128i __B)
 {
-  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  *(long long *)__P = __B[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movepi64_pi64 (__m128i __B)
 {
-  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  return (__m64) __B[0];
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movpi64_epi64 (__m64 __A)
 {
   return _mm_set_epi64 ((__m64)0LL, __A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_epi64 (__m128i __A)
@@ -915,27 +915,27 @@ _mm_unpackhi_pd (__m128d __A, __m128d __
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __A[0], __B[0] };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __B[0], __A[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pd (__m128d __A)
 {
   return __builtin_ia32_movmskpd ((__v2df)__A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_packs_epi16 (__m128i __A, __m128i __B)
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h (revision 216116)
+++ gcc/config/i386/xmmintrin.h (working copy)
@@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 {
   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
 }
 
 /* Stores the lower SPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_f32 (__m128 __A)
 {
-  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  return __A[0];
 }
 
 /* Store four SPFP values.  The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ps (float *__P, __m128 __A)
 {
   *(__v4sf *)__P = (__v4sf)__A;
 }
 
 /* Store four SPFP values.  The address need not be 16-byte aligned.  */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c    (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c    (working copy)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfma" } */
+
+#include <emmintrin.h>
+
+__m128d myfma(__m128d x, __m128d y, __m128d z){
+  __m128d m = _mm_mul_pd (x, y);
+  return _mm_add_pd (m, z);
+}
+
+/* { dg-final { scan-assembler "vfmadd" } } */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c    (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c    (working copy)
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ffast-math -msse2 -fdump-tree-optimized" } */
+
+#include <emmintrin.h>
+
+int f(__m128d x){
+  x = _mm_sub_pd (x, x);
+  x = _mm_mul_pd (x, x);
+  double r = 42;
+  _mm_storeh_pd (&r, x);
+  int z = r == 0;
+  return __builtin_constant_p (z) && z;
+}
+
+/* { dg-final { scan-tree-dump "return 1;" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c    (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c    (working copy)
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O -msse2" } */
+
+#include <emmintrin.h>
+
+double myfma(){
+  __m128d x = _mm_set1_pd (0.);
+  double r = 42;
+  _mm_storeh_pd (&r, x);
+  return r;
+}
+
+/* { dg-final { scan-assembler-not "unpckhpd" } } */
Index: gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c
===================================================================
--- gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c    (revision 0)
+++ gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c    (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ffast-math -msse2" } */
+
+#include <emmintrin.h>
+
+__m128d myfma(__m128d x, __m128d y, __m128d z){
+  y = _mm_add_pd (x, y);
+  y = _mm_add_pd (z, y);
+  return _mm_sub_pd (y, x);
+}
+
+/* { dg-final { scan-assembler-not "subpd" } } */
