Hello,
the attached patch is very incomplete (though it does pass bootstrap+testsuite
on x86_64-linux-gnu), but it raises a number of questions that I'd like to
settle before continuing.
* Is there any chance of a patch in this direction being accepted?
* May I remove the builtins (from i386.c and the doc) when they become unused?
* Do we want to keep the casts even when they don't seem strictly
necessary? For instance, for _mm_add_ps we can write:
return __A + __B;
or:
return (__m128) ((__v4sf)__A + (__v4sf)__B);
Note that for _mm_add_epi8, for instance, we do need the casts (see the
sketch after this list).
* For integer operations like _mm_add_epi16, should I use the unsigned
typedefs to make it clear that overflow is well defined? (The patch still
has the signed versions; see the example after this list.)
* Any better name than __v4su for the unsigned version of __v4si?
* Other comments?
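
To make the casts question concrete: __m128i is defined as a vector of two
long longs, so arithmetic on it directly would operate on 64-bit lanes. A
minimal standalone sketch (the *_demo names are made up for illustration and
are not part of the patch):

typedef char __v16qi_demo __attribute__ ((__vector_size__ (16)));
typedef long long __m128i_demo __attribute__ ((__vector_size__ (16)));

static inline __m128i_demo
add_epi8_demo (__m128i_demo __A, __m128i_demo __B)
{
  /* Without the casts, __A + __B would perform two 64-bit additions,
     letting carries propagate across the 8-bit element boundaries.  */
  return (__m128i_demo) ((__v16qi_demo)__A + (__v16qi_demo)__B);
}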
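
And on the signed/unsigned question, the alternative for _mm_add_epi16 would
look like this (hypothetical, not what the attached patch currently does,
though it reuses the __v8hu typedef the patch introduces):

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  /* __v8hu instead of __v8hi: unsigned wraparound is well defined,
     signed overflow is not.  */
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}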
2013-04-07 Marc Glisse <marc.gli...@inria.fr>
* emmintrin.h (__v2du, __v4su, __v8hu): New typedefs.
(_mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd,
_mm_cmpeq_pd, _mm_cmplt_pd, _mm_cmple_pd, _mm_cmpgt_pd, _mm_cmpge_pd,
_mm_cmpneq_pd, _mm_add_epi8, _mm_add_epi16, _mm_add_epi32,
_mm_add_epi64, _mm_slli_epi16, _mm_slli_epi32, _mm_slli_epi64,
_mm_srai_epi16, _mm_srai_epi32, _mm_srli_epi16, _mm_srli_epi32,
_mm_srli_epi64): Replace builtins with vector extensions.
* xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps,
_mm_cmpeq_ps, _mm_cmplt_ps, _mm_cmple_ps, _mm_cmpgt_ps, _mm_cmpge_ps,
_mm_cmpneq_ps): Likewise.
--
Marc Glisse
Index: config/i386/xmmintrin.h
===================================================================
--- config/i386/xmmintrin.h (revision 197549)
+++ config/i386/xmmintrin.h (working copy)
@@ -147,39 +147,39 @@ extern __inline __m128 __attribute__((__
_mm_max_ss (__m128 __A, __m128 __B)
{
return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}
/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+ return __A + __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+ return __A - __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+ return __A * __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+ return __A / __B;
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sqrt_ps (__m128 __A)
{
return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_rcp_ps (__m128 __A)
@@ -323,51 +323,51 @@ _mm_cmpunord_ss (__m128 __A, __m128 __B)
return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}
/* Perform a comparison on the four SPFP values of A and B. For each
element, if the comparison is true, place a mask of all ones in the
result, otherwise a mask of zeros. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) (__A == __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) (__A < __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) (__A <= __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) (__A > __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) (__A >= __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) (__A != __B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
Index: config/i386/emmintrin.h
===================================================================
--- config/i386/emmintrin.h (revision 197549)
+++ config/i386/emmintrin.h (working copy)
@@ -30,22 +30,25 @@
#ifndef __SSE2__
# error "SSE2 instruction set not enabled"
#else
/* We need definitions from the SSE header files*/
#include <xmmintrin.h>
/* SSE2 */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef long long __m128i __attribute__ ((__vector_size__ (16),
__may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
(((fp1) << 1) | (fp0))
@@ -219,72 +222,72 @@ _mm_cvtsi128_si64 (__m128i __A)
extern __inline long long __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+ return __A + __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+ return __A - __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+ return __A * __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+ return __A / __B;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sqrt_pd (__m128d __A)
{
return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}
-/* Return pair {sqrt (A[0), B[1]}. */
+/* Return pair {sqrt (A[0]), B[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
__v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
@@ -329,51 +332,51 @@ _mm_or_pd (__m128d __A, __m128d __B)
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) (__A == __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) (__A < __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) (__A <= __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) (__A > __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) (__A >= __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) (__A != __B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
@@ -981,39 +984,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qi)__A + (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hi)__A + (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4si)__A + (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2di)__A + (__v2di)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1107,45 +1110,45 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+ return (__m128i) ((__v8hi)__A << __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+ return (__m128i) ((__v4si)__A << __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+ return (__m128i) ((__v2di)__A << __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+ return (__m128i) ((__v8hi)__A >> __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+ return (__m128i) ((__v4si)__A >> __B);
}
#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
@@ -1156,33 +1159,33 @@ _mm_slli_si128 (__m128i __A, const int _
#else
#define _mm_srli_si128(A, N) \
((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_slli_si128(A, N) \
((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+ return (__m128i) ((__v8hu)__A >> __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_srli_epi32 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+ return (__m128i) ((__v4su)__A >> __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_srli_epi64 (__m128i __A, int __B)
{
- return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+ return (__m128i) ((__v2du)__A >> __B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sll_epi16 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_sll_epi32 (__m128i __A, __m128i __B)