In my software SSE2 emulation I am currently using this sort of approach to extract data fields out of __m128i and __m128d vectors:
#define EMM_SINT4(a) ((int *)&(a)) static __inline __m128i __attribute__((__always_inline__)) _mm_slli_epi32 (__m128i __A, int __B) { __v4si __tmp = { EMM_SINT4(__A)[0] << __B, EMM_SINT4(__A)[1] << __B, EMM_SINT4(__A)[2] << __B, EMM_SINT4(__A)[3] << __B}; return (__m128i)__tmp; } This works fine when testing one _mm function at a time, but does not work reliably in real programs unless -O0 is used. I think at least part of the problem is that once the function is inlined the parameter __A is in some cases a register variable, and the pointer method is not valid there. To get around that I'm thinking of introducing an explicit local variable, like this: static __inline __m128i __attribute__((__always_inline__)) _mm_slli_epi32 (__m128i __A, int __B) { __m128i A = __A; __v4si __tmp = { EMM_SINT4(A)[0] << __B, EMM_SINT4(A)[1] << __B, EMM_SINT4(A)[2] << __B, EMM_SINT4(A)[3] << __B}; return (__m128i)__tmp; } I'm not sure that will work all the time either. The only other approach I am aware of would be something like this: #typedef union { __m128i vi; __m128d vd; int s[4]; unsigned int us[4]; /* etc. for other types */ } emm_universal ; #define EMM_SINT4(a) (a).s static __inline __m128i __attribute__((__always_inline__)) _mm_slli_epi32 (__m128i __A, int __B) { emm_universal A; A.vi = __A; __v4si __tmp = { EMM_SINT4(A)[0] << __B, EMM_SINT4(A)[1] << __B, EMM_SINT4(A)[2] << __B, EMM_SINT4(A)[3] << __B}; return (__m128i)__tmp; } The union approach seems to be just a different way to spin the pointer operations. For gcc in particular, is one approach or the other to be preferred and why? Thanks, David Mathog mat...@caltech.edu Manager, Sequence Analysis Facility, Biology Division, Caltech