------- Comment #1 from ubizjak at gmail dot com 2008-04-22 08:36 ------- I have tested a couple of approaches using the following code:
--cut here-- #include <xmmintrin.h> __m128 test_0 (__m64 __A, __m64 __B) { __v4sf __zero = (__v4sf) _mm_setzero_ps (); __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B); return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); } __m128 test_1 (__m64 __A, __m64 __B) { __v4sf __sfa = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__A); __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B); return (__m128) (__sfa = __builtin_ia32_movlhps (__sfa, __sfb)); } __m128 test_2 (__m64 __A, __m64 __B) { __v4sf __zero = (__v4sf) _mm_setzero_ps (); __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); __sfa = __builtin_ia32_movlhps (__sfa, __sfa); return (__m128) __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B); } __m128 test_3 (__m64 __A, __m64 __B) { __v4sf __zero = (__v4sf) _mm_setzero_ps (); __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B); return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); } #define N 1024*1024 int main() { __v2si *a, *b; __v4sf *c; int i; a = _mm_malloc (N * sizeof (__v2si), 16); b = _mm_malloc (N * sizeof (__v2si), 16); c = _mm_malloc (N * sizeof (__v4sf), 16); for (i = 0; i < N; i++) c[i] = test_X (a[i], b[i]); return 0; } --cut here-- gcc -O2 -msse2 -fomit-frame-pointer: test_0: xorps %xmm1, %xmm1 movaps %xmm1, %xmm0 cvtpi2ps %mm0, %xmm0 cvtpi2ps %mm1, %xmm1 movlhps %xmm1, %xmm0 ret test_1: xorps %xmm0, %xmm0 cvtpi2ps %mm0, %xmm0 movaps %xmm0, %xmm1 cvtpi2ps %mm1, %xmm1 movlhps %xmm1, %xmm0 ret test_2: xorps %xmm0, %xmm0 cvtpi2ps %mm0, %xmm0 movlhps %xmm0, %xmm0 cvtpi2ps %mm1, %xmm0 ret test_3: xorps %xmm0, %xmm0 cvtpi2ps %mm0, %xmm0 movaps %xmm0, %xmm1 cvtpi2ps %mm1, %xmm1 movlhps %xmm1, %xmm0 ret Timings (10 samples) on xeon-3.6 (32bit): 0: 0.6500s +- 0.03633 1: 0.5868s +- 0.04212 2: 0.7684s +- 0.06498 3: 0.5812s +- 0.03919 Since this is low-hanging fruit, let's change this function to a faster 
implementation. -- ubizjak at gmail dot com changed: What |Removed |Added ---------------------------------------------------------------------------- AssignedTo|unassigned at gcc dot gnu |ubizjak at gmail dot com |dot org | Status|UNCONFIRMED |ASSIGNED Ever Confirmed|0 |1 Last reconfirmed|0000-00-00 00:00:00 |2008-04-22 08:36:08 date| | http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29096