------- Comment #1 from ubizjak at gmail dot com  2008-04-22 08:36 -------
I have tested a couple of approaches using the following code:

--cut here--
#include <xmmintrin.h>

/* Variant 0: convert __A and __B independently, each on top of the same
   zeroed vector, then merge the two low halves.

   cvtpi2ps converts the two 32-bit integers of its second operand into the
   two low float lanes of the result and takes the two high lanes from its
   first operand; movlhps copies the two low lanes of its second operand into
   the two high lanes of its first operand.

   Resulting lanes (low to high): { A0, A1, B0, B1 }.  */
__m128 test_0 (__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  /* { A0, A1, 0, 0 } */
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  /* { B0, B1, 0, 0 } -- does not depend on __sfa.  */
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Variant 1: same conversion as the other variants but without an explicit
   zero vector.

   NOTE(review): __sfa appears in its own initializer, so the first
   cvtpi2ps receives an indeterminate vector as its pass-through operand.
   Only the two low lanes of __sfa and __sfb reach the result, so the
   indeterminate high lanes are never observed; this looks intentional
   (to measure the cost of the zeroing) -- confirm before reusing.

   Resulting lanes (low to high): { A0, A1, B0, B1 }.  */
__m128 test_1 (__m64 __A, __m64 __B)
{
  /* Low lanes = A0, A1; high lanes = whatever __sfa held (indeterminate).  */
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__A);
  /* Low lanes = B0, B1; high lanes inherited from __sfa.  */
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  /* The assignment to __sfa has no further effect: the local goes out of
     scope on return.  */
  return (__m128) (__sfa = __builtin_ia32_movlhps (__sfa, __sfb));
}

/* Variant 2: a single accumulator, no second temporary.  __A is converted
   into the low lanes, duplicated into the high lanes with movlhps, and then
   __B is converted into the low lanes on top of it.

   NOTE(review): as written the result is { B0, B1, A0, A1 } (low to high),
   i.e. the halves are swapped relative to test_0, test_1 and test_3, which
   produce { A0, A1, B0, B1 }.  This does not change the instruction mix
   being timed, but the variant is not a drop-in replacement unless the
   roles of __A and __B are exchanged -- confirm.  */
__m128 test_2 (__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  /* { A0, A1, 0, 0 } */
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);

  /* { A0, A1, A0, A1 } */
  __sfa = __builtin_ia32_movlhps (__sfa, __sfa);
  /* Overwrites the low lanes only: { B0, B1, A0, A1 }.  */
  return (__m128) __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
}

/* Variant 3: like test_0, except that the second conversion uses __sfa
   rather than the zero vector as its pass-through operand.  The high lanes
   of __sfb are discarded by movlhps, so the choice of pass-through operand
   does not affect the value returned.

   Resulting lanes (low to high): { A0, A1, B0, B1 }.  */
__m128 test_3 (__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  /* { A0, A1, 0, 0 } */
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  /* { B0, B1, 0, 0 } -- high lanes inherited from __sfa.  */
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);

  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Number of element pairs processed by the benchmark loop.  Parenthesized
   so that the macro expands correctly inside any expression (for example
   x / N or x % N).  */
#define N (1024*1024)

/* Benchmark driver: applies one conversion variant to N pairs of packed
   integers and stores the N results.

   test_X is a placeholder; replace it with test_0 .. test_3 to build the
   corresponding benchmark.

   Returns 0 on success and 1 if any of the buffers could not be
   allocated.  */
int main()
{
  __v2si *a, *b;
  __v4sf *c;
  int status = 1;
  int i;

  a = _mm_malloc (N * sizeof *a, 16);
  b = _mm_malloc (N * sizeof *b, 16);
  c = _mm_malloc (N * sizeof *c, 16);

  /* _mm_malloc returns NULL on failure; never run the loop on a missing
     buffer.  */
  if (a && b && c)
    {
      /* NOTE(review): a and b are never written before being read here.
         Presumably acceptable because only the run time is of interest,
         not the converted values -- confirm.  */
      for (i = 0; i < N; i++)
        c[i] = test_X (a[i], b[i]);

      status = 0;
    }

  /* Release with _mm_free, the counterpart of _mm_malloc.  The pointers are
     checked first because NULL handling by _mm_free is not guaranteed by
     every implementation.  */
  if (c)
    _mm_free (c);
  if (b)
    _mm_free (b);
  if (a)
    _mm_free (a);

  return status;
}
--cut here--

gcc -O2 -msse2 -fomit-frame-pointer:

test_0:
        xorps   %xmm1, %xmm1
        movaps  %xmm1, %xmm0
        cvtpi2ps        %mm0, %xmm0
        cvtpi2ps        %mm1, %xmm1
        movlhps %xmm1, %xmm0
        ret

test_1:
        xorps   %xmm0, %xmm0
        cvtpi2ps        %mm0, %xmm0
        movaps  %xmm0, %xmm1
        cvtpi2ps        %mm1, %xmm1
        movlhps %xmm1, %xmm0
        ret

test_2:
        xorps   %xmm0, %xmm0
        cvtpi2ps        %mm0, %xmm0
        movlhps %xmm0, %xmm0
        cvtpi2ps        %mm1, %xmm0
        ret

test_3:
        xorps   %xmm0, %xmm0
        cvtpi2ps        %mm0, %xmm0
        movaps  %xmm0, %xmm1
        cvtpi2ps        %mm1, %xmm1
        movlhps %xmm1, %xmm0
        ret

Timings (10 samples) on xeon-3.6 (32bit):

0: 0.6500s +- 0.03633
1: 0.5868s +- 0.04212
2: 0.7684s +- 0.06498
3: 0.5812s +- 0.03919

Since this is low-hanging fruit, let's change this function to the faster
implementation.


-- 

ubizjak at gmail dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         AssignedTo|unassigned at gcc dot gnu   |ubizjak at gmail dot com
                   |dot org                     |
             Status|UNCONFIRMED                 |ASSIGNED
     Ever Confirmed|0                           |1
   Last reconfirmed|0000-00-00 00:00:00         |2008-04-22 08:36:08
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29096

Reply via email to