the attached code which does complex float multiplication using sse3 produces 4

unnecessary integer additions if the NaN fallback function comp_mult is inlined

the assembly for the loop generated with -msse3 -O3 -std=c99 in gcc 4.4, 4.6,

4.7 and 4.8 svn 195604 looks like this:

  28:    0f 28 0e                 movaps (%esi),%xmm1

  2b:    f3 0f 12 c1              movsldup %xmm1,%xmm0

  2f:    8b 55 08                 mov    0x8(%ebp),%edx

  32:    0f 28 13                 movaps (%ebx),%xmm2

  35:    f3 0f 16 c9              movshdup %xmm1,%xmm1

  39:    0f 59 c2                 mulps  %xmm2,%xmm0

  3c:    0f c6 d2 b1              shufps $0xb1,%xmm2,%xmm2

  40:    0f 59 ca                 mulps  %xmm2,%xmm1

  43:    f2 0f d0 c1              addsubps %xmm1,%xmm0

  47:    0f 29 04 fa              movaps %xmm0,(%edx,%edi,8)

  4b:    0f c2 c0 04              cmpneqps %xmm0,%xmm0

  4f:    0f 50 c0                 movmskps %xmm0,%eax

  52:    85 c0                    test   %eax,%eax

  54:    75 1d                    jne    73 <sse3_mult+0x73> // inlined


  56:    83 c7 02                 add    $0x2,%edi

  59:    83 c6 10                 add    $0x10,%esi

  5c:    83 c3 10                 add    $0x10,%ebx

  5f:    83 c1 10                 add    $0x10,%ecx

  62:    83 45 e4 10              addl   $0x10,-0x1c(%ebp)

  66:    39 7d 14                 cmp    %edi,0x14(%ebp)

  69:    7f bd                    jg     28 <sse3_mult+0x28>


the 4 adds for esi, ebx, ecx and the memory location -0x1c(%ebp) are completely unnecessary and reduce

performance by about 20% on my core2duo.

on amd64 it also creates two seemingly unnecessary additions but I did not test

the performance.

a way to coax gcc to emit proper code is to not allow it to inline the fallback;

it then generates the following good assembly with only one integer add:

  a8:    0f 28 0c df              movaps (%edi,%ebx,8),%xmm1

  ac:    f3 0f 12 c1              movsldup %xmm1,%xmm0

  b0:    8b 45 08                 mov    0x8(%ebp),%eax

  b3:    0f 28 14 de              movaps (%esi,%ebx,8),%xmm2

  b7:    f3 0f 16 c9              movshdup %xmm1,%xmm1

  bb:    0f 59 c2                 mulps  %xmm2,%xmm0

  be:    0f c6 d2 b1              shufps $0xb1,%xmm2,%xmm2

  c2:    0f 59 ca                 mulps  %xmm2,%xmm1

  c5:    f2 0f d0 c1              addsubps %xmm1,%xmm0

  c9:    0f 29 04 d8              movaps %xmm0,(%eax,%ebx,8)

  cd:    0f c2 c0 04              cmpneqps %xmm0,%xmm0

  d1:    0f 50 c0                 movmskps %xmm0,%eax

  d4:    85 c0                    test   %eax,%eax

  d6:    75 10                    jne    e8 <sse3_mult+0x58> // non-inlined


  d8:    83 c3 02                 add    $0x2,%ebx

  db:    39 5d 14                 cmp    %ebx,0x14(%ebp)

  de:    7f c8                    jg     a8 <sse3_mult+0x18>


