
I am playing with the following code (from ffmpeg), translated to intrinsics:

Original code:

/* Zero the MMX register named by regd by XORing it with itself (pxor regd, regd). */
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

void diff_pixels_mmx(char *block, const uint8_t *s1, const uint8_t *s2, long 
    long offset = -128;
    do {
        asm volatile(
            "movq (%0), %%mm0         \n\t"
            "movq (%1), %%mm2         \n\t"
            "movq %%mm0, %%mm1        \n\t"
            "movq %%mm2, %%mm3        \n\t"
            "punpcklbw %%mm7, %%mm0   \n\t"
            "punpckhbw %%mm7, %%mm1   \n\t"
            "punpcklbw %%mm7, %%mm2   \n\t"
            "punpckhbw %%mm7, %%mm3   \n\t"
            "psubw %%mm2, %%mm0       \n\t"
            "psubw %%mm3, %%mm1       \n\t"
            "movq %%mm0, (%2, %3)     \n\t"
            "movq %%mm1, 8(%2, %3)    \n\t"
            : : "r" (s1), "r" (s2), "r" (block+64),  "r" (offset)
            : "memory");
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);

compiles to:

0000000000000000 <diff_pixels_mmx>:
   0:   0f ef ff                pxor   %mm7,%mm7
   3:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
   a:   48 83 c7 40             add    $0x40,%rdi
   e:   66 90                   xchg   %ax,%ax
  10:   0f 6f 06                movq   (%rsi),%mm0
  13:   0f 6f 12                movq   (%rdx),%mm2
  16:   0f 6f c8                movq   %mm0,%mm1
  19:   0f 6f da                movq   %mm2,%mm3
  1c:   0f 60 c7                punpcklbw %mm7,%mm0
  1f:   0f 68 cf                punpckhbw %mm7,%mm1
  22:   0f 60 d7                punpcklbw %mm7,%mm2
  25:   0f 68 df                punpckhbw %mm7,%mm3
  28:   0f f9 c2                psubw  %mm2,%mm0
  2b:   0f f9 cb                psubw  %mm3,%mm1
  2e:   0f 7f 04 07             movq   %mm0,(%rdi,%rax,1)
  32:   0f 7f 4c 07 08          movq   %mm1,0x8(%rdi,%rax,1)
  37:   48 01 ce                add    %rcx,%rsi
  3a:   48 01 ca                add    %rcx,%rdx
  3d:   48 83 c0 10             add    $0x10,%rax
  41:   75 cd                   jne    10 <diff_pixels_mmx+0x10>
  43:   f3 c3                   repz retq
  45:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)
  4c:   00 00 00 00

This is the intrinsic version:

#include <mmintrin.h>
void diff_pixels_mmx3(char *block, const uint8_t *s1, const uint8_t *s2, long 
        long offset = -128;
        __m64 mm7 = _mm_setzero_si64();
        do {
                __m64 mm0 = *(__m64*)s1;
                __m64 mm2 = *(__m64*)s2;
                __m64 mm1 = mm0;
                __m64 mm3 = mm2;
                mm0 = _mm_unpacklo_pi8(mm0, mm7);
                mm1 = _mm_unpackhi_pi8(mm1, mm7);
                mm2 = _mm_unpacklo_pi8(mm2, mm7);
                mm3 = _mm_unpackhi_pi8(mm3, mm7);
                mm0 = _mm_sub_pi16(mm0, mm2);
                mm1 = _mm_sub_pi16(mm1, mm3);
                *(__m64*)(block+offset) = mm0;
                *(__m64*)(block+offset+8) = mm1;
                s1 += stride;
                s2 += stride;
                offset +=16;
        } while (offset < 0);

compiles to:
00000000000000c0 <diff_pixels_mmx3>:
  c0:   53                      push   %rbx
  c1:   0f ef e4                pxor   %mm4,%mm4
  c4:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
  cb:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  d0:   0f 6f 0e                movq   (%rsi),%mm1
  d3:   48 01 ce                add    %rcx,%rsi
  d6:   0f 6f 02                movq   (%rdx),%mm0
  d9:   48 01 ca                add    %rcx,%rdx
  dc:   0f 6f d1                movq   %mm1,%mm2
  df:   0f 6f d9                movq   %mm1,%mm3
  e2:   0f 6f c8                movq   %mm0,%mm1
  e5:   0f 68 c4                punpckhbw %mm4,%mm0
  e8:   0f 60 d4                punpcklbw %mm4,%mm2
  eb:   0f 68 dc                punpckhbw %mm4,%mm3
  ee:   0f 60 cc                punpcklbw %mm4,%mm1
  f1:   0f f9 d8                psubw  %mm0,%mm3
  f4:   0f f9 d1                psubw  %mm1,%mm2
  f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
  fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
 101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
 106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
 10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
 110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
 115:   48 83 c0 10             add    $0x10,%rax
 119:   75 b5                   jne    d0 <diff_pixels_mmx3+0x10>
 11b:   5b                      pop    %rbx
 11c:   c3                      retq

Flags used are: -O2 -march=k8
Compiler: gcc 4.2.3 (gentoo) x86_64

As you can see, in the intrinsic version gcc moves the mmx register to the stack,
reloads it from the stack and then writes it to the destination. Why?

I don't know whether earlier gcc 4.2 versions produced such stupid code.
Compiling as 32-bit shows similar stupidity, though there gcc reloads into an mmx
register.
(As a side note: why does gcc want to use rbx, requiring a push and pop? I
think other registers are free...)

