Hi, I am playing with following code (from ffmpeg) translated to intrinsics:
Original code:

#include <stdint.h>

#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

void diff_pixels_mmx(char *block, const uint8_t *s1, const uint8_t *s2,
                     long stride)
{
    long offset = -128;
    MOVQ_ZERO(mm7);
    do {
        asm volatile(
            "movq (%0), %%mm0          \n\t"
            "movq (%1), %%mm2          \n\t"
            "movq %%mm0, %%mm1         \n\t"
            "movq %%mm2, %%mm3         \n\t"
            "punpcklbw %%mm7, %%mm0    \n\t"
            "punpckhbw %%mm7, %%mm1    \n\t"
            "punpcklbw %%mm7, %%mm2    \n\t"
            "punpckhbw %%mm7, %%mm3    \n\t"
            "psubw %%mm2, %%mm0        \n\t"
            "psubw %%mm3, %%mm1        \n\t"
            "movq %%mm0, (%2, %3)      \n\t"
            "movq %%mm1, 8(%2, %3)     \n\t"
            : : "r" (s1), "r" (s2), "r" (block+64), "r" (offset)
            : "memory");
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}

compiles to:

0000000000000000 <diff_pixels_mmx>:
   0:   0f ef ff                pxor   %mm7,%mm7
   3:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
   a:   48 83 c7 40             add    $0x40,%rdi
   e:   66 90                   xchg   %ax,%ax
  10:   0f 6f 06                movq   (%rsi),%mm0
  13:   0f 6f 12                movq   (%rdx),%mm2
  16:   0f 6f c8                movq   %mm0,%mm1
  19:   0f 6f da                movq   %mm2,%mm3
  1c:   0f 60 c7                punpcklbw %mm7,%mm0
  1f:   0f 68 cf                punpckhbw %mm7,%mm1
  22:   0f 60 d7                punpcklbw %mm7,%mm2
  25:   0f 68 df                punpckhbw %mm7,%mm3
  28:   0f f9 c2                psubw  %mm2,%mm0
  2b:   0f f9 cb                psubw  %mm3,%mm1
  2e:   0f 7f 04 07             movq   %mm0,(%rdi,%rax,1)
  32:   0f 7f 4c 07 08          movq   %mm1,0x8(%rdi,%rax,1)
  37:   48 01 ce                add    %rcx,%rsi
  3a:   48 01 ca                add    %rcx,%rdx
  3d:   48 83 c0 10             add    $0x10,%rax
  41:   75 cd                   jne    10 <diff_pixels_mmx+0x10>
  43:   f3 c3                   repz retq
  45:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)
  4c:   00 00 00 00

This is the intrinsic version:

#include <stdint.h>
#include <mmintrin.h>

void diff_pixels_mmx3(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
    long offset = -128;
    block += 64;
    __m64 mm7 = _mm_setzero_si64();
    do {
        __m64 mm0 = *(__m64*)s1;
        __m64 mm2 = *(__m64*)s2;
        __m64 mm1 = mm0;
        __m64 mm3 = mm2;
        mm0 = _mm_unpacklo_pi8(mm0, mm7);
        mm1 = _mm_unpackhi_pi8(mm1, mm7);
        mm2 = _mm_unpacklo_pi8(mm2, mm7);
        mm3 = _mm_unpackhi_pi8(mm3, mm7);
        mm0 = _mm_sub_pi16(mm0, mm2);
        mm1 = _mm_sub_pi16(mm1, mm3);
        *(__m64*)(block+offset)   = mm0;
        *(__m64*)(block+offset+8) = mm1;
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}

compiles to:

00000000000000c0 <diff_pixels_mmx3>:
  c0:   53                      push   %rbx
  c1:   0f ef e4                pxor   %mm4,%mm4
  c4:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
  cb:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  d0:   0f 6f 0e                movq   (%rsi),%mm1
  d3:   48 01 ce                add    %rcx,%rsi
  d6:   0f 6f 02                movq   (%rdx),%mm0
  d9:   48 01 ca                add    %rcx,%rdx
  dc:   0f 6f d1                movq   %mm1,%mm2
  df:   0f 6f d9                movq   %mm1,%mm3
  e2:   0f 6f c8                movq   %mm0,%mm1
  e5:   0f 68 c4                punpckhbw %mm4,%mm0
  e8:   0f 60 d4                punpcklbw %mm4,%mm2
  eb:   0f 68 dc                punpckhbw %mm4,%mm3
  ee:   0f 60 cc                punpcklbw %mm4,%mm1
  f1:   0f f9 d8                psubw  %mm0,%mm3
  f4:   0f f9 d1                psubw  %mm1,%mm2
  f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
  fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
 101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
 106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
 10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
 110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
 115:   48 83 c0 10             add    $0x10,%rax
 119:   75 b5                   jne    d0 <diff_pixels_mmx3+0x10>
 11b:   5b                      pop    %rbx
 11c:   c3                      retq

Flags used are: -O2 -march=k8
Compiler: gcc 4.2.3 (gentoo), x86_64

As you can see, in the intrinsic version gcc spills the mmx registers to the stack, reloads them from the stack into an integer register, and only then writes to the destination. Why? I don't know whether earlier gcc versions produced such stupid code. Compiling as 32-bit does similar stupidity, though there gcc at least reloads into an mmx register...

(As a side note: why does gcc want to use rbx, requiring a push and pop? I think other registers are free...)
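Since -march=k8 implies SSE2 anyway, one way to sidestep __m64 entirely would be an SSE2 version of the loop. Here is a minimal sketch (untested, and the name diff_pixels_sse2 is just mine); it does the same widening subtract but with one 16-byte store per row instead of two movq stores, so there is no 64-bit MMX value for gcc to bounce through the stack:

#include <stdint.h>
#include <emmintrin.h>

void diff_pixels_sse2(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
    long offset = -128;
    block += 64;
    __m128i zero = _mm_setzero_si128();
    do {
        /* load 8 pixels from each source into the low 64 bits */
        __m128i a = _mm_loadl_epi64((const __m128i *)s1);
        __m128i b = _mm_loadl_epi64((const __m128i *)s2);
        /* zero-extend the bytes to 8 x 16-bit words */
        a = _mm_unpacklo_epi8(a, zero);
        b = _mm_unpacklo_epi8(b, zero);
        /* one unaligned 16-byte store replaces the two movq stores */
        _mm_storeu_si128((__m128i *)(block + offset),
                         _mm_sub_epi16(a, b));
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}

I haven't checked what code 4.2.3 actually emits for this, but at least the __m64-through-the-stack dance cannot happen.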
bye,
--
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V