The only way to get the following sample code to run properly is to compile it with -O3 or -Os (this code has been designed to illustrate a point; it results from a thorough examination of more complex code):
//--------------------------------------------------------- #include <stdio.h> unsigned long hehe(int vv) { unsigned long result; asm volatile ( "movq $0x0102030405060708, %%rax\n\t" "mov %1, %%eax\n\t" : "=a" (result) : "r" (vv) ); return result; } int main(int argc, char** argv) { printf("%lx\n", hehe(42)); } //--------------------------------------------------------- The expected output is 42. Compiled with -O3 or -Os, the program gives 42; compiled with no option, the program gives 0x05060708. The reason seems to be that the compiler does not take into account the dependency between %rax and %eax (where %eax contains the lower 32 bits of %rax). Here is the wrong code generated: unsigned long hehe(int vv) { 4004e4: 55 push %rbp 4004e5: 48 89 e5 mov %rsp,%rbp 4004e8: 89 7d ec mov %edi,-0x14(%rbp) unsigned long result; asm volatile ( 4004eb: 8b 45 ec mov -0x14(%rbp),%eax 4004ee: 48 b8 08 07 06 05 04 mov $0x102030405060708,%rax 4004f5: 03 02 01 // the following line is crazy! 4004f8: 89 c0 mov %eax,%eax 4004fa: 48 89 45 f8 mov %rax,-0x8(%rbp) "movq $0x0102030405060708, %%rax\n\t" "mov %1, %%eax\n\t" : "=a" (result) : "r" (vv) ); return result; 4004fe: 48 8b 45 f8 mov -0x8(%rbp),%rax } 400502: c9 leaveq 400503: c3 retq The following code faces the same type of problem but with a more radical register allocation error: inline int locateEndToken(char* data, unsigned long pos, unsigned long max) { int result; unsigned long tmp; asm volatile ( "mov $0x0A0D20, %%rax\n\t" "movq %%rax, %%xmm15\n\t" "lea -16(%2,%3,1),%1\n\t" "add %2, %4\n\t" ".align 16\n\t" "1:\n\t" "add $16, %1\n\t" "cmp %4, %1\n\t" "jge 2f\n\t" "pcmpistri $0,(%1),%%xmm15\n\t" "jnc 1b\n\t" "lea 0x1(%1,%%rcx,1),%%rax\n\t" "cmpb $0x020, -1(%%rax)\n\t" "je 4f\n\t" "2:\n\t" "movq $-1, %%rax\n\t" "jmp 3f\n\t" "4:\n\t" "sub %2, %%rax\n\t" "3:\n\t" : "=a" (result), "=r" (tmp) //%0, %1 : "r" (data), //%2 "r" (pos), //%3 "r" (max) //%4 : "%rcx" // pcmpistri modifies rcx ); return result; } Works perfectly in -O3 but 
allocates the same register for %1 and %4 in other circumstances. Thus the test "cmp %4, %1" at the sixth line becomes "cmp %1, %1", which does not represent the right functional test. Bottom line: using ASM can get things 15 times faster, but the generated code MUST be manually validated!!! This is a nightmare. (Tests have been done with 4.4.1, 4.4.2, 4.4.3.) -- Summary: Lack of dependency between %rax and %eax in inline assembly Product: gcc Version: 4.4.3 Status: UNCONFIRMED Severity: major Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: ff at vedicis dot com GCC host triplet: Debian 5.02 x86_64 GCC target triplet: X86_64 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43110