The only way to get the following sample code to run properly is to compile it
with -O3 or -Os (this code has been designed to illustrate a point; it results
from a thorough examination of more complex code):

//---------------------------------------------------------
#include <stdio.h>

/*
 * Reproducer from the bug report: loads a 64-bit constant into %rax, then
 * moves the 32-bit argument vv into %eax, and returns whatever ends up in
 * the "a" register.  The reporter expects the returned value to equal vv.
 *
 * NOTE(review): the output operand is "=a" without the early-clobber
 * modifier ('&'), yet the template writes %rax before it reads input %1.
 * GCC documents that an input may share a register with an output unless
 * the output is marked early-clobber.  In the unoptimized listing quoted in
 * this report, %1 was placed in %eax, so the second instruction became
 * "mov %eax,%eax" -- this likely explains the observed result; confirm
 * against the GCC Extended Asm documentation.
 */
unsigned long hehe(int vv) {
        unsigned long result;
        asm volatile (
                // Overwrites the whole of %rax (the output register) first...
                "movq $0x0102030405060708, %%rax\n\t"
                // ...and only then reads input operand %1 (vv).
                "mov %1, %%eax\n\t"
                : "=a" (result)   // %0: result, pinned to the "a" register
                : "r" (vv)        // %1: vv, in any general register
        );
        return result;
}

/* Driver: calls the reproducer once with 42 and prints the result in hex. */
int main(int argc, char** argv) {
        const unsigned long value = hehe(42);

        printf("%lx\n", value);
}
//---------------------------------------------------------

The expected output is 42 (decimal; the %lx format prints it as 2a).

Compiled with -O3 or -Os, the program gives 42.
Compiled with no optimization option, the program gives 0x05060708.


The reason seems to be that the compiler does not take into account the
dependency between %rax and %eax (where %eax contains the lower 32 bits of
%rax). Here is the wrong code generated:

unsigned long hehe(int vv) {
  4004e4:       55                      push   %rbp
  4004e5:       48 89 e5                mov    %rsp,%rbp
  4004e8:       89 7d ec                mov    %edi,-0x14(%rbp)
        unsigned long result;
        asm volatile (
  4004eb:       8b 45 ec                mov    -0x14(%rbp),%eax
  4004ee:       48 b8 08 07 06 05 04    mov    $0x102030405060708,%rax
  4004f5:       03 02 01
// the following line is crazy!
  4004f8:       89 c0                   mov    %eax,%eax
  4004fa:       48 89 45 f8             mov    %rax,-0x8(%rbp)
                "movq $0x0102030405060708, %%rax\n\t"
                "mov %1, %%eax\n\t"
                : "=a" (result)
                : "r" (vv)
        );
        return result;
  4004fe:       48 8b 45 f8             mov    -0x8(%rbp),%rax
}
  400502:       c9                      leaveq
  400503:       c3                      retq



The following code faces the same type of problem but with a more radical
register allocation error:

/*
 * Second reproducer from the bug report: scans memory in 16-byte steps,
 * starting at data + pos and stopping once the cursor reaches data + max,
 * looking for one of the bytes 0x20, 0x0D, 0x0A (the constant 0x0A0D20
 * loaded into %xmm15).
 *
 * As written, the asm leaves in %rax:
 *   - (address of the matched byte + 1) - data, when the first matching
 *     byte found in a chunk is 0x20;
 *   - -1 when the cursor reaches data + max, or when the first matching
 *     byte in a chunk is not 0x20 (the scan does not continue past it).
 *
 * tmp is only a scratch register for the scan cursor (%1).
 *
 * NOTE(review), to confirm against the GCC Extended Asm documentation:
 *   - %1 is "=r" without the early-clobber modifier ('&') but is written
 *     (lea) before inputs %2 and %4 are last read; the report states that
 *     non -O3 builds gave %1 and %4 the same register.
 *   - %4 is declared as an input-only operand but is modified by
 *     "add %2, %4".
 *   - %xmm15 is overwritten but does not appear in the clobber list.
 */
inline int locateEndToken(char* data, unsigned long pos, unsigned long max) {
        int result;
        unsigned long tmp;
        asm  volatile (
                // Build the search set in %xmm15 (via %rax).
                "mov $0x0A0D20, %%rax\n\t"
                "movq %%rax, %%xmm15\n\t"
                // %1 = data + pos - 16 (the loop pre-increments by 16).
                "lea    -16(%2,%3,1),%1\n\t"
                // %4 = data + max, i.e. the end address of the scan.
                "add %2, %4\n\t"
                ".align 16\n\t"
"1:\n\t"
                "add $16, %1\n\t"
                // Signed compare: leave with -1 once the cursor reaches the end.
                "cmp %4, %1\n\t"
                "jge 2f\n\t"
                // Presumably imm8 = 0 selects an "equal any" byte search with
                // the index of the first match in %rcx and CF set on a match
                // -- confirm against the Intel SDM.
                "pcmpistri $0,(%1),%%xmm15\n\t"
                "jnc 1b\n\t"
                // %rax = address one past the matched byte.
                "lea    0x1(%1,%%rcx,1),%%rax\n\t"
                // Only a 0x20 match counts as success; anything else yields -1.
                "cmpb $0x020, -1(%%rax)\n\t"
                "je 4f\n\t"
"2:\n\t"
                "movq $-1, %%rax\n\t"
                "jmp 3f\n\t"
"4:\n\t"
                // Convert the address into an offset relative to data.
                "sub %2, %%rax\n\t"

"3:\n\t"
                : "=a" (result), "=r" (tmp)             //%0, %1
                :       "r" (data),     //%2
                        "r" (pos),      //%3
                        "r" (max)       //%4
                // NOTE(review): the bare `rcx` on the line after the clobber
                // comment looks like the tail of that comment, wrapped by the
                // mail archive; as written it is a stray token -- confirm
                // against the original report.
                : "%rcx"                                // pcmpistri modifies
rcx
         );
        return result;
}

Works perfectly with -O3 but allocates the same register for %1 and %4 in other
circumstances. Thus the test "cmp %4, %1" (the sixth instruction) becomes
"cmp %1, %1", which does not perform the intended comparison.

Bottom line, using ASM can get things 15 times faster, but the generated code
MUST be manually validated!!! This is a nightmare.

(tests have been done with 4.4.1, 4.4.2, 4.4.3)


-- 
           Summary: Lack of dependency between %rax and %eax in inline
                    assembly
           Product: gcc
           Version: 4.4.3
            Status: UNCONFIRMED
          Severity: major
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: ff at vedicis dot com
  GCC host triplet: Debian 5.02 x86_64
GCC target triplet: X86_64


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43110

Reply via email to