http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50168

             Bug #: 50168
           Summary: __builtin_ctz() and intrinsics __bsr(), __bsf()
                    generate suboptimal code on x86_64
    Classification: Unclassified
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: gp...@web.de


Testcase:

--------------------
#include <x86intrin.h>

/*
 * my_bsfq - bit scan forward on a 64-bit value via inline assembly.
 *
 * Emits a single x86-64 `bsfq` instruction and hands its destination
 * register back as a `long`, i.e. the result already has the same width
 * as the input operand.
 *
 * x: value to scan.  The caller must pass a non-zero value: per the Intel
 *    SDM, BSF leaves its destination undefined when the source is 0.
 *
 * Returns the index of the least significant set bit of x.
 *
 * `__asm__` is used instead of `asm` so that the testcase also builds in
 * strict ISO modes (-std=c99 / -std=c11), where GCC does not recognise the
 * plain `asm` keyword.
 */
static inline long my_bsfq(long x) __attribute__((__always_inline__));
static inline long my_bsfq(long x) {
    long result;
    __asm__(" bsfq %1, %0 \n"
        : "=r"(result)
        : "r"(x)
    );
    return result;
}

/* Lookup table with one slot per bit position of a 64-bit value; the
   functions below index it with the result of a bit scan. */
long c[64];

/*
 * Variant 1: index c[] with the __bsfq() intrinsic from <x86intrin.h>.
 * The intrinsic's result is used directly as the array subscript, so any
 * widening of that result to a 64-bit index is visible in the generated
 * code (the extra movslq in the disassembly of f).
 */
long f(long i) {
    return c[ __bsfq(i) ];
}

/*
 * Variant 2: index c[] with GCC's __builtin_ctzll() (count trailing zeros).
 * The builtin returns `int` per the GCC documentation, and the report
 * attributes the extra movslq in the disassembly of g to widening that
 * `int` to a 64-bit index.  The builtin's result is undefined for i == 0.
 */
long g(long i) {
    return c[ __builtin_ctzll(i) ];
}

/*
 * Variant 3 (reference): index c[] with the hand-written my_bsfq() wrapper.
 * my_bsfq() returns `long`, so the subscript is already 64 bits wide; the
 * disassembly of h shows the bsf feeding the load with no movslq.
 */
long h(long i) {
    return c[ my_bsfq(i) ];
}
----------------------



When I compile this with 'gcc -O3 -g testcase.c -c -o testcase.o
&& objdump -d testcase.o', I get



----------------------
0000000000000000 <f>:
   0:   48 0f bc ff             bsf    %rdi,%rdi
   4:   48 63 ff                movslq %edi,%rdi
   7:   48 8b 04 fd 00 00 00    mov    0x0(,%rdi,8),%rax
   e:   00 
   f:   c3                      retq   

0000000000000010 <g>:
  10:   48 0f bc ff             bsf    %rdi,%rdi
  14:   48 63 ff                movslq %edi,%rdi
  17:   48 8b 04 fd 00 00 00    mov    0x0(,%rdi,8),%rax
  1e:   00 
  1f:   c3                      retq   

0000000000000020 <h>:
  20:   48 0f bc ff             bsf    %rdi,%rdi
  24:   48 8b 04 fd 00 00 00    mov    0x0(,%rdi,8),%rax
  2b:   00 
  2c:   c3                      retq   
-----------------------



Please note the unneeded 32-to-64-bit sign extension 'movslq ...' inserted by
the compiler in functions f() and g(). They should look like h() instead.

I suspect the cause is the prototype of the builtin: its return type 'int'
does not match the "natural" return type on x86_64, which is 64 bit, the same
size as the input register.

If I replace the builtin/intrinsic with the hand-written inline asm version,
I get a nice speedup of 2% in my chess engine.

Reply via email to