http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49035
--- Comment #1 from Ryan Johnson <scovich at gmail dot com> 2011-05-18 02:56:23 UTC ---
Update: using __attribute__((noinline)) it is actually possible to force the
compiler to do the right thing, though it makes the code significantly less
readable:

=== example.cpp ============
struct link {
    link* prev;
    long go_slow;
    void frob(link* parent, link* grandparent);
};

link* __attribute__((noinline)) foo_slow(link* list, link* prev) {
    do {
        link* pprev = __sync_lock_test_and_set(&prev->prev, 0);
        pprev->frob(prev, list);
        prev = pprev;
    } while (__builtin_expect(prev->go_slow, 0));
    return prev;
}

link* foo_fast(link* list) {
    link* prev = list->prev;
    if (__builtin_expect(prev->go_slow, 0))
        return foo_slow(list, prev);
    return prev;
}
=== example.cpp ============

The above compiles down to something much better, though the calling convention
requires an extra movq and there are more jumps than necessary (the compiler
apparently never emits a tail call as a conditional jump):

_Z8foo_fastP4link:
        movq    (%rdi), %rax
        cmpq    $0, 8(%rax)
        jne     .L7
        rep ret
.L7:
        movq    %rax, %rsi
        jmp     _Z8foo_slowP4linkS0_

_Z8foo_slowP4linkS0_:
        movq    %rbp, -16(%rsp)
        movq    %r12, -8(%rsp)
        xorl    %ebp, %ebp
        movq    %rbx, -24(%rsp)
        movq    %rdi, %r12
        subq    $24, %rsp
.L2:
        movq    %rbp, %rbx
        xchgq   (%rsi), %rbx
        movq    %r12, %rdx
        movq    %rbx, %rdi
        call    _ZN4link4frobEPS_S0_
        cmpq    $0, 8(%rbx)
        jne     .L3
        movq    %rbx, %rax
        movq    8(%rsp), %rbp
        movq    (%rsp), %rbx
        movq    16(%rsp), %r12
        addq    $24, %rsp
        ret
.L3:
        movq    %rbx, %rsi
        jmp     .L2
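For reference, the same hot/cold-split technique can be sketched in isolation: the
rarely-taken path is marked __attribute__((noinline)) and the branch leading to it
is annotated with __builtin_expect, so only the small fast path gets inlined at
call sites. The sketch below is illustrative only and not part of the test case
above; the names bump_fast/bump_slow and the counter example are assumptions.

=== hot-cold split sketch (illustrative, not from the report) ============
#include <cstdio>

struct counter {
    unsigned long value;
    unsigned long overflows;
};

// Cold path: forced out of line so callers only ever inline the cheap check.
// (Hypothetical example, mirroring the foo_slow/foo_fast split above.)
static void __attribute__((noinline)) bump_slow(counter* c) {
    ++c->overflows;
    c->value = 0;
}

// Hot path: small enough to inline everywhere; the unlikely branch
// falls through to the noinline slow path.
static inline void bump_fast(counter* c, unsigned long limit) {
    if (__builtin_expect(++c->value >= limit, 0))
        bump_slow(c);
}

int main() {
    counter c = {0, 0};
    for (int i = 0; i < 1000; ++i)
        bump_fast(&c, 256);
    std::printf("value=%lu overflows=%lu\n", c.value, c.overflows);
    return 0;
}
=== hot-cold split sketch (illustrative, not from the report) ============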