https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66867
--- Comment #3 from Sebastian Huber <sebastian.hu...@embedded-brains.de> ---

clang 3.7 generates optimal code on x86 in both cases:

        .text
        .file "test.c"
        .globl f
        .align 16, 0x90
        .type f,@function
f:                                      # @f
        .cfi_startproc
# BB#0:
        movl $1, %ecx
        xorl %eax, %eax
        lock cmpxchgl %ecx, (%rdi)
        retq
.Lfunc_end0:
        .size f, .Lfunc_end0-f
        .cfi_endproc

        .globl g
        .align 16, 0x90
        .type g,@function
g:                                      # @g
        .cfi_startproc
# BB#0:
        movl $1, %ecx
        xorl %eax, %eax
        lock cmpxchgl %ecx, (%rdi)
        retq
.Lfunc_end1:
        .size g, .Lfunc_end1-g
        .cfi_endproc

        .ident "clang version 3.7.0 (tags/RELEASE_370/final)"
        .section ".note.GNU-stack","",@progbits