This way we can acquire the lock with xchg+test, instead of test+xchg+test. Most spinlocks should be uncontended so this should result in a net performance gain.
Before: 4ad957: eb 09 jmp 4ad962 <qht_insert+0x32> 4ad959: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 4ad960: f3 90 pause 4ad962: 8b 03 mov (%rbx),%eax 4ad964: 85 c0 test %eax,%eax 4ad966: 75 f8 jne 4ad960 <qht_insert+0x30> 4ad968: 89 f8 mov %edi,%eax 4ad96a: 87 03 xchg %eax,(%rbx) 4ad96c: 85 c0 test %eax,%eax 4ad96e: 75 f2 jne 4ad962 <qht_insert+0x32> After: 4ad980: 89 f8 mov %edi,%eax 4ad982: 87 03 xchg %eax,(%rbx) 4ad984: 85 c0 test %eax,%eax 4ad986: 74 12 je 4ad99a <qht_insert+0x4a> 4ad988: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1) 4ad98f: 00 4ad990: 8b 03 mov (%rbx),%eax 4ad992: 85 c0 test %eax,%eax 4ad994: 74 ea je 4ad980 <qht_insert+0x30> 4ad996: f3 90 pause 4ad998: eb f6 jmp 4ad990 <qht_insert+0x40> Signed-off-by: Emilio G. Cota <c...@braap.org> --- include/qemu/thread.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 599965e..e2af57c 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -74,11 +74,11 @@ static inline void qemu_spin_init(QemuSpin *spin) static inline void qemu_spin_lock(QemuSpin *spin) { - do { + while (atomic_xchg(&spin->value, true)) { while (atomic_read(&spin->value)) { cpu_relax(); } - } while (atomic_xchg(&spin->value, true)); + } } static inline int qemu_spin_trylock(QemuSpin *spin) -- 2.5.0