Hello! As shown in the PR, the attached patch substantially improves the generated code when a cmpxchg{8,16}b insn is involved. The following testcase:
--cut here--
__int128_t i;

int main()
{
  __atomic_store_16(&i, -1, 0);
  if (i != -1)
    __builtin_abort();
  return 0;
}
--cut here--

compiles with -O2 -mcx16 to:

        movq    i(%rip), %rax
        movq    $-1, %rcx
        movq    i+8(%rip), %rdx
.L2:
        movq    %rcx, %rbx
        lock cmpxchg16b i(%rip)
        jne     .L2

where without the patch, the compiler generated:

        movq    i(%rip), %rsi
        movq    $-1, %rcx
        movq    i+8(%rip), %rdi
.L2:
        movq    %rsi, %rax
        movq    %rdi, %rdx
        movq    %rcx, %rbx
        lock cmpxchg16b i(%rip)
        movq    %rdx, %rdi
        movq    %rax, %rsi
        jne     .L2

2015-03-31  Uros Bizjak  <ubiz...@gmail.com>

        PR target/58945
        * config/i386/sync.md (atomic_compare_and_swap<dwi>_doubleword): Do not split operands 0 and operands 2 to halfmode.
        (atomic_compare_and_swap<mode>): Update for atomic_compare_and_swap<dwi>_doubleword changes.

Patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32} and was committed to mainline.

Uros.
Index: config/i386/sync.md =================================================================== --- config/i386/sync.md (revision 221786) +++ config/i386/sync.md (working copy) @@ -351,21 +351,12 @@ else { machine_mode hmode = <CASHMODE>mode; - rtx lo_o, lo_e, lo_n, hi_o, hi_e, hi_n; - lo_o = operands[1]; - lo_e = operands[3]; - lo_n = operands[4]; - hi_o = gen_highpart (hmode, lo_o); - hi_e = gen_highpart (hmode, lo_e); - hi_n = gen_highpart (hmode, lo_n); - lo_o = gen_lowpart (hmode, lo_o); - lo_e = gen_lowpart (hmode, lo_e); - lo_n = gen_lowpart (hmode, lo_n); - emit_insn (gen_atomic_compare_and_swap<mode>_doubleword - (lo_o, hi_o, operands[2], lo_e, hi_e, lo_n, hi_n, operands[6])); + (operands[1], operands[2], operands[3], + gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]), + operands[6])); } ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), @@ -389,31 +380,26 @@ "lock{%;} %K4cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}") ;; For double-word compare and swap, we are obliged to play tricks with -;; the input newval (op5:op6) because the Intel register numbering does +;; the input newval (op3:op4) because the Intel register numbering does ;; not match the gcc register numbering, so the pair must be CX:BX. -;; That said, in order to take advantage of possible lower-subreg opts, -;; treat all of the integral operands in the same way. 
(define_mode_attr doublemodesuffix [(SI "8") (DI "16")]) (define_insn "atomic_compare_and_swap<dwi>_doubleword" - [(set (match_operand:DWIH 0 "register_operand" "=a") - (unspec_volatile:DWIH - [(match_operand:<DWI> 2 "memory_operand" "+m") - (match_operand:DWIH 3 "register_operand" "0") - (match_operand:DWIH 4 "register_operand" "1") - (match_operand:DWIH 5 "register_operand" "b") - (match_operand:DWIH 6 "register_operand" "c") - (match_operand:SI 7 "const_int_operand")] + [(set (match_operand:<DWI> 0 "register_operand" "=A") + (unspec_volatile:<DWI> + [(match_operand:<DWI> 1 "memory_operand" "+m") + (match_operand:<DWI> 2 "register_operand" "0") + (match_operand:DWIH 3 "register_operand" "b") + (match_operand:DWIH 4 "register_operand" "c") + (match_operand:SI 5 "const_int_operand")] UNSPECV_CMPXCHG)) - (set (match_operand:DWIH 1 "register_operand" "=d") - (unspec_volatile:DWIH [(const_int 0)] UNSPECV_CMPXCHG)) - (set (match_dup 2) + (set (match_dup 1) (unspec_volatile:<DWI> [(const_int 0)] UNSPECV_CMPXCHG)) (set (reg:CCZ FLAGS_REG) (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))] "TARGET_CMPXCHG<doublemodesuffix>B" - "lock{%;} %K7cmpxchg<doublemodesuffix>b\t%2") + "lock{%;} %K5cmpxchg<doublemodesuffix>b\t%1") ;; For operand 2 nonmemory_operand predicate is used instead of ;; register_operand to allow combiner to better optimize atomic