On Mon, Feb 21, 2022 at 10:29 AM Hongyu Wang <hongyu.w...@intel.com> wrote: > > Hi, > > For cmpxchg, it is commonly used in spin loop, and several user code > such as pthread directly takes cmpxchg as loop condition, which cause > huge cache bouncing. > > This patch extends previous implementation to relax all cmpxchg > instruction under -mrelax-cmpxchg-loop with an extra atomic load, > compare and emulate the failed cmpxchg behavior. > > For original spin loop which looks like > > loop: mov %eax,%r8d > or $1,%r8d > lock cmpxchg %r8d,(%rdi) > jne loop > > It will now truns to > > loop: mov %eax,%r8d > or $1,%r8d > mov (%r8),%rsi <--- load lock first > cmp %rsi,%rax <--- compare with expected input > jne .L2 <--- lock ne expected > lock cmpxchg %r8d,(%rdi) > jne loop > L2: mov %rsi,%rax <--- perform the behavior of failed cmpxchg > jne loop > > under -mrelax-cmpxchg-loop. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} > > OK for master? > > gcc/ChangeLog: > > PR target/103069 > * config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop): > Split atomic fetch and loop part. > (ix86_expand_cmpxchg_loop): New expander for cmpxchg loop. > * config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New > prototype. > * config/i386/sync.md (atomic_compare_and_swap<mode>): Call new > expander under TARGET_RELAX_CMPXCHG_LOOP. > (atomic_compare_and_swap<mode>): Likewise for doubleword modes. > > gcc/testsuite/ChangeLog: > > PR target/103069 > * gcc.target/i386/pr103069-2.c: Adjust result check. > * gcc.target/i386/pr103069-3.c: New test. > * gcc.target/i386/pr103069-4.c: Likewise.
OK. Thanks, Uros. > --- > gcc/config/i386/i386-expand.cc | 153 +++++++++++++++------ > gcc/config/i386/i386-protos.h | 2 + > gcc/config/i386/sync.md | 65 +++++---- > gcc/testsuite/gcc.target/i386/pr103069-2.c | 4 +- > gcc/testsuite/gcc.target/i386/pr103069-3.c | 24 ++++ > gcc/testsuite/gcc.target/i386/pr103069-4.c | 43 ++++++ > 6 files changed, 226 insertions(+), 65 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-4.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index ce9607e36de..6cf1a0b9cb6 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -23203,16 +23203,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, > rtx mem, rtx val, > enum rtx_code code, bool after, > bool doubleword) > { > - rtx old_reg, new_reg, old_mem, success, oldval, new_mem; > - rtx_code_label *loop_label, *pause_label, *done_label; > + rtx old_reg, new_reg, old_mem, success; > machine_mode mode = GET_MODE (target); > + rtx_code_label *loop_label = NULL; > > old_reg = gen_reg_rtx (mode); > new_reg = old_reg; > - loop_label = gen_label_rtx (); > - pause_label = gen_label_rtx (); > - done_label = gen_label_rtx (); > old_mem = copy_to_reg (mem); > + loop_label = gen_label_rtx (); > emit_label (loop_label); > emit_move_insn (old_reg, old_mem); > > @@ -23234,50 +23232,125 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, > rtx mem, rtx val, > if (after) > emit_move_insn (target, new_reg); > > - /* Load memory again inside loop. */ > - new_mem = copy_to_reg (mem); > - /* Compare mem value with expected value. */ > + success = NULL_RTX; > + > + ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg, > + gen_int_mode (MEMMODEL_SYNC_SEQ_CST, > + SImode), > + doubleword, loop_label); > +} > + > +/* Relax cmpxchg instruction, param loop_label indicates whether > + the instruction should be relaxed with a pause loop. If not, > + it will be relaxed to an atomic load + compare, and skip > + cmpxchg instruction if mem != exp_input. */ > + > +void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, > + rtx mem, rtx exp_input, rtx new_input, > + rtx mem_model, bool doubleword, > + rtx_code_label *loop_label) > +{ > + rtx_code_label *cmp_label = NULL; > + rtx_code_label *done_label = NULL; > + rtx target_bool = NULL_RTX, new_mem = NULL_RTX; > + rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL; > + rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL; > + machine_mode mode = GET_MODE (target_val), hmode = mode; > + > + if (*ptarget_bool == NULL) > + target_bool = gen_reg_rtx (QImode); > + else > + target_bool = *ptarget_bool; > + > + cmp_label = gen_label_rtx (); > + done_label = gen_label_rtx (); > + > + new_mem = gen_reg_rtx (mode); > + /* Load memory first. */ > + expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST); > + > + switch (mode) > + { > + case TImode: > + gendw = gen_atomic_compare_and_swapti_doubleword; > + hmode = DImode; > + break; > + case DImode: > + if (doubleword) > + { > + gendw = gen_atomic_compare_and_swapdi_doubleword; > + hmode = SImode; > + } > + else > + gen = gen_atomic_compare_and_swapdi_1; > + break; > + case SImode: > + gen = gen_atomic_compare_and_swapsi_1; break; > + case HImode: > + gen = gen_atomic_compare_and_swaphi_1; break; > + case QImode: > + gen = gen_atomic_compare_and_swapqi_1; break; > + default: > + gcc_unreachable (); > + } > > + /* Compare mem value with expected value. */ > if (doubleword) > { > - machine_mode half_mode = (mode == DImode)? SImode : DImode; > - rtx low_new_mem = gen_lowpart (half_mode, new_mem); > - rtx low_old_mem = gen_lowpart (half_mode, old_mem); > - rtx high_new_mem = gen_highpart (half_mode, new_mem); > - rtx high_old_mem = gen_highpart (half_mode, old_mem); > - emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX, > - half_mode, 1, pause_label, > + rtx low_new_mem = gen_lowpart (hmode, new_mem); > + rtx low_exp_input = gen_lowpart (hmode, exp_input); > + rtx high_new_mem = gen_highpart (hmode, new_mem); > + rtx high_exp_input = gen_highpart (hmode, exp_input); > + emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX, > + hmode, 1, cmp_label, > profile_probability::guessed_never ()); > - emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX, > - half_mode, 1, pause_label, > + emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX, > + hmode, 1, cmp_label, > profile_probability::guessed_never ()); > } > else > - emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX, > - GET_MODE (old_mem), 1, pause_label, > + emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX, > + GET_MODE (exp_input), 1, cmp_label, > profile_probability::guessed_never ()); > > - success = NULL_RTX; > - oldval = old_mem; > - expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg, > - new_reg, false, MEMMODEL_SYNC_SEQ_CST, > - MEMMODEL_RELAXED); > - if (oldval != old_mem) > - emit_move_insn (old_mem, oldval); > - > - emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx, > - GET_MODE (success), 1, loop_label, > - profile_probability::guessed_never ()); > - > - emit_jump_insn (gen_jump (done_label)); > - emit_barrier (); > - > - /* If mem is not expected, pause and loop back. */ > - emit_label (pause_label); > - emit_insn (gen_pause ()); > - emit_jump_insn (gen_jump (loop_label)); > - emit_barrier (); > - emit_label (done_label); > + /* Directly emits cmpxchg here. */ > + if (doubleword) > + emit_insn (gendw (target_val, mem, exp_input, > + gen_lowpart (hmode, new_input), > + gen_highpart (hmode, new_input), > + mem_model)); > + else > + emit_insn (gen (target_val, mem, exp_input, new_input, mem_model)); > + > + if (!loop_label) > + { > + emit_jump_insn (gen_jump (done_label)); > + emit_barrier (); > + emit_label (cmp_label); > + emit_move_insn (target_val, new_mem); > + emit_label (done_label); > + ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG), > + const0_rtx); > + } > + else > + { > + ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG), > + const0_rtx); > + emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx, > + GET_MODE (target_bool), 1, loop_label, > + profile_probability::guessed_never ()); > + emit_jump_insn (gen_jump (done_label)); > + emit_barrier (); > + > + /* If mem is not expected, pause and loop back. */ > + emit_label (cmp_label); > + emit_insn (gen_pause ()); > + emit_jump_insn (gen_jump (loop_label)); > + emit_barrier (); > + emit_label (done_label); > + } > + > + *ptarget_bool = target_bool; > } > > #include "gt-i386-expand.h" > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index b7e9aa75d25..d5e11259a5a 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -221,6 +221,8 @@ extern void ix86_split_mmx_punpck (rtx[], bool); > extern void ix86_expand_avx_vzeroupper (void); > extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, > bool, bool); > +extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, > + bool, rtx_code_label *); > > #ifdef TREE_CODE > extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); > diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md > index 36417c54c11..820e9ca911a 100644 > --- a/gcc/config/i386/sync.md > +++ b/gcc/config/i386/sync.md > @@ -373,11 +373,20 @@ (define_expand "atomic_compare_and_swap<mode>" > (match_operand:SI 7 "const_int_operand")] ;; failure model > "TARGET_CMPXCHG" > { > - emit_insn > - (gen_atomic_compare_and_swap<mode>_1 > - (operands[1], operands[2], operands[3], operands[4], operands[6])); > - ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), > - const0_rtx); > + if (TARGET_RELAX_CMPXCHG_LOOP) > + { > + ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2], > + operands[3], operands[4], operands[6], > + false, NULL); > + } > + else > + { > + emit_insn > + (gen_atomic_compare_and_swap<mode>_1 > + (operands[1], operands[2], operands[3], operands[4], operands[6])); > + ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), > + const0_rtx); > + } > DONE; > }) > > @@ -397,25 +406,35 @@ (define_expand "atomic_compare_and_swap<mode>" > (match_operand:SI 7 "const_int_operand")] ;; failure model > "TARGET_CMPXCHG" > { > - if (<MODE>mode == DImode && TARGET_64BIT) > - { > - emit_insn > - (gen_atomic_compare_and_swapdi_1 > - (operands[1], operands[2], operands[3], operands[4], operands[6])); > - } > + int doubleword = !(<MODE>mode == DImode && TARGET_64BIT); > + if (TARGET_RELAX_CMPXCHG_LOOP) > + { > + ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2], > + operands[3], operands[4], operands[6], > + doubleword, NULL); > + } > else > - { > - machine_mode hmode = <CASHMODE>mode; > - > - emit_insn > - (gen_atomic_compare_and_swap<mode>_doubleword > - (operands[1], operands[2], operands[3], > - gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]), > - operands[6])); > - } > - > - ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), > - const0_rtx); > + { > + if (!doubleword) > + { > + emit_insn > + (gen_atomic_compare_and_swapdi_1 > + (operands[1], operands[2], operands[3], operands[4], operands[6])); > + } > + else > + { > + machine_mode hmode = <CASHMODE>mode; > + > + emit_insn > + (gen_atomic_compare_and_swap<mode>_doubleword > + (operands[1], operands[2], operands[3], > + gen_lowpart (hmode, operands[4]), gen_highpart (hmode, > operands[4]), > + operands[6])); > + } > + > + ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG), > + const0_rtx); > + } > DONE; > }) > > diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c > b/gcc/testsuite/gcc.target/i386/pr103069-2.c > index b3f2235fd55..2ceb54e013a 100644 > --- a/gcc/testsuite/gcc.target/i386/pr103069-2.c > +++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c > @@ -40,12 +40,12 @@ FUNC_ATOMIC_RELAX (char, xor) > TYPE c = 11, d = 101; \ > res = relax_##TYPE##_##OP##_fetch (&a, b); \ > exp = f_##TYPE##_##OP##_fetch (&c, d); \ > - if (res != exp) \ > + if (res != exp || a != c) \ > abort (); \ > a = c = 21, b = d = 92; \ > res = relax_##TYPE##_fetch_##OP (&a, b); \ > exp = f_##TYPE##_fetch_##OP (&c, d); \ > - if (res != exp) \ > + if (res != exp || a != c) \ > abort (); \ > } > > diff --git a/gcc/testsuite/gcc.target/i386/pr103069-3.c > b/gcc/testsuite/gcc.target/i386/pr103069-3.c > new file mode 100644 > index 00000000000..fee708905ec > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103069-3.c > @@ -0,0 +1,24 @@ > +/* PR target/103068 */ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic > -mrelax-cmpxchg-loop" } */ > + > +#include <stdint.h> > + > +#define FUNC_CMPXCHG(TYPE) \ > +__attribute__ ((noinline, noclone)) \ > +TYPE f_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval) \ > +{ \ > + do \ > + { \ > + newval = oldval | 1; \ > + } while (! __atomic_compare_exchange_n (lock, &oldval, newval, \ > + 0, __ATOMIC_RELEASE, \ > + __ATOMIC_RELAXED)); \ > + return *lock; \ > +} > + > + > +FUNC_CMPXCHG (int64_t) > +FUNC_CMPXCHG (int) > +FUNC_CMPXCHG (short) > +FUNC_CMPXCHG (char) > diff --git a/gcc/testsuite/gcc.target/i386/pr103069-4.c > b/gcc/testsuite/gcc.target/i386/pr103069-4.c > new file mode 100644 > index 00000000000..5abcbca2a98 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103069-4.c > @@ -0,0 +1,43 @@ > +/* PR target/103069 */ > +/* { dg-do run } */ > +/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */ > + > +#include <stdlib.h> > +#include "pr103069-3.c" > + > +#define FUNC_CMPXCHG_RELAX(TYPE) \ > +__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop"))) \ > +TYPE relax_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval) \ > +{ \ > + do \ > + { \ > + newval = oldval | 1; \ > + } while (! __atomic_compare_exchange_n (lock, &oldval, newval, \ > + 0, __ATOMIC_RELEASE, \ > + __ATOMIC_RELAXED)); \ > + return *lock; \ > +} > + > +FUNC_CMPXCHG_RELAX (int64_t) > +FUNC_CMPXCHG_RELAX (int) > +FUNC_CMPXCHG_RELAX (short) > +FUNC_CMPXCHG_RELAX (char) > + > +#define TEST_CMPXCHG_LOOP(TYPE) \ > +{ \ > + TYPE a = 11, b = 20, c = 11, res, exp; \ > + TYPE d = 11, e = 20, f = 11; \ > + res = relax_##TYPE##_cmpxchg (&a, b, c); \ > + exp = f_##TYPE##_cmpxchg (&d, e, f); \ > + if (res != exp || a != d) \ > + abort (); \ > +} > + > +int main (void) > +{ > + TEST_CMPXCHG_LOOP (int64_t) > + TEST_CMPXCHG_LOOP (int) > + TEST_CMPXCHG_LOOP (short) > + TEST_CMPXCHG_LOOP (char) > + return 0; > +} > -- > 2.18.1 >