On Mon, Feb 21, 2022 at 10:29 AM Hongyu Wang <hongyu.w...@intel.com> wrote:
>
> Hi,
>
> For cmpxchg, it is commonly used in spin loop, and several user code
> such as pthread directly takes cmpxchg as loop condition, which cause
> huge cache bouncing.
>
> This patch extends previous implementation to relax all cmpxchg
> instruction under -mrelax-cmpxchg-loop with an extra atomic load,
> compare and emulate the failed cmpxchg behavior.
>
> For original spin loop which looks like
>
> loop: mov    %eax,%r8d
>       or     $1,%r8d
>       lock cmpxchg %r8d,(%rdi)
>       jne    loop
>
> It will now truns to
>
> loop: mov    %eax,%r8d
>       or     $1,%r8d
>       mov    (%r8),%rsi <--- load lock first
>       cmp    %rsi,%rax <--- compare with expected input
>       jne    .L2 <--- lock ne expected
>       lock cmpxchg %r8d,(%rdi)
>       jne    loop
>   L2: mov    %rsi,%rax <--- perform the behavior of failed cmpxchg
>       jne    loop
>
> under -mrelax-cmpxchg-loop.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
>
> OK for master?
>
> gcc/ChangeLog:
>
>         PR target/103069
>         * config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop):
>         Split atomic fetch and loop part.
>         (ix86_expand_cmpxchg_loop): New expander for cmpxchg loop.
>         * config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New
>         prototype.
>         * config/i386/sync.md (atomic_compare_and_swap<mode>): Call new
>         expander under TARGET_RELAX_CMPXCHG_LOOP.
>         (atomic_compare_and_swap<mode>): Likewise for doubleword modes.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/103069
>         * gcc.target/i386/pr103069-2.c: Adjust result check.
>         * gcc.target/i386/pr103069-3.c: New test.
>         * gcc.target/i386/pr103069-4.c: Likewise.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc             | 153 +++++++++++++++------
>  gcc/config/i386/i386-protos.h              |   2 +
>  gcc/config/i386/sync.md                    |  65 +++++----
>  gcc/testsuite/gcc.target/i386/pr103069-2.c |   4 +-
>  gcc/testsuite/gcc.target/i386/pr103069-3.c |  24 ++++
>  gcc/testsuite/gcc.target/i386/pr103069-4.c |  43 ++++++
>  6 files changed, 226 insertions(+), 65 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-4.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index ce9607e36de..6cf1a0b9cb6 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -23203,16 +23203,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, 
> rtx mem, rtx val,
>                                        enum rtx_code code, bool after,
>                                        bool doubleword)
>  {
> -  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
> -  rtx_code_label *loop_label, *pause_label, *done_label;
> +  rtx old_reg, new_reg, old_mem, success;
>    machine_mode mode = GET_MODE (target);
> +  rtx_code_label *loop_label = NULL;
>
>    old_reg = gen_reg_rtx (mode);
>    new_reg = old_reg;
> -  loop_label = gen_label_rtx ();
> -  pause_label = gen_label_rtx ();
> -  done_label = gen_label_rtx ();
>    old_mem = copy_to_reg (mem);
> +  loop_label = gen_label_rtx ();
>    emit_label (loop_label);
>    emit_move_insn (old_reg, old_mem);
>
> @@ -23234,50 +23232,125 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, 
> rtx mem, rtx val,
>    if (after)
>      emit_move_insn (target, new_reg);
>
> -  /* Load memory again inside loop.  */
> -  new_mem = copy_to_reg (mem);
> -  /* Compare mem value with expected value.  */
> +  success = NULL_RTX;
> +
> +  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
> +                           gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
> +                                         SImode),
> +                           doubleword, loop_label);
> +}
> +
> +/* Relax cmpxchg instruction, param loop_label indicates whether
> +   the instruction should be relaxed with a pause loop.  If not,
> +   it will be relaxed to an atomic load + compare, and skip
> +   cmpxchg instruction if mem != exp_input.  */
> +
> +void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
> +                              rtx mem, rtx exp_input, rtx new_input,
> +                              rtx mem_model, bool doubleword,
> +                              rtx_code_label *loop_label)
> +{
> +  rtx_code_label *cmp_label = NULL;
> +  rtx_code_label *done_label = NULL;
> +  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
> +  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
> +  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
> +  machine_mode mode = GET_MODE (target_val), hmode = mode;
> +
> +  if (*ptarget_bool == NULL)
> +    target_bool = gen_reg_rtx (QImode);
> +  else
> +    target_bool = *ptarget_bool;
> +
> +  cmp_label = gen_label_rtx ();
> +  done_label = gen_label_rtx ();
> +
> +  new_mem = gen_reg_rtx (mode);
> +  /* Load memory first.  */
> +  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
> +
> +  switch (mode)
> +    {
> +    case TImode:
> +      gendw = gen_atomic_compare_and_swapti_doubleword;
> +      hmode = DImode;
> +      break;
> +    case DImode:
> +      if (doubleword)
> +       {
> +         gendw = gen_atomic_compare_and_swapdi_doubleword;
> +         hmode = SImode;
> +       }
> +      else
> +       gen = gen_atomic_compare_and_swapdi_1;
> +      break;
> +    case SImode:
> +      gen = gen_atomic_compare_and_swapsi_1; break;
> +    case HImode:
> +      gen = gen_atomic_compare_and_swaphi_1; break;
> +    case QImode:
> +      gen = gen_atomic_compare_and_swapqi_1; break;
> +    default:
> +      gcc_unreachable ();
> +    }
>
> +  /* Compare mem value with expected value.  */
>    if (doubleword)
>      {
> -      machine_mode half_mode = (mode == DImode)? SImode : DImode;
> -      rtx low_new_mem = gen_lowpart (half_mode, new_mem);
> -      rtx low_old_mem = gen_lowpart (half_mode, old_mem);
> -      rtx high_new_mem = gen_highpart (half_mode, new_mem);
> -      rtx high_old_mem = gen_highpart (half_mode, old_mem);
> -      emit_cmp_and_jump_insns (low_new_mem, low_old_mem, NE, NULL_RTX,
> -                              half_mode, 1, pause_label,
> +      rtx low_new_mem = gen_lowpart (hmode, new_mem);
> +      rtx low_exp_input = gen_lowpart (hmode, exp_input);
> +      rtx high_new_mem = gen_highpart (hmode, new_mem);
> +      rtx high_exp_input = gen_highpart (hmode, exp_input);
> +      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
> +                              hmode, 1, cmp_label,
>                                profile_probability::guessed_never ());
> -      emit_cmp_and_jump_insns (high_new_mem, high_old_mem, NE, NULL_RTX,
> -                              half_mode, 1, pause_label,
> +      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
> +                              hmode, 1, cmp_label,
>                                profile_probability::guessed_never ());
>      }
>    else
> -    emit_cmp_and_jump_insns (new_mem, old_mem, NE, NULL_RTX,
> -                            GET_MODE (old_mem), 1, pause_label,
> +    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
> +                            GET_MODE (exp_input), 1, cmp_label,
>                              profile_probability::guessed_never ());
>
> -  success = NULL_RTX;
> -  oldval = old_mem;
> -  expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
> -                                 new_reg, false, MEMMODEL_SYNC_SEQ_CST,
> -                                 MEMMODEL_RELAXED);
> -  if (oldval != old_mem)
> -    emit_move_insn (old_mem, oldval);
> -
> -  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
> -                          GET_MODE (success), 1, loop_label,
> -                          profile_probability::guessed_never ());
> -
> -  emit_jump_insn (gen_jump (done_label));
> -  emit_barrier ();
> -
> -  /* If mem is not expected, pause and loop back.  */
> -  emit_label (pause_label);
> -  emit_insn (gen_pause ());
> -  emit_jump_insn (gen_jump (loop_label));
> -  emit_barrier ();
> -  emit_label (done_label);
> +  /* Directly emits cmpxchg here.  */
> +  if (doubleword)
> +    emit_insn (gendw (target_val, mem, exp_input,
> +                     gen_lowpart (hmode, new_input),
> +                     gen_highpart (hmode, new_input),
> +                     mem_model));
> +  else
> +    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
> +
> +  if (!loop_label)
> +  {
> +    emit_jump_insn (gen_jump (done_label));
> +    emit_barrier ();
> +    emit_label (cmp_label);
> +    emit_move_insn (target_val, new_mem);
> +    emit_label (done_label);
> +    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                      const0_rtx);
> +  }
> +  else
> +  {
> +    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                      const0_rtx);
> +    emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
> +                            GET_MODE (target_bool), 1, loop_label,
> +                            profile_probability::guessed_never ());
> +    emit_jump_insn (gen_jump (done_label));
> +    emit_barrier ();
> +
> +    /* If mem is not expected, pause and loop back.  */
> +    emit_label (cmp_label);
> +    emit_insn (gen_pause ());
> +    emit_jump_insn (gen_jump (loop_label));
> +    emit_barrier ();
> +    emit_label (done_label);
> +  }
> +
> +  *ptarget_bool = target_bool;
>  }
>
>  #include "gt-i386-expand.h"
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index b7e9aa75d25..d5e11259a5a 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -221,6 +221,8 @@ extern void ix86_split_mmx_punpck (rtx[], bool);
>  extern void ix86_expand_avx_vzeroupper (void);
>  extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code,
>                                               bool, bool);
> +extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
> +                                     bool, rtx_code_label *);
>
>  #ifdef TREE_CODE
>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
> diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
> index 36417c54c11..820e9ca911a 100644
> --- a/gcc/config/i386/sync.md
> +++ b/gcc/config/i386/sync.md
> @@ -373,11 +373,20 @@ (define_expand "atomic_compare_and_swap<mode>"
>     (match_operand:SI 7 "const_int_operand")]   ;; failure model
>    "TARGET_CMPXCHG"
>  {
> -  emit_insn
> -   (gen_atomic_compare_and_swap<mode>_1
> -    (operands[1], operands[2], operands[3], operands[4], operands[6]));
> -  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> -                    const0_rtx);
> +  if (TARGET_RELAX_CMPXCHG_LOOP)
> +  {
> +    ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2],
> +                             operands[3], operands[4], operands[6],
> +                             false, NULL);
> +  }
> +  else
> +  {
> +    emit_insn
> +      (gen_atomic_compare_and_swap<mode>_1
> +       (operands[1], operands[2], operands[3], operands[4], operands[6]));
> +      ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                       const0_rtx);
> +  }
>    DONE;
>  })
>
> @@ -397,25 +406,35 @@ (define_expand "atomic_compare_and_swap<mode>"
>     (match_operand:SI 7 "const_int_operand")]   ;; failure model
>    "TARGET_CMPXCHG"
>  {
> -  if (<MODE>mode == DImode && TARGET_64BIT)
> -    {
> -      emit_insn
> -       (gen_atomic_compare_and_swapdi_1
> -       (operands[1], operands[2], operands[3], operands[4], operands[6]));
> -    }
> +  int doubleword = !(<MODE>mode == DImode && TARGET_64BIT);
> +  if (TARGET_RELAX_CMPXCHG_LOOP)
> +  {
> +    ix86_expand_cmpxchg_loop (&operands[0], operands[1], operands[2],
> +                             operands[3], operands[4], operands[6],
> +                             doubleword, NULL);
> +  }
>    else
> -    {
> -      machine_mode hmode = <CASHMODE>mode;
> -
> -      emit_insn
> -       (gen_atomic_compare_and_swap<mode>_doubleword
> -        (operands[1], operands[2], operands[3],
> -        gen_lowpart (hmode, operands[4]), gen_highpart (hmode, operands[4]),
> -        operands[6]));
> -    }
> -
> -  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> -                    const0_rtx);
> +  {
> +    if (!doubleword)
> +      {
> +       emit_insn
> +         (gen_atomic_compare_and_swapdi_1
> +          (operands[1], operands[2], operands[3], operands[4], operands[6]));
> +      }
> +    else
> +      {
> +       machine_mode hmode = <CASHMODE>mode;
> +
> +       emit_insn
> +         (gen_atomic_compare_and_swap<mode>_doubleword
> +          (operands[1], operands[2], operands[3],
> +           gen_lowpart (hmode, operands[4]), gen_highpart (hmode, 
> operands[4]),
> +           operands[6]));
> +      }
> +
> +    ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
> +                      const0_rtx);
> +  }
>    DONE;
>  })
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c 
> b/gcc/testsuite/gcc.target/i386/pr103069-2.c
> index b3f2235fd55..2ceb54e013a 100644
> --- a/gcc/testsuite/gcc.target/i386/pr103069-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c
> @@ -40,12 +40,12 @@ FUNC_ATOMIC_RELAX (char, xor)
>    TYPE c = 11, d = 101;        \
>    res = relax_##TYPE##_##OP##_fetch (&a, b); \
>    exp = f_##TYPE##_##OP##_fetch (&c, d);  \
> -  if (res != exp) \
> +  if (res != exp || a != c) \
>      abort (); \
>    a = c = 21, b = d = 92; \
>    res = relax_##TYPE##_fetch_##OP (&a, b); \
>    exp = f_##TYPE##_fetch_##OP (&c, d);  \
> -  if (res != exp) \
> +  if (res != exp || a != c) \
>      abort (); \
>  }
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr103069-3.c 
> b/gcc/testsuite/gcc.target/i386/pr103069-3.c
> new file mode 100644
> index 00000000000..fee708905ec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103069-3.c
> @@ -0,0 +1,24 @@
> +/* PR target/103068 */
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic 
> -mrelax-cmpxchg-loop" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC_CMPXCHG(TYPE) \
> +__attribute__ ((noinline, noclone))    \
> +TYPE f_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval)  \
> +{ \
> +  do  \
> +  { \
> +    newval = oldval | 1;  \
> +  } while (! __atomic_compare_exchange_n (lock, &oldval, newval,  \
> +                                         0, __ATOMIC_RELEASE,  \
> +                                         __ATOMIC_RELAXED));  \
> +  return *lock;        \
> +}
> +
> +
> +FUNC_CMPXCHG (int64_t)
> +FUNC_CMPXCHG (int)
> +FUNC_CMPXCHG (short)
> +FUNC_CMPXCHG (char)
> diff --git a/gcc/testsuite/gcc.target/i386/pr103069-4.c 
> b/gcc/testsuite/gcc.target/i386/pr103069-4.c
> new file mode 100644
> index 00000000000..5abcbca2a98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103069-4.c
> @@ -0,0 +1,43 @@
> +/* PR target/103069 */
> +/* { dg-do run } */
> +/* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */
> +
> +#include <stdlib.h>
> +#include "pr103069-3.c"
> +
> +#define FUNC_CMPXCHG_RELAX(TYPE) \
> +__attribute__ ((noinline, noclone, target ("relax-cmpxchg-loop")))     \
> +TYPE relax_##TYPE##_cmpxchg (TYPE *lock, TYPE newval, TYPE oldval)  \
> +{ \
> +  do  \
> +  { \
> +    newval = oldval | 1;  \
> +  } while (! __atomic_compare_exchange_n (lock, &oldval, newval,  \
> +                                         0, __ATOMIC_RELEASE,  \
> +                                         __ATOMIC_RELAXED));  \
> +  return *lock;        \
> +}
> +
> +FUNC_CMPXCHG_RELAX (int64_t)
> +FUNC_CMPXCHG_RELAX (int)
> +FUNC_CMPXCHG_RELAX (short)
> +FUNC_CMPXCHG_RELAX (char)
> +
> +#define TEST_CMPXCHG_LOOP(TYPE)        \
> +{ \
> +  TYPE a = 11, b = 20, c = 11, res, exp; \
> +  TYPE d = 11, e = 20, f = 11; \
> +  res = relax_##TYPE##_cmpxchg (&a, b, c); \
> +  exp = f_##TYPE##_cmpxchg (&d, e, f); \
> +  if (res != exp || a != d) \
> +    abort (); \
> +}
> +
> +int main (void)
> +{
> +  TEST_CMPXCHG_LOOP (int64_t)
> +  TEST_CMPXCHG_LOOP (int)
> +  TEST_CMPXCHG_LOOP (short)
> +  TEST_CMPXCHG_LOOP (char)
> +  return 0;
> +}
> --
> 2.18.1
>

Reply via email to