Ping
On 27/09/18 14:43, Matthew Malcomson wrote:
> [PATCH][GCC][AARCH64] Introduce aarch64 atomic_{load,store}ti patterns
>
> In Armv8.4-a these patterns use the LDP/STP instructions, which are
> guaranteed to be single-copy atomic, and ensure correct memory ordering
> semantics by using the DMB instruction.
>
> We put the use of these inline expansions behind a command line flag since
> they do not satisfy the libatomic ABI and hence can't be used together with
> code already compiled using 16-byte atomics.
> This command line flag is -matomic-128bit-instructions.
>
> Given the introduction of a flag specified to break ABI compatibility with
> libatomic, it seems reasonable to introduce the load-exclusive/store-exclusive
> read-modify-write loop emulation of 128-bit atomic loads and stores for older
> architectures behind this flag.
>
> We introduce the usual extension macros for the "at" extension marking the
> LDP/STP atomicity guarantees introduced in Armv8.4-a, and use these to decide
> which expansion to use when -matomic-128bit-instructions is provided on the
> command line.
>
> Tested with full bootstrap and make check on aarch64-none-linux-gnu.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> 2018-09-27  Matthew Malcomson  <matthew.malcom...@arm.com>
>
>         * config/aarch64/aarch64-protos.h (aarch64_split_atomic_ti_access):
>         New prototype.
>         * config/aarch64/aarch64.c (aarch64_split_atomic_ti_access): New.
>         * config/aarch64/aarch64.h (AARCH64_FL_AT): New flag.
>         (AARCH64_FL_PROFILE): Flag moved to accommodate above.
>         (AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_AT.
>         (AARCH64_ISA_AT): New ISA flag.
>         * config/aarch64/aarch64.opt (-matomic-128bit-instructions): New.
>         * config/aarch64/atomics.md (atomic_load<mode>, atomic_store<mode>,
>         @aarch64_load_exclusive<mode> {smaller registers},
>         @aarch64_load_exclusive<mode> {GPI registers},
>         @aarch64_store_exclusive<mode>): Use aarch_mm_needs_{acquire,release}
>         instead of the three-part check.
>         (atomic_loadti, aarch64_atomic_loadti_ldp, aarch64_atomic_loadti_loop,
>         atomic_storeti, aarch64_atomic_storeti_stp,
>         aarch64_atomic_storeti_loop): New.
>         * config/aarch64/iterators.md (GPI_TI): New.
>         * config/aarch64/predicates.md (aarch64_atomic_TImode_operand,
>         aarch64_TImode_pair_operand): New.
>         * doc/invoke.texi (-matomic-128bit-instructions): Document option.
>
> gcc/testsuite/ChangeLog:
>
> 2018-09-27  Matthew Malcomson  <matthew.malcom...@arm.com>
>
>         * gcc.target/aarch64/atomic-load128.c: New test.
>         * gcc.target/aarch64/atomic-store.x: New file; shared macros for the
>         tests below.
>         * gcc.target/aarch64/atomic-store.c: Use atomic-store.x.
>         * gcc.target/aarch64/atomic-store128.c: New test using atomic-store.x.
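To make the intent concrete for review, here is a small example of the kind of
code the new flag affects (my own sketch mirroring the new tests; the names are
illustrative and not taken from the patch):

#include <stdatomic.h>

__int128 shared_val;

__int128
load_shared (void)
{
  /* Acquire load of 16 bytes: with -matomic-128bit-instructions this is
     expected to become LDP + DMB ISHLD when the Armv8.4-a guarantees are
     available, or an LDAXP/STXP retry loop on older architectures.  */
  return atomic_load_explicit (&shared_val, memory_order_acquire);
}

void
store_shared (__int128 v)
{
  /* Release store of 16 bytes: expected to become DMB ISH + STP, or an
     LDXP/STLXP retry loop, under the same conditions.  */
  atomic_store_explicit (&shared_val, v, memory_order_release);
}

Without the flag both accesses keep going through __atomic_load_16 and
__atomic_store_16 in libatomic, which is why code built with and without the
option cannot safely share 16-byte atomic objects.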
> > > ############### Attachment also inlined for ease of reply > ############### > > > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index > caf1d2041f0cac8e3f975f8384a167a90dc638e5..578ea925fac9a7237af3a53e7ec642d0ba8e7b93 > 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -560,6 +560,8 @@ machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx); > rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); > rtx aarch64_load_tp (rtx); > > +void aarch64_split_atomic_ti_access (rtx op[], bool); > + > void aarch64_expand_compare_and_swap (rtx op[]); > void aarch64_split_compare_and_swap (rtx op[]); > void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx); > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > e5cdb1d54f4ee96140202ea21a9478438d208f45..c1e407b5a3f27aa7eea9c35e749fe597e79f3e65 > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -158,9 +158,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and > SHA512. */ > #define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions. > */ > #define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions. > */ > +#define AARCH64_FL_AT (1 << 21) /* Has ARMv8.4-a AT extensions. */ > > /* Statistical Profiling extensions. */ > -#define AARCH64_FL_PROFILE (1 << 21) > +#define AARCH64_FL_PROFILE (1 << 22) > > /* Has FP and SIMD. */ > #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) > @@ -179,7 +180,7 @@ extern unsigned aarch64_architecture_version; > (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3) > #define AARCH64_FL_FOR_ARCH8_4 \ > (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \ > - | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4) > + | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4 | AARCH64_FL_AT) > > /* Macros to test ISA flags. */ > > @@ -201,6 +202,7 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3) > #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML) > #define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4) > +#define AARCH64_ISA_AT (aarch64_isa_flags & AARCH64_FL_AT) > > /* Crypto is an optional extension to AdvSIMD. */ > #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > cbf9d0c09b23712a67a5f0781c247cc859ade18d..7b46ca38a8cf55c6359e2f577bb9e15363dd3132 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -14160,6 +14160,80 @@ aarch64_emit_post_barrier (enum memmodel model) > } > } > > +/* Emit an emulation of an atomic access for TImode using a load-exclusive > + store-exclusive pair. 
*/
> +void
> +aarch64_split_atomic_ti_access (rtx operands[], bool loading)
> +{
> +  rtx dest, src, model_rtx, scratch;
> +  dest = operands[0];
> +  src = operands[1];
> +  model_rtx = operands[2];
> +  scratch = operands[3];
> +
> +  machine_mode mode = GET_MODE (src);
> +  gcc_assert (mode == TImode || (mode == VOIDmode && src == const0_rtx));
> +
> +  rtx_code_label *label = gen_label_rtx ();
> +  emit_label (label);
> +
> +  rtx scratch_flag;
> +  /* Below we rely on the fact that the ordering requirements of a
> +     sequentially consistent load are the same as load-acquire semantics,
> +     and similarly that a sequentially consistent store makes the same
> +     ordering requirements as store-release semantics.
> +
> +     Sequential consistency does provide extra semantics, namely a total
> +     ordering of the atomic modifications of memory that themselves use
> +     sequentially consistent semantics.  That memory ordering requirement is
> +     already provided by the fact that the Armv8 memory model is
> +     other-multi-copy atomic (page B2-96 of the ARM Architecture Reference
> +     Manual issue C.a) in combination with the load-acquire/store-release
> +     semantics.
> +
> +     Given that the aim of this sequence is to behave as an
> +     atomic_{load,store}ti, these observations show that we do not need to
> +     provide any special handling for sequentially consistent memory ordering
> +     over and above the handling for load-acquire and store-release
> +     semantics.  */
> +  if (loading)
> +    {
> +      /* For load-acquire semantics we require that no reads or writes can be
> +         reordered to before the observed load.  Hence all we need is for
> +         that load to have the required memory ordering semantics.  */
> +      scratch_flag = scratch;
> +      emit_insn (gen_aarch64_load_exclusive (TImode, dest, src, model_rtx));
> +      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag, src, dest,
> +                                              GEN_INT (MEMMODEL_RELAXED)));
> +    }
> +  else
> +    {
> +      /* For store-release semantics we require that no memory access is
> +         reordered to after the store-exclusive that is observed.  This is
> +         satisfied by having that store-exclusive instruction execute with
> +         store-release memory semantics.  */
> +      emit_insn (gen_aarch64_load_exclusive (TImode, scratch, dest,
> +                                             GEN_INT (MEMMODEL_RELAXED)));
> +      scratch_flag = gen_lowpart (SImode, scratch);
> +      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag,
> +                                              dest, src, model_rtx));
> +    }
> +
> +  rtx x;
> +  if (aarch64_track_speculation)
> +    {
> +      /* Emit an explicit compare instruction, so that we can correctly
> +         track the condition codes.  */
> +      rtx cc_reg = aarch64_gen_compare_reg (NE, scratch_flag, const0_rtx);
> +      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
> +    }
> +  else
> +    x = gen_rtx_NE (VOIDmode, scratch_flag, const0_rtx);
> +
> +  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +                            gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
> +  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +}
> +
>  /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
>     for the data in memory.  EXPECTED is the value expected to be in memory.
>     DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index b2e80cbf6f1f9727c4309874b1122f975fb6b9be..3b1769ba67b6e94cc7c05e051902a8a0f2cfcbe0 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -218,3 +218,9 @@ Enables verbose cost model dumping in the debug dump files.
> mtrack-speculation > Target Var(aarch64_track_speculation) > Generate code to track when the CPU might be speculating incorrectly. > + > +matomic-128bit-instructions > +Target Var(aarch64_handle_128bit_atomics) Init(false) > +Use architecture atomic operations to handle 128 bit atomic store/load > instead > +of using libatomic. The use of 128 bit atomics in code compiled with this > +option is ABI incompatible with that of code compiled without this option. > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md > index > bba8e9e9c8e61d95fcfb61e650e7e76671c8f996..b47abcf7250aa2045ab8ced52a1373f6d4d71047 > 100644 > --- a/gcc/config/aarch64/atomics.md > +++ b/gcc/config/aarch64/atomics.md > @@ -472,11 +472,66 @@ > UNSPECV_LDA))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release > (model)) > - return "ldr<atomic_sfx>\t%<w>0, %1"; > - else > + if (aarch_mm_needs_acquire (operands[2])) > return "ldar<atomic_sfx>\t%<w>0, %1"; > + else > + return "ldr<atomic_sfx>\t%<w>0, %1"; > + } > +) > + > +(define_expand "atomic_loadti" > + [(match_operand:TI 0 "register_operand" "") > + (match_operand:TI 1 "aarch64_atomic_TImode_operand" "") > + (match_operand:TI 2 "const_int_operand" "")] > + "aarch64_handle_128bit_atomics" > + { > + if (AARCH64_ISA_AT) > + { > + emit_insn (gen_aarch64_atomic_loadti_ldp (operands[0], operands[1], > + operands[2])); > + DONE; > + } > + > + emit_insn (gen_aarch64_atomic_loadti_loop (operands[0], operands[1], > + operands[2])); > + DONE; > + } > +) > + > +(define_insn "aarch64_atomic_loadti_ldp" > + [(set (match_operand:TI 0 "register_operand" "=r") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_TImode_pair_operand" "Umn") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_LDA))] > + "aarch64_handle_128bit_atomics && AARCH64_ISA_AT" > + { > + output_asm_insn ("ldp\\t%0, %H0, %1", operands); > + return aarch_mm_needs_acquire (operands[2]) > + ? "dmb\\tishld" > + : ""; > + } > + [(set (attr "length") > + (if_then_else (match_test "aarch_mm_needs_acquire (operands[2])") > + (const_int 8) > + (const_int 4)))] > +) > + > +(define_insn_and_split "aarch64_atomic_loadti_loop" > + [(set (match_operand:TI 0 "register_operand" "=&r") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_sync_memory_operand" "Q") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_LDA)) > + (clobber (reg:CC CC_REGNUM)) > + (clobber (match_scratch:SI 3 "=&r"))] > + "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT" > + "#" > + "&& reload_completed" > + [(const_int 0)] > + { > + aarch64_split_atomic_ti_access (operands, true); > + DONE; > } > ) > > @@ -488,8 +543,7 @@ > UNSPECV_STL))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire > (model)) > + if (! 
aarch_mm_needs_release (operands[2])) > return "str<atomic_sfx>\t%<w>1, %0"; > else if (which_alternative == 0) > return "stlr<atomic_sfx>\t%<w>1, %0"; > @@ -499,6 +553,61 @@ > [(set_attr "arch" "*,rcpc8_4")] > ) > > +(define_expand "atomic_storeti" > + [(match_operand:TI 0 "aarch64_atomic_TImode_operand" "") > + (match_operand:TI 1 "aarch64_reg_or_zero" "") > + (match_operand:TI 2 "const_int_operand" "")] > + "aarch64_handle_128bit_atomics" > + { > + if (AARCH64_ISA_AT) > + { > + emit_insn (gen_aarch64_atomic_storeti_stp (operands[0], operands[1], > + operands[2])); > + DONE; > + } > + > + emit_insn (gen_aarch64_atomic_storeti_loop (operands[0], operands[1], > + operands[2])); > + DONE; > + } > +) > + > +(define_insn "aarch64_atomic_storeti_stp" > + [(set (match_operand:TI 0 "aarch64_TImode_pair_operand" "=Umn") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_STL)) ] > + "aarch64_handle_128bit_atomics && AARCH64_ISA_AT" > + { > + if (aarch_mm_needs_release (operands[2])) > + output_asm_insn ("dmb\tish", operands); > + return "stp\t%x1, %H1, %0"; > + } > + [(set (attr "length") > + (if_then_else (match_test "aarch_mm_needs_release (operands[2])") > + (const_int 8) > + (const_int 4)))] > +) > + > +(define_insn_and_split "aarch64_atomic_storeti_loop" > + [(set (match_operand:TI 0 "aarch64_sync_memory_operand" "=Q") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_STL)) > + (clobber (reg:CC CC_REGNUM)) > + (clobber (match_scratch:TI 3 "=&r"))] > + "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT" > + "#" > + "&& reload_completed" > + [(const_int 0)] > + { > + aarch64_split_atomic_ti_access (operands, false); > + DONE; > + } > +) > + > (define_insn "@aarch64_load_exclusive<mode>" > [(set (match_operand:SI 0 "register_operand" "=r") > (zero_extend:SI > @@ -508,45 +617,52 @@ > UNSPECV_LX)))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release > (model)) > - return "ldxr<atomic_sfx>\t%w0, %1"; > - else > + if (aarch_mm_needs_acquire (operands[2])) > return "ldaxr<atomic_sfx>\t%w0, %1"; > + else > + return "ldxr<atomic_sfx>\t%w0, %1"; > } > ) > > (define_insn "@aarch64_load_exclusive<mode>" > - [(set (match_operand:GPI 0 "register_operand" "=r") > - (unspec_volatile:GPI > - [(match_operand:GPI 1 "aarch64_sync_memory_operand" "Q") > + [(set (match_operand:GPI_TI 0 "register_operand" "=r") > + (unspec_volatile:GPI_TI > + [(match_operand:GPI_TI 1 "aarch64_sync_memory_operand" "Q") > (match_operand:SI 2 "const_int_operand")] > UNSPECV_LX))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release > (model)) > - return "ldxr\t%<w>0, %1"; > + bool acquire_needed = aarch_mm_needs_acquire (operands[2]); > + if (GET_MODE (operands[1]) == TImode) > + return acquire_needed > + ? "ldaxp\t%0, %H0, %1" > + : "ldxp\t%0, %H0, %1"; > else > - return "ldaxr\t%<w>0, %1"; > + return acquire_needed > + ? 
"ldaxr\t%<w>0, %1" > + : "ldxr\t%<w>0, %1"; > } > ) > > (define_insn "@aarch64_store_exclusive<mode>" > [(set (match_operand:SI 0 "register_operand" "=&r") > (unspec_volatile:SI [(const_int 0)] UNSPECV_SX)) > - (set (match_operand:ALLI 1 "aarch64_sync_memory_operand" "=Q") > - (unspec_volatile:ALLI > - [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ") > + (set (match_operand:ALLI_TI 1 "aarch64_sync_memory_operand" "=Q") > + (unspec_volatile:ALLI_TI > + [(match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ") > (match_operand:SI 3 "const_int_operand")] > UNSPECV_SX))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[3])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire > (model)) > - return "stxr<atomic_sfx>\t%w0, %<w>2, %1"; > + bool release_needed = aarch_mm_needs_release (operands[3]); > + if (GET_MODE (operands[1]) == TImode) > + return release_needed > + ? "stlxp\t%w0, %x2, %H2, %1" > + : "stxp\t%w0, %x2, %H2, %1"; > else > - return "stlxr<atomic_sfx>\t%w0, %<w>2, %1"; > + return release_needed > + ? "stlxr<atomic_sfx>\t%w0, %<w>2, %1" > + : "stxr<atomic_sfx>\t%w0, %<w>2, %1"; > } > ) > > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > a43956054e82aaf651fb45d0ff254b248c02c644..8d3fe29f6e4b9a3a7a6c8fc32c1564ef88501fb4 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -26,6 +26,9 @@ > ;; Iterator for General Purpose Integer registers (32- and 64-bit modes) > (define_mode_iterator GPI [SI DI]) > > +;; Iterator for SI, DI, TI. > +(define_mode_iterator GPI_TI [SI DI TI]) > + > ;; Iterator for HI, SI, DI, some instructions can only work on these modes. > (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI]) > > diff --git a/gcc/config/aarch64/predicates.md > b/gcc/config/aarch64/predicates.md > index > 5b08b03c5868c7aa86f8844e3219a6e82717d4f0..a52b8989fe277d2ec3d32ee31d99708cb8ef2ced > 100644 > --- a/gcc/config/aarch64/predicates.md > +++ b/gcc/config/aarch64/predicates.md > @@ -389,6 +389,23 @@ > (match_operand 0 "aarch64_9bit_offset_memory_operand") > (match_operand 0 "aarch64_sync_memory_operand"))) > > +;; Predicate to accept operands for TImode atomic load/store. > +;; Depends on the ISA because the AT extension makes LDP/STP atomic, and they > +;; accept more operands than LDAXP/STLXP. > +(define_predicate "aarch64_TImode_pair_operand" > + (and (match_code "mem") > + (ior (match_code "reg" "0") > + (and (match_code "plus" "0") > + (match_code "reg" "00") > + (match_code "const_int" "01") > + (match_test "aarch64_offset_7bit_signed_scaled_p ( > + DImode, INTVAL (XEXP (XEXP (op, 0), 1)))"))))) > + > +(define_predicate "aarch64_atomic_TImode_operand" > + (if_then_else (match_test "AARCH64_ISA_AT") > + (match_operand 0 "aarch64_TImode_pair_operand") > + (match_operand 0 "aarch64_sync_memory_operand"))) > + > ;; Predicates for parallel expanders based on mode. > (define_special_predicate "vect_par_cnst_hi_half" > (match_code "parallel") > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index > 685c211e176d13809078cf0fd595d49763adef25..d1e5cdfcdfb8e00400d422ed6869d60ebe4f03fb > 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -15075,6 +15075,26 @@ and 2048. @samp{scalable} is the default. > At present, @samp{-msve-vector-bits=128} produces the same output > as @samp{-msve-vector-bits=scalable}. 
>
> +@item -matomic-128bit-instructions
> +@itemx -mno-atomic-128bit-instructions
> +@opindex matomic-128bit-instructions
> +@opindex mno-atomic-128bit-instructions
> +Enable or disable the use of inline 128-bit atomic loads and stores.
> +Without this flag, atomic memory accesses of this size are handled by
> +libatomic.
> +Inline accesses are faster than calls to libatomic but can interrupt
> +accesses made through libatomic; this means that pre-existing code using
> +libatomic is ABI incompatible with code generated using this flag.
> +This option is disabled by default (@samp{-mno-atomic-128bit-instructions}).
> +
> +If this flag is used when targeting a processor that has the atomicity
> +guarantees on the STP and LDP instructions added in Armv8.4, then GCC uses
> +these instructions; otherwise GCC generates a load-exclusive/store-exclusive
> +read-modify-write loop.
> +The use of a read-modify-write loop for an atomic load can cause a
> +segmentation fault when atomically loading a variable that the compiler has
> +placed in read-only memory.
> +
>  @end table
>
>  @subsubsection @option{-march} and @option{-mcpu} Feature Modifiers
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-load128.c b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b43599975db69201771adc6695d67da052be75a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */
> +
> +#include <stdlib.h>
> +#include <stdatomic.h>
> +#include <stdint.h>
> +
> +#define RUN_TESTS_NAMED(prefix) \
> +  void \
> +  prefix##128 () \
> +{ \
> +  __int128 *atomic_vals = calloc (4, sizeof (__int128)); \
> +  __int128 temp_val; \
> +  temp_val = atomic_load_explicit (atomic_vals, memory_order_relaxed); \
> +  temp_val = atomic_load_explicit (atomic_vals, memory_order_acquire); \
> +  temp_val = atomic_load_explicit ((atomic_vals + 1), memory_order_acquire); \
> +  temp_val = atomic_load ((atomic_vals + 2)); \
> +  temp_val = atomic_load_explicit ((atomic_vals + 3), memory_order_relaxed); \
> +}
> +
> +RUN_TESTS_NAMED (bar);
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x\[^\n\]*\n\[ \t\]*dmb\tishld" 3 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
> +
> +__attribute__ ((target ("arch=armv8.3-a")))
> +RUN_TESTS_NAMED (foo);
> +/* { dg-final { scan-assembler-times "ldxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
> +/* { dg-final { scan-assembler-times "ldaxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> index 8cabc05b0d739dbfdcecf681348b62634fcfc9a4..141e4e317d73b12555163c8352218842d4250a37 100644
> --- a/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> @@ -1,23 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-march=armv8.4-a -O2" } */
>
> -#include <stdatomic.h>
> -
> -typedef __INT8_TYPE__ int8_t;
> -typedef __INT16_TYPE__ int16_t;
> -typedef __INT32_TYPE__ int32_t;
> -typedef __INT64_TYPE__ int64_t;
> -
> -#define STORE_TESTS(size) \
> -  void \
> -  foo##size (int##size##_t *atomic_vals) \
> -{ \
> -  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
> -  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
> -  atomic_store_explicit ((atomic_vals + 1), 2,
memory_order_release); \ > - atomic_store ((atomic_vals + 2), 2); \ > - atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \ > -} > +#include "atomic-store.x" > > STORE_TESTS (8); > /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > @@ -26,6 +10,7 @@ STORE_TESTS (8); > /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 1\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, > 3\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } > } */ > > STORE_TESTS (16); > /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > @@ -33,6 +18,7 @@ STORE_TESTS (16); > /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, > 6\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } > } */ > > STORE_TESTS (32); > /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > @@ -40,12 +26,14 @@ STORE_TESTS (32); > /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, > 12\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } > } */ > > STORE_TESTS (64); > /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 16\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, > 24\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } > } */ > > void > foo_toolarge_offset (int64_t *atomic_vals) > @@ -64,12 +52,20 @@ foo_negative (int8_t *atomic_vals) > } > /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > -2\\\]" 1 { target { ! ilp32 } } } } */ > > -#pragma GCC target ("arch=armv8.3-a") > void > +__attribute__ ((target ("arch=armv8.3-a"))) > foo_older_arch (int64_t *atomic_vals) > { > atomic_store_explicit (atomic_vals + 2, 2, memory_order_release); > } > - > /* Three times, one for each of the three above functions. */ > /* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 > } } */ > + > +/* This test is to show that the -matomic-128bit-instructions flag is needed > + * to handle 128 bit atomic store. 
*/ > +typedef __int128 int128_t; > +STORE_TESTS (128); > +/* { dg-final { scan-assembler-not "dmb\tish\n\[ \t\]*stp" } } */ > +/* { dg-final { scan-assembler-not "stxp" } } */ > +/* { dg-final { scan-assembler-not "stlxp" } } */ > +/* { dg-final { scan-assembler-times "bl?\t__atomic_store_16" 6 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.x > b/gcc/testsuite/gcc.target/aarch64/atomic-store.x > new file mode 100644 > index > 0000000000000000000000000000000000000000..5e6261a8d3ec3905b4a850cd33dbd1caa37a186e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.x > @@ -0,0 +1,20 @@ > +#include <stdatomic.h> > + > +typedef __INT8_TYPE__ int8_t; > +typedef __INT16_TYPE__ int16_t; > +typedef __INT32_TYPE__ int32_t; > +typedef __INT64_TYPE__ int64_t; > + > +#define STORE_TESTS_NAMED(size, prefix) \ > +void \ > +prefix##size (int##size##_t *atomic_vals) \ > +{ \ > + atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \ > + atomic_store_explicit (atomic_vals, 2, memory_order_release); \ > + atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \ > + atomic_store ((atomic_vals + 2), 2); \ > + atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \ > + atomic_store_explicit (atomic_vals + 4, 0, memory_order_release); \ > +} > + > +#define STORE_TESTS(size) STORE_TESTS_NAMED(size, foo) > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store128.c > b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..bed864205128e2d8b6deb856d061ad13667cb14b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c > @@ -0,0 +1,74 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */ > + > +#include "atomic-store.x" > + > +STORE_TESTS (8); > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > { target { ! 
ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 > { target { ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 1\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } > } */ > + > +STORE_TESTS (16); > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } > } */ > + > +STORE_TESTS (32); > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } > } */ > + > +STORE_TESTS (64); > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 16\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } > } */ > + > +void > +foo_toolarge_offset (int64_t *atomic_vals) > +{ > + /* 9bit signed unscaled immediate => > + largest representable value +255. > + smallest representable value -256. */ > + atomic_store_explicit (atomic_vals + 32, 2, memory_order_release); > + atomic_store_explicit (atomic_vals - 33, 2, memory_order_release); > +} > + > +void > +foo_negative (int8_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals - 2, 2, memory_order_release); > +} > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > -2\\\]" 1 { target { ! ilp32 } } } } */ > + > +void > +__attribute__ ((target ("arch=armv8.3-a"))) > +foo_older_arch (int64_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals + 2, 2, memory_order_release); > +} > +/* Three times, one for each of the three above functions. 
*/
> +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
> +typedef __int128 int128_t;
> +STORE_TESTS (128);
> +/* { dg-final { scan-assembler-times "dmb\tish\n\[ \t\]*stp" 4 } } */
> +/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
> +/* { dg-final { scan-assembler-times "stp\txzr, xzr, \\\[x" 1 } } */
> +
> +
> +__attribute__ ((target ("arch=armv8.3-a")))
> +STORE_TESTS_NAMED (128, bar);
> +/* { dg-final { scan-assembler-times "stxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
> +/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
> +/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, xzr, xzr, \\\[x" 1 } } */
>
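For completeness, a sketch (again my own, not part of the patch; register
numbers and the label are illustrative) of the code shape the stxp/stlxp scans
above are looking for on the non-Armv8.4-a path:

#include <stdatomic.h>

void
store_release_128 (__int128 *p, __int128 v)
{
  /* With -matomic-128bit-instructions on arch=armv8.3-a, the split in
     aarch64_split_atomic_ti_access should turn this into a retry loop
     roughly of the form:
        .L1:
            ldxp   x4, x5, [x0]       // relaxed exclusive load, value unused
            stlxp  w4, x2, x3, [x0]   // store-release exclusive of v
            cbnz   w4, .L1            // retry if the exclusive monitor was lost
     whereas with the Armv8.4-a guarantees it should become DMB ISH followed
     by a single STP.  */
  atomic_store_explicit (p, v, memory_order_release);
}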