Ping
On 27/09/18 14:43, Matthew Malcomson wrote:
> [PATCH][GCC][AARCH64] Introduce aarch64 atomic_{load,store}ti patterns
>
> In Armv8.4-a these patterns use the LDP/STP instructions, which are
> guaranteed to be single-copy atomic, and ensure correct memory ordering
> semantics by using the DMB instruction.
>
> We put the use of these inline expansions behind a command line flag since
> they do not satisfy the libatomic ABI and hence can't be used together with
> code already compiled using 16-byte atomics.
> This command line flag is -matomic-128bit-instructions.
>
> Given the introduction of a flag specified to break ABI compatibility with
> libatomic, it seems reasonable to introduce the load-exclusive/store-exclusive
> read-modify-write loop emulation of 128-bit atomic loads and stores for older
> architectures behind this flag.
>
> We introduce the usual extension macros for the "at" extension marking the
> LDP/STP atomicity guarantees introduced in Armv8.4-a, and use these to decide
> which expansion to use when -matomic-128bit-instructions is provided on the
> command line.
>
> Tested with full bootstrap and make check on aarch64-none-linux-gnu.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> 2018-09-27  Matthew Malcomson  <matthew.malcom...@arm.com>
>
>         * config/aarch64/aarch64-protos.h (aarch64_split_atomic_ti_access):
>         New prototype.
>         * config/aarch64/aarch64.c (aarch64_split_atomic_ti_access): New.
>         * config/aarch64/aarch64.h (AARCH64_FL_AT): New flag.
>         (AARCH64_FL_PROFILE): Flag moved to accommodate above.
>         (AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_AT.
>         (AARCH64_ISA_AT): New ISA flag.
>         * config/aarch64/aarch64.opt (-matomic-128bit-instructions): New.
>         * config/aarch64/atomics.md (atomic_load<mode>, atomic_store<mode>,
>         @aarch64_load_exclusive<mode> {smaller registers},
>         @aarch64_load_exclusive<mode> {GPI registers},
>         @aarch64_store_exclusive<mode>): Use aarch_mm_needs_{acquire,release}
>         instead of the three-part check.
>         (atomic_loadti, aarch64_atomic_loadti_ldp, aarch64_atomic_loadti_loop,
>         atomic_storeti, aarch64_atomic_storeti_stp,
>         aarch64_atomic_storeti_loop): New.
>         * config/aarch64/iterators.md (GPI_TI): New.
>         * config/aarch64/predicates.md (aarch64_atomic_TImode_operand,
>         aarch64_TImode_pair_operand): New.
>         * doc/invoke.texi (-matomic-128bit-instructions): Document option.
>
> gcc/testsuite/ChangeLog:
>
> 2018-09-27  Matthew Malcomson  <matthew.malcom...@arm.com>
>
>         * gcc.target/aarch64/atomic-load128.c: New test.
>         * gcc.target/aarch64/atomic-store.x: New file; shared macros for the
>         tests below.
>         * gcc.target/aarch64/atomic-store.c: Use atomic-store.x.
>         * gcc.target/aarch64/atomic-store128.c: New test using atomic-store.x.
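To make the intent concrete for review, here is a small example of the kind of
code the new flag affects (my own sketch mirroring the new tests; the names are
illustrative and not taken from the patch):

#include <stdatomic.h>

__int128 shared_val;

__int128
load_shared (void)
{
  /* Acquire load of 16 bytes: with -matomic-128bit-instructions this is
     expected to become LDP + DMB ISHLD when the Armv8.4-a guarantees are
     available, or an LDAXP/STXP retry loop on older architectures.  */
  return atomic_load_explicit (&shared_val, memory_order_acquire);
}

void
store_shared (__int128 v)
{
  /* Release store of 16 bytes: expected to become DMB ISH + STP, or an
     LDXP/STLXP retry loop, under the same conditions.  */
  atomic_store_explicit (&shared_val, v, memory_order_release);
}

Without the flag both accesses keep going through __atomic_load_16 and
__atomic_store_16 in libatomic, which is why code built with and without the
option cannot safely share 16-byte atomic objects.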
> > > ############### Attachment also inlined for ease of reply > ############### > > > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index > caf1d2041f0cac8e3f975f8384a167a90dc638e5..578ea925fac9a7237af3a53e7ec642d0ba8e7b93 > 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -560,6 +560,8 @@ machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx); > rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); > rtx aarch64_load_tp (rtx); > > +void aarch64_split_atomic_ti_access (rtx op[], bool); > + > void aarch64_expand_compare_and_swap (rtx op[]); > void aarch64_split_compare_and_swap (rtx op[]); > void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx); > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > e5cdb1d54f4ee96140202ea21a9478438d208f45..c1e407b5a3f27aa7eea9c35e749fe597e79f3e65 > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -158,9 +158,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and > SHA512. */ > #define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions. > */ > #define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions. > */ > +#define AARCH64_FL_AT (1 << 21) /* Has ARMv8.4-a AT extensions. */ > > /* Statistical Profiling extensions. */ > -#define AARCH64_FL_PROFILE (1 << 21) > +#define AARCH64_FL_PROFILE (1 << 22) > > /* Has FP and SIMD. */ > #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) > @@ -179,7 +180,7 @@ extern unsigned aarch64_architecture_version; > (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3) > #define AARCH64_FL_FOR_ARCH8_4 \ > (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \ > - | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4) > + | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4 | AARCH64_FL_AT) > > /* Macros to test ISA flags. */ > > @@ -201,6 +202,7 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3) > #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML) > #define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4) > +#define AARCH64_ISA_AT (aarch64_isa_flags & AARCH64_FL_AT) > > /* Crypto is an optional extension to AdvSIMD. */ > #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > cbf9d0c09b23712a67a5f0781c247cc859ade18d..7b46ca38a8cf55c6359e2f577bb9e15363dd3132 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -14160,6 +14160,80 @@ aarch64_emit_post_barrier (enum memmodel model) > } > } > > +/* Emit an emulation of an atomic access for TImode using a load-exclusive > + store-exclusive pair. 
*/
> +void
> +aarch64_split_atomic_ti_access (rtx operands[], bool loading)
> +{
> +  rtx dest, src, model_rtx, scratch;
> +  dest = operands[0];
> +  src = operands[1];
> +  model_rtx = operands[2];
> +  scratch = operands[3];
> +
> +  machine_mode mode = GET_MODE (src);
> +  gcc_assert (mode == TImode || (mode == VOIDmode && src == const0_rtx));
> +
> +  rtx_code_label *label = gen_label_rtx ();
> +  emit_label (label);
> +
> +  rtx scratch_flag;
> +  /* Below we rely on the fact that the ordering requirements of a
> +     sequentially consistent load are the same as load-acquire semantics,
> +     and similarly that a sequentially consistent store makes the same
> +     ordering requirements as store-release semantics.
> +
> +     Sequential consistency does provide extra semantics, namely a total
> +     ordering of the atomic modifications of memory that themselves use
> +     sequentially consistent semantics.  That memory ordering requirement is
> +     already provided by the fact that the Armv8 memory model is
> +     other-multi-copy atomic (page B2-96 of the ARM Architecture Reference
> +     Manual issue C.a) in combination with the load-acquire/store-release
> +     semantics.
> +
> +     Given that the aim of this sequence is to behave as an
> +     atomic_{load,store}ti, these observations show that we do not need to
> +     provide any special handling for sequentially consistent memory ordering
> +     over and above the handling for load-acquire and store-release
> +     semantics.  */
> +  if (loading)
> +    {
> +      /* For load-acquire semantics we require that no reads or writes can be
> +         reordered to before the observed load.  Hence all we need is for
> +         that load to have the required memory ordering semantics.  */
> +      scratch_flag = scratch;
> +      emit_insn (gen_aarch64_load_exclusive (TImode, dest, src, model_rtx));
> +      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag, src, dest,
> +                                              GEN_INT (MEMMODEL_RELAXED)));
> +    }
> +  else
> +    {
> +      /* For store-release semantics we require that no memory access is
> +         reordered to after the store-exclusive that is observed.  This is
> +         satisfied by having that store-exclusive instruction execute with
> +         store-release memory semantics.  */
> +      emit_insn (gen_aarch64_load_exclusive (TImode, scratch, dest,
> +                                             GEN_INT (MEMMODEL_RELAXED)));
> +      scratch_flag = gen_lowpart (SImode, scratch);
> +      emit_insn (gen_aarch64_store_exclusive (TImode, scratch_flag,
> +                                              dest, src, model_rtx));
> +    }
> +
> +  rtx x;
> +  if (aarch64_track_speculation)
> +    {
> +      /* Emit an explicit compare instruction, so that we can correctly
> +         track the condition codes.  */
> +      rtx cc_reg = aarch64_gen_compare_reg (NE, scratch_flag, const0_rtx);
> +      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
> +    }
> +  else
> +    x = gen_rtx_NE (VOIDmode, scratch_flag, const0_rtx);
> +
> +  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +                            gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
> +  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +}
> +
>  /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
>     for the data in memory.  EXPECTED is the value expected to be in memory.
>     DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index b2e80cbf6f1f9727c4309874b1122f975fb6b9be..3b1769ba67b6e94cc7c05e051902a8a0f2cfcbe0 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -218,3 +218,9 @@ Enables verbose cost model dumping in the debug dump files.
> mtrack-speculation > Target Var(aarch64_track_speculation) > Generate code to track when the CPU might be speculating incorrectly. > + > +matomic-128bit-instructions > +Target Var(aarch64_handle_128bit_atomics) Init(false) > +Use architecture atomic operations to handle 128 bit atomic store/load > instead > +of using libatomic. The use of 128 bit atomics in code compiled with this > +option is ABI incompatible with that of code compiled without this option. > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md > index > bba8e9e9c8e61d95fcfb61e650e7e76671c8f996..b47abcf7250aa2045ab8ced52a1373f6d4d71047 > 100644 > --- a/gcc/config/aarch64/atomics.md > +++ b/gcc/config/aarch64/atomics.md > @@ -472,11 +472,66 @@ > UNSPECV_LDA))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release > (model)) > - return "ldr<atomic_sfx>\t%<w>0, %1"; > - else > + if (aarch_mm_needs_acquire (operands[2])) > return "ldar<atomic_sfx>\t%<w>0, %1"; > + else > + return "ldr<atomic_sfx>\t%<w>0, %1"; > + } > +) > + > +(define_expand "atomic_loadti" > + [(match_operand:TI 0 "register_operand" "") > + (match_operand:TI 1 "aarch64_atomic_TImode_operand" "") > + (match_operand:TI 2 "const_int_operand" "")] > + "aarch64_handle_128bit_atomics" > + { > + if (AARCH64_ISA_AT) > + { > + emit_insn (gen_aarch64_atomic_loadti_ldp (operands[0], operands[1], > + operands[2])); > + DONE; > + } > + > + emit_insn (gen_aarch64_atomic_loadti_loop (operands[0], operands[1], > + operands[2])); > + DONE; > + } > +) > + > +(define_insn "aarch64_atomic_loadti_ldp" > + [(set (match_operand:TI 0 "register_operand" "=r") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_TImode_pair_operand" "Umn") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_LDA))] > + "aarch64_handle_128bit_atomics && AARCH64_ISA_AT" > + { > + output_asm_insn ("ldp\\t%0, %H0, %1", operands); > + return aarch_mm_needs_acquire (operands[2]) > + ? "dmb\\tishld" > + : ""; > + } > + [(set (attr "length") > + (if_then_else (match_test "aarch_mm_needs_acquire (operands[2])") > + (const_int 8) > + (const_int 4)))] > +) > + > +(define_insn_and_split "aarch64_atomic_loadti_loop" > + [(set (match_operand:TI 0 "register_operand" "=&r") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_sync_memory_operand" "Q") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_LDA)) > + (clobber (reg:CC CC_REGNUM)) > + (clobber (match_scratch:SI 3 "=&r"))] > + "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT" > + "#" > + "&& reload_completed" > + [(const_int 0)] > + { > + aarch64_split_atomic_ti_access (operands, true); > + DONE; > } > ) > > @@ -488,8 +543,7 @@ > UNSPECV_STL))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire > (model)) > + if (! 
aarch_mm_needs_release (operands[2])) > return "str<atomic_sfx>\t%<w>1, %0"; > else if (which_alternative == 0) > return "stlr<atomic_sfx>\t%<w>1, %0"; > @@ -499,6 +553,61 @@ > [(set_attr "arch" "*,rcpc8_4")] > ) > > +(define_expand "atomic_storeti" > + [(match_operand:TI 0 "aarch64_atomic_TImode_operand" "") > + (match_operand:TI 1 "aarch64_reg_or_zero" "") > + (match_operand:TI 2 "const_int_operand" "")] > + "aarch64_handle_128bit_atomics" > + { > + if (AARCH64_ISA_AT) > + { > + emit_insn (gen_aarch64_atomic_storeti_stp (operands[0], operands[1], > + operands[2])); > + DONE; > + } > + > + emit_insn (gen_aarch64_atomic_storeti_loop (operands[0], operands[1], > + operands[2])); > + DONE; > + } > +) > + > +(define_insn "aarch64_atomic_storeti_stp" > + [(set (match_operand:TI 0 "aarch64_TImode_pair_operand" "=Umn") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_STL)) ] > + "aarch64_handle_128bit_atomics && AARCH64_ISA_AT" > + { > + if (aarch_mm_needs_release (operands[2])) > + output_asm_insn ("dmb\tish", operands); > + return "stp\t%x1, %H1, %0"; > + } > + [(set (attr "length") > + (if_then_else (match_test "aarch_mm_needs_release (operands[2])") > + (const_int 8) > + (const_int 4)))] > +) > + > +(define_insn_and_split "aarch64_atomic_storeti_loop" > + [(set (match_operand:TI 0 "aarch64_sync_memory_operand" "=Q") > + (unspec_volatile:TI > + [(match_operand:TI 1 "aarch64_reg_or_zero" "rZ") > + (match_operand:SI 2 "const_int_operand")] ;; model > + UNSPECV_STL)) > + (clobber (reg:CC CC_REGNUM)) > + (clobber (match_scratch:TI 3 "=&r"))] > + "aarch64_handle_128bit_atomics && !AARCH64_ISA_AT" > + "#" > + "&& reload_completed" > + [(const_int 0)] > + { > + aarch64_split_atomic_ti_access (operands, false); > + DONE; > + } > +) > + > (define_insn "@aarch64_load_exclusive<mode>" > [(set (match_operand:SI 0 "register_operand" "=r") > (zero_extend:SI > @@ -508,45 +617,52 @@ > UNSPECV_LX)))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release > (model)) > - return "ldxr<atomic_sfx>\t%w0, %1"; > - else > + if (aarch_mm_needs_acquire (operands[2])) > return "ldaxr<atomic_sfx>\t%w0, %1"; > + else > + return "ldxr<atomic_sfx>\t%w0, %1"; > } > ) > > (define_insn "@aarch64_load_exclusive<mode>" > - [(set (match_operand:GPI 0 "register_operand" "=r") > - (unspec_volatile:GPI > - [(match_operand:GPI 1 "aarch64_sync_memory_operand" "Q") > + [(set (match_operand:GPI_TI 0 "register_operand" "=r") > + (unspec_volatile:GPI_TI > + [(match_operand:GPI_TI 1 "aarch64_sync_memory_operand" "Q") > (match_operand:SI 2 "const_int_operand")] > UNSPECV_LX))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release > (model)) > - return "ldxr\t%<w>0, %1"; > + bool acquire_needed = aarch_mm_needs_acquire (operands[2]); > + if (GET_MODE (operands[1]) == TImode) > + return acquire_needed > + ? "ldaxp\t%0, %H0, %1" > + : "ldxp\t%0, %H0, %1"; > else > - return "ldaxr\t%<w>0, %1"; > + return acquire_needed > + ? 
"ldaxr\t%<w>0, %1" > + : "ldxr\t%<w>0, %1"; > } > ) > > (define_insn "@aarch64_store_exclusive<mode>" > [(set (match_operand:SI 0 "register_operand" "=&r") > (unspec_volatile:SI [(const_int 0)] UNSPECV_SX)) > - (set (match_operand:ALLI 1 "aarch64_sync_memory_operand" "=Q") > - (unspec_volatile:ALLI > - [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ") > + (set (match_operand:ALLI_TI 1 "aarch64_sync_memory_operand" "=Q") > + (unspec_volatile:ALLI_TI > + [(match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ") > (match_operand:SI 3 "const_int_operand")] > UNSPECV_SX))] > "" > { > - enum memmodel model = memmodel_from_int (INTVAL (operands[3])); > - if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire > (model)) > - return "stxr<atomic_sfx>\t%w0, %<w>2, %1"; > + bool release_needed = aarch_mm_needs_release (operands[3]); > + if (GET_MODE (operands[1]) == TImode) > + return release_needed > + ? "stlxp\t%w0, %x2, %H2, %1" > + : "stxp\t%w0, %x2, %H2, %1"; > else > - return "stlxr<atomic_sfx>\t%w0, %<w>2, %1"; > + return release_needed > + ? "stlxr<atomic_sfx>\t%w0, %<w>2, %1" > + : "stxr<atomic_sfx>\t%w0, %<w>2, %1"; > } > ) > > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > a43956054e82aaf651fb45d0ff254b248c02c644..8d3fe29f6e4b9a3a7a6c8fc32c1564ef88501fb4 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -26,6 +26,9 @@ > ;; Iterator for General Purpose Integer registers (32- and 64-bit modes) > (define_mode_iterator GPI [SI DI]) > > +;; Iterator for SI, DI, TI. > +(define_mode_iterator GPI_TI [SI DI TI]) > + > ;; Iterator for HI, SI, DI, some instructions can only work on these modes. > (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI]) > > diff --git a/gcc/config/aarch64/predicates.md > b/gcc/config/aarch64/predicates.md > index > 5b08b03c5868c7aa86f8844e3219a6e82717d4f0..a52b8989fe277d2ec3d32ee31d99708cb8ef2ced > 100644 > --- a/gcc/config/aarch64/predicates.md > +++ b/gcc/config/aarch64/predicates.md > @@ -389,6 +389,23 @@ > (match_operand 0 "aarch64_9bit_offset_memory_operand") > (match_operand 0 "aarch64_sync_memory_operand"))) > > +;; Predicate to accept operands for TImode atomic load/store. > +;; Depends on the ISA because the AT extension makes LDP/STP atomic, and they > +;; accept more operands than LDAXP/STLXP. > +(define_predicate "aarch64_TImode_pair_operand" > + (and (match_code "mem") > + (ior (match_code "reg" "0") > + (and (match_code "plus" "0") > + (match_code "reg" "00") > + (match_code "const_int" "01") > + (match_test "aarch64_offset_7bit_signed_scaled_p ( > + DImode, INTVAL (XEXP (XEXP (op, 0), 1)))"))))) > + > +(define_predicate "aarch64_atomic_TImode_operand" > + (if_then_else (match_test "AARCH64_ISA_AT") > + (match_operand 0 "aarch64_TImode_pair_operand") > + (match_operand 0 "aarch64_sync_memory_operand"))) > + > ;; Predicates for parallel expanders based on mode. > (define_special_predicate "vect_par_cnst_hi_half" > (match_code "parallel") > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index > 685c211e176d13809078cf0fd595d49763adef25..d1e5cdfcdfb8e00400d422ed6869d60ebe4f03fb > 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -15075,6 +15075,26 @@ and 2048. @samp{scalable} is the default. > At present, @samp{-msve-vector-bits=128} produces the same output > as @samp{-msve-vector-bits=scalable}. 
>
> +@item -matomic-128bit-instructions
> +@itemx -mno-atomic-128bit-instructions
> +@opindex matomic-128bit-instructions
> +@opindex mno-atomic-128bit-instructions
> +Enable or disable the use of inline 128-bit atomic loads and stores.
> +Without this flag, atomic memory accesses of this size are handled by
> +libatomic.
> +Inline accesses are faster than calls to libatomic but can interrupt
> +accesses made through libatomic; this means that pre-existing code using
> +libatomic is ABI incompatible with code generated using this flag.
> +This option is disabled by default (@samp{-mno-atomic-128bit-instructions}).
> +
> +If this flag is used when targeting a processor that has the atomicity
> +guarantees on the STP and LDP instructions added in Armv8.4, then GCC uses
> +these instructions; otherwise GCC generates a load-exclusive/store-exclusive
> +read-modify-write loop.
> +The use of a read-modify-write loop for an atomic load can cause a
> +segmentation fault when atomically loading a variable that the compiler has
> +placed in read-only memory.
> +
>  @end table
>
>  @subsubsection @option{-march} and @option{-mcpu} Feature Modifiers
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-load128.c b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b43599975db69201771adc6695d67da052be75a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-load128.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */
> +
> +#include <stdlib.h>
> +#include <stdatomic.h>
> +#include <stdint.h>
> +
> +#define RUN_TESTS_NAMED(prefix) \
> +  void \
> +  prefix##128 () \
> +{ \
> +  __int128 *atomic_vals = calloc (4, sizeof (__int128)); \
> +  __int128 temp_val; \
> +  temp_val = atomic_load_explicit (atomic_vals, memory_order_relaxed); \
> +  temp_val = atomic_load_explicit (atomic_vals, memory_order_acquire); \
> +  temp_val = atomic_load_explicit ((atomic_vals + 1), memory_order_acquire); \
> +  temp_val = atomic_load ((atomic_vals + 2)); \
> +  temp_val = atomic_load_explicit ((atomic_vals + 3), memory_order_relaxed); \
> +}
> +
> +RUN_TESTS_NAMED (bar);
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x\[^\n\]*\n\[ \t\]*dmb\tishld" 3 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
> +
> +__attribute__ ((target ("arch=armv8.3-a")))
> +RUN_TESTS_NAMED (foo);
> +/* { dg-final { scan-assembler-times "ldxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
> +/* { dg-final { scan-assembler-times "ldaxp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> index 8cabc05b0d739dbfdcecf681348b62634fcfc9a4..141e4e317d73b12555163c8352218842d4250a37 100644
> --- a/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> @@ -1,23 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-march=armv8.4-a -O2" } */
>
> -#include <stdatomic.h>
> -
> -typedef __INT8_TYPE__ int8_t;
> -typedef __INT16_TYPE__ int16_t;
> -typedef __INT32_TYPE__ int32_t;
> -typedef __INT64_TYPE__ int64_t;
> -
> -#define STORE_TESTS(size) \
> -  void \
> -  foo##size (int##size##_t *atomic_vals) \
> -{ \
> -  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
> -  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
> -  atomic_store_explicit ((atomic_vals + 1), 2,
memory_order_release); \ > - atomic_store ((atomic_vals + 2), 2); \ > - atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \ > -} > +#include "atomic-store.x" > > STORE_TESTS (8); > /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > @@ -26,6 +10,7 @@ STORE_TESTS (8); > /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 1\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, > 3\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } > } */ > > STORE_TESTS (16); > /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > @@ -33,6 +18,7 @@ STORE_TESTS (16); > /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, > 6\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } > } */ > > STORE_TESTS (32); > /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > @@ -40,12 +26,14 @@ STORE_TESTS (32); > /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, > 12\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } > } */ > > STORE_TESTS (64); > /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 16\\\]" 1 } } */ > /* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, > 24\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } > } */ > > void > foo_toolarge_offset (int64_t *atomic_vals) > @@ -64,12 +52,20 @@ foo_negative (int8_t *atomic_vals) > } > /* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > -2\\\]" 1 { target { ! ilp32 } } } } */ > > -#pragma GCC target ("arch=armv8.3-a") > void > +__attribute__ ((target ("arch=armv8.3-a"))) > foo_older_arch (int64_t *atomic_vals) > { > atomic_store_explicit (atomic_vals + 2, 2, memory_order_release); > } > - > /* Three times, one for each of the three above functions. */ > /* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 > } } */ > + > +/* This test is to show that the -matomic-128bit-instructions flag is needed > + * to handle 128 bit atomic store. 
*/ > +typedef __int128 int128_t; > +STORE_TESTS (128); > +/* { dg-final { scan-assembler-not "dmb\tish\n\[ \t\]*stp" } } */ > +/* { dg-final { scan-assembler-not "stxp" } } */ > +/* { dg-final { scan-assembler-not "stlxp" } } */ > +/* { dg-final { scan-assembler-times "bl?\t__atomic_store_16" 6 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.x > b/gcc/testsuite/gcc.target/aarch64/atomic-store.x > new file mode 100644 > index > 0000000000000000000000000000000000000000..5e6261a8d3ec3905b4a850cd33dbd1caa37a186e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.x > @@ -0,0 +1,20 @@ > +#include <stdatomic.h> > + > +typedef __INT8_TYPE__ int8_t; > +typedef __INT16_TYPE__ int16_t; > +typedef __INT32_TYPE__ int32_t; > +typedef __INT64_TYPE__ int64_t; > + > +#define STORE_TESTS_NAMED(size, prefix) \ > +void \ > +prefix##size (int##size##_t *atomic_vals) \ > +{ \ > + atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \ > + atomic_store_explicit (atomic_vals, 2, memory_order_release); \ > + atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \ > + atomic_store ((atomic_vals + 2), 2); \ > + atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \ > + atomic_store_explicit (atomic_vals + 4, 0, memory_order_release); \ > +} > + > +#define STORE_TESTS(size) STORE_TESTS_NAMED(size, foo) > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store128.c > b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..bed864205128e2d8b6deb856d061ad13667cb14b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store128.c > @@ -0,0 +1,74 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=armv8.4-a -O2 -matomic-128bit-instructions" } */ > + > +#include "atomic-store.x" > + > +STORE_TESTS (8); > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > { target { ! 
ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 > { target { ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 1\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\twzr, \\\[x\[0-9\]+, 4\\\]" 1 } > } */ > + > +STORE_TESTS (16); > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\twzr, \\\[x\[0-9\]+, 8\\\]" 1 } > } */ > + > +STORE_TESTS (32); > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\twzr, \\\[x\[0-9\]+, 16\\\]" 1 } > } */ > + > +STORE_TESTS (64); > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 16\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" > 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\txzr, \\\[x\[0-9\]+, 32\\\]" 1 } > } */ > + > +void > +foo_toolarge_offset (int64_t *atomic_vals) > +{ > + /* 9bit signed unscaled immediate => > + largest representable value +255. > + smallest representable value -256. */ > + atomic_store_explicit (atomic_vals + 32, 2, memory_order_release); > + atomic_store_explicit (atomic_vals - 33, 2, memory_order_release); > +} > + > +void > +foo_negative (int8_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals - 2, 2, memory_order_release); > +} > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > -2\\\]" 1 { target { ! ilp32 } } } } */ > + > +void > +__attribute__ ((target ("arch=armv8.3-a"))) > +foo_older_arch (int64_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals + 2, 2, memory_order_release); > +} > +/* Three times, one for each of the three above functions. 
*/
> +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
> +typedef __int128 int128_t;
> +STORE_TESTS (128);
> +/* { dg-final { scan-assembler-times "dmb\tish\n\[ \t\]*stp" 4 } } */
> +/* { dg-final { scan-assembler-times "stp\tx\[0-9\]+, x\[0-9\]+, \\\[x" 5 } } */
> +/* { dg-final { scan-assembler-times "stp\txzr, xzr, \\\[x" 1 } } */
> +
> +
> +__attribute__ ((target ("arch=armv8.3-a")))
> +STORE_TESTS_NAMED (128, bar);
> +/* { dg-final { scan-assembler-times "stxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 2 } } */
> +/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, x\[0-9\]+, x\[0-9\]+, \\\[x" 3 } } */
> +/* { dg-final { scan-assembler-times "stlxp\tw\[0-9\]+, xzr, xzr, \\\[x" 1 } } */
>
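For completeness, a sketch (again my own, not part of the patch; register
numbers and the label are illustrative) of the code shape the stxp/stlxp scans
above are looking for on the non-Armv8.4-a path:

#include <stdatomic.h>

void
store_release_128 (__int128 *p, __int128 v)
{
  /* With -matomic-128bit-instructions on arch=armv8.3-a, the split in
     aarch64_split_atomic_ti_access should turn this into a retry loop
     roughly of the form:
        .L1:
            ldxp   x4, x5, [x0]       // relaxed exclusive load, value unused
            stlxp  w4, x2, x3, [x0]   // store-release exclusive of v
            cbnz   w4, .L1            // retry if the exclusive monitor was lost
     whereas with the Armv8.4-a guarantees it should become DMB ISH followed
     by a single STP.  */
  atomic_store_explicit (p, v, memory_order_release);
}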