On 18/09/18 10:15, Matthew Malcomson wrote: > [PATCH][GCC][AARCH64] Use STLUR for atomic_store > > Use the STLUR instruction introduced in Armv8.4-a. > This instruction has the store-release semantic like STLR but can take a > 9-bit unscaled signed immediate offset. > > Example test case: > ``` > void > foo () > { > int32_t *atomic_vals = calloc (4, sizeof (int32_t)); > atomic_store_explicit (atomic_vals + 1, 2, memory_order_release); > } > ``` > > Before patch generates > ``` > foo: > stp x29, x30, [sp, -16]! > mov x1, 4 > mov x0, x1 > mov x29, sp > bl calloc > mov w1, 2 > add x0, x0, 4 > stlr w1, [x0] > ldp x29, x30, [sp], 16 > ret > ``` > > After patch generates > ``` > foo: > stp x29, x30, [sp, -16]! > mov x1, 4 > mov x0, x1 > mov x29, sp > bl calloc > mov w1, 2 > stlur w1, [x0, 4] > ldp x29, x30, [sp], 16 > ret > ``` > > We introduce a new feature flag to indicate the presence of this instruction. > The feature flag is called AARCH64_ISA_RCPC8_4 and is included when targeting > armv8.4 architecture. > > We also introduce an "arch" attribute to be checked called "rcpc8_4" after > this > feature flag. > > Full bootstrap and regression test done on aarch64-none-linux-gnu. > Ok for trunk? > > gcc/ > > 2018-09-18 Matthew Malcomson <matthew.malcom...@arm.com> > > * config/aarch64/aarch64-protos.h > (aarch64_offset_9bit_signed_unscaled_p): New declaration. > * config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value. > (arch_enabled): Add check for "rcpc8_4" attribute value of "arch". > * config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield. > (AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4. > (AARCH64_FL_PROFILE): Move index so flags are ordered. > (AARCH64_ISA_RCPC8_4): New flag. > * config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed > to aarch64_offset_9bit_signed_unscaled_p. > * config/aarch64/atomics.md (atomic_store<mode>): Allow offset > and use stlur. > * config/aarch64/constraints.md (Ust): New constraint. 
> * config/aarch64/predicates.md. > (aarch64_9bit_offset_memory_operand): New predicate. > > gcc/testsuite/ > > 2018-09-18 Matthew Malcomson <matthew.malcom...@arm.com> > > * gcc.target/aarch64/atomic-store.c: New. > > > ############### Attachment also inlined for ease of reply > ############### > > > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index > ef95fc829b83886e2ff00e4664e31af916e99b0c..7a6254e46893fb36dc2ae57e7cfe78af67fb0e49 > 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, > rtx, rtx, rtx); > bool aarch64_mov_operand_p (rtx, machine_mode); > rtx aarch64_reverse_mask (machine_mode, unsigned int); > bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64); > +bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64); > char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx); > char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx); > char *aarch64_output_sve_inc_dec_immediate (const char *, rtx); > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > c1218503bab19323eee1cca8b7e4bea8fbfcf573..cc21e1656b75b4ed1e94d0eb4b2b3af0039ba47e > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_FL_SM4 (1 << 17) /* Has ARMv8.4-A SM3 and SM4. > */ > #define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and > SHA512. */ > #define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions. > */ > +#define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions. > */ > > /* Statistical Profiling extensions. */ > -#define AARCH64_FL_PROFILE (1 << 20) > +#define AARCH64_FL_PROFILE (1 << 21) > > /* Has FP and SIMD. 
*/ > #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) > @@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version; > (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3) > #define AARCH64_FL_FOR_ARCH8_4 \ > (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \ > - | AARCH64_FL_DOTPROD) > + | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4) > > /* Macros to test ISA flags. */ > > @@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_SM4 (aarch64_isa_flags & AARCH64_FL_SM4) > #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3) > #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML) > +#define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4) > > /* Crypto is an optional extension to AdvSIMD. */ > #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 0d7ca9998466d8d4f9e79faf451a281f8d154d7d..b1a963689a35d406bf383ea7f90c8c2087be7c0a > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -4490,9 +4490,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, > poly_int64 offset) > > /* Return true if OFFSET is a signed 9-bit value. */ > > -static inline bool > -offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, > - poly_int64 offset) > +bool > +aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, > + poly_int64 offset) > { > HOST_WIDE_INT const_offset; > return (offset.is_constant (&const_offset) > @@ -5767,7 +5767,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > instruction memory accesses. 
*/ > if (mode == TImode || mode == TFmode) > return (aarch64_offset_7bit_signed_scaled_p (DImode, offset) > - && (offset_9bit_signed_unscaled_p (mode, offset) > + && (aarch64_offset_9bit_signed_unscaled_p (mode, offset) > || offset_12bit_unsigned_scaled_p (mode, offset))); > > /* A 7bit offset check because OImode will emit a ldp/stp > @@ -5781,7 +5781,8 @@ aarch64_classify_address (struct aarch64_address_info > *info, > ldr/str instructions (only big endian will get here). */ > if (mode == CImode) > return (aarch64_offset_7bit_signed_scaled_p (TImode, offset) > - && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32) > + && (aarch64_offset_9bit_signed_unscaled_p (V16QImode, > + offset + 32) > || offset_12bit_unsigned_scaled_p (V16QImode, > offset + 32))); > > @@ -5821,7 +5822,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > || known_eq (GET_MODE_SIZE (mode), 16)) > && aarch64_offset_7bit_signed_scaled_p (mode, offset)); > else > - return (offset_9bit_signed_unscaled_p (mode, offset) > + return (aarch64_offset_9bit_signed_unscaled_p (mode, offset) > || offset_12bit_unsigned_scaled_p (mode, offset)); > } > > @@ -5874,7 +5875,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > */ > if (mode == TImode || mode == TFmode) > return (aarch64_offset_7bit_signed_scaled_p (mode, offset) > - && offset_9bit_signed_unscaled_p (mode, offset)); > + && aarch64_offset_9bit_signed_unscaled_p (mode, offset)); > > if (load_store_pair_p) > return ((known_eq (GET_MODE_SIZE (mode), 4) > @@ -5882,7 +5883,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > || known_eq (GET_MODE_SIZE (mode), 16)) > && aarch64_offset_7bit_signed_scaled_p (mode, offset)); > else > - return offset_9bit_signed_unscaled_p (mode, offset); > + return aarch64_offset_9bit_signed_unscaled_p (mode, offset); > } > return false; > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index > 
955769a64d2030839cdb337321a808626188190e..7158bf0f2efdfb00763af13ac29c54a6723f19fa > 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -263,7 +263,7 @@ > ;; alternative). This attribute is used to compute attribute "enabled", use > type > ;; "any" to enable an alternative in all cases. > > -(define_enum "arches" [ any fp simd sve fp16]) > +(define_enum "arches" [ any rcpc8_4 fp simd sve fp16]) > > (define_enum_attr "arch" "arches" (const_string "any")) > > @@ -285,6 +285,9 @@ > (ior > (eq_attr "arch" "any") > > + (and (eq_attr "arch" "rcpc8_4") > + (match_test "AARCH64_ISA_RCPC8_4")) > + > (and (eq_attr "arch" "fp") > (match_test "TARGET_FLOAT")) > > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md > index > 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec > 100644 > --- a/gcc/config/aarch64/atomics.md > +++ b/gcc/config/aarch64/atomics.md > @@ -481,9 +481,9 @@ > ) > > (define_insn "atomic_store<mode>" > - [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q") > + [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust")
This is less than ideal because, on earlier architectures, the predicate will allow the offset variants, but register allocation will then have to undo that to match the first alternative (the only one enabled there). I think what we should do is define a wrapped variant of aarch64_9bit_offset_memory_operand which uses that predicate but only allows the offset when RCPC8_4 is enabled. Something like: aarch64_rcpc_memory_operand (...) { if (TARGET_RCPC8_4) return aarch64_9bit_offset_memory_operand (...); return aarch64_sync_memory_operand (...); } OK with that change. R. > (unspec_volatile:ALLI > - [(match_operand:ALLI 1 "general_operand" "rZ") > + [(match_operand:ALLI 1 "general_operand" "rZ,rZ") > (match_operand:SI 2 "const_int_operand")] ;; model > UNSPECV_STL))] > "" > @@ -491,9 +491,12 @@ > enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire > (model)) > return "str<atomic_sfx>\t%<w>1, %0"; > - else > + else if (which_alternative == 0) > return "stlr<atomic_sfx>\t%<w>1, %0"; > + else > + return "stlur<atomic_sfx>\t%<w>1, %0"; > } > + [(set_attr "arch" "*,rcpc8_4")] > ) > > (define_insn "@aarch64_load_exclusive<mode>" > diff --git a/gcc/config/aarch64/constraints.md > b/gcc/config/aarch64/constraints.md > index > 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..809b35e5fd377a8c6245138e0639c3afc41c7c13 > 100644 > --- a/gcc/config/aarch64/constraints.md > +++ b/gcc/config/aarch64/constraints.md > @@ -218,6 +218,11 @@ > (and (match_code "mem") > (match_test "REG_P (XEXP (op, 0))"))) > > +(define_memory_constraint "Ust" > + "@internal > + A memory address with 9bit unscaled offset." > + (match_operand 0 "aarch64_9bit_offset_memory_operand")) > + > (define_memory_constraint "Ump" > "@internal > A memory address suitable for a load/store pair operation." 
> diff --git a/gcc/config/aarch64/predicates.md > b/gcc/config/aarch64/predicates.md > index > d8f377b9603e76a29dd92f95e9905121eaf7b800..8016344f0e79bf881bfbe37547f115d094a66d0a > 100644 > --- a/gcc/config/aarch64/predicates.md > +++ b/gcc/config/aarch64/predicates.md > @@ -359,6 +359,31 @@ > (and (match_operand 0 "memory_operand") > (match_code "reg" "0"))) > > +(define_predicate "aarch64_9bit_offset_memory_operand" > + (and (match_operand 0 "memory_operand") > + (ior (match_code "reg" "0") > + (and (match_code "plus" "0") > + (match_code "reg" "00") > + (match_code "const_int" "01")))) > +{ > + rtx mem_op = XEXP (op, 0); > + > + if (REG_P (mem_op)) > + return GET_MODE (mem_op) == DImode; > + > + rtx plus_op0 = XEXP (mem_op, 0); > + rtx plus_op1 = XEXP (mem_op, 1); > + > + if (GET_MODE (plus_op0) != DImode) > + return false; > + > + poly_int64 offset; > + if (!poly_int_rtx_p (plus_op1, &offset)) > + gcc_unreachable (); > + > + return aarch64_offset_9bit_signed_unscaled_p (mode, offset); > +}) > + > ;; Predicates for parallel expanders based on mode. 
> (define_special_predicate "vect_par_cnst_hi_half" > (match_code "parallel") > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c > b/gcc/testsuite/gcc.target/aarch64/atomic-store.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..8cabc05b0d739dbfdcecf681348b62634fcfc9a4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c > @@ -0,0 +1,75 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=armv8.4-a -O2" } */ > + > +#include <stdatomic.h> > + > +typedef __INT8_TYPE__ int8_t; > +typedef __INT16_TYPE__ int16_t; > +typedef __INT32_TYPE__ int32_t; > +typedef __INT64_TYPE__ int64_t; > + > +#define STORE_TESTS(size) \ > + void \ > + foo##size (int##size##_t *atomic_vals) \ > +{ \ > + atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \ > + atomic_store_explicit (atomic_vals, 2, memory_order_release); \ > + atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \ > + atomic_store ((atomic_vals + 2), 2); \ > + atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \ > +} > + > +STORE_TESTS (8); > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > { target { ! 
ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 > { target { ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 1\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" > 1 } } */ > + > +STORE_TESTS (16); > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" > 1 } } */ > + > +STORE_TESTS (32); > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" > 1 } } */ > + > +STORE_TESTS (64); > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 16\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" > 1 } } */ > + > +void > +foo_toolarge_offset (int64_t *atomic_vals) > +{ > + /* 9bit signed unscaled immediate => > + largest representable value +255. > + smallest representable value -256. 
*/ > + atomic_store_explicit (atomic_vals + 32, 2, memory_order_release); > + atomic_store_explicit (atomic_vals - 33, 2, memory_order_release); > +} > + > +void > +foo_negative (int8_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals - 2, 2, memory_order_release); > +} > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > -2\\\]" 1 { target { ! ilp32 } } } } */ > + > +#pragma GCC target ("arch=armv8.3-a") > +void > +foo_older_arch (int64_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals + 2, 2, memory_order_release); > +} > + > +/* Four times: once in foo64, twice in foo_toolarge_offset and once in foo_older_arch. */ > +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 > } } */ > > > stlur-use.patch > > > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index > ef95fc829b83886e2ff00e4664e31af916e99b0c..7a6254e46893fb36dc2ae57e7cfe78af67fb0e49 > 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, > rtx, rtx, rtx); > bool aarch64_mov_operand_p (rtx, machine_mode); > rtx aarch64_reverse_mask (machine_mode, unsigned int); > bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64); > +bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64); > char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx); > char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx); > char *aarch64_output_sve_inc_dec_immediate (const char *, rtx); > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > c1218503bab19323eee1cca8b7e4bea8fbfcf573..cc21e1656b75b4ed1e94d0eb4b2b3af0039ba47e > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_FL_SM4 (1 << 17) /* Has ARMv8.4-A SM3 and SM4. 
> */ > #define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and > SHA512. */ > #define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions. > */ > +#define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions. > */ > > /* Statistical Profiling extensions. */ > -#define AARCH64_FL_PROFILE (1 << 20) > +#define AARCH64_FL_PROFILE (1 << 21) > > /* Has FP and SIMD. */ > #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) > @@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version; > (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3) > #define AARCH64_FL_FOR_ARCH8_4 \ > (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \ > - | AARCH64_FL_DOTPROD) > + | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4) > > /* Macros to test ISA flags. */ > > @@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_SM4 (aarch64_isa_flags & AARCH64_FL_SM4) > #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3) > #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML) > +#define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4) > > /* Crypto is an optional extension to AdvSIMD. */ > #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 0d7ca9998466d8d4f9e79faf451a281f8d154d7d..b1a963689a35d406bf383ea7f90c8c2087be7c0a > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -4490,9 +4490,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, > poly_int64 offset) > > /* Return true if OFFSET is a signed 9-bit value. 
*/ > > -static inline bool > -offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, > - poly_int64 offset) > +bool > +aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, > + poly_int64 offset) > { > HOST_WIDE_INT const_offset; > return (offset.is_constant (&const_offset) > @@ -5767,7 +5767,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > instruction memory accesses. */ > if (mode == TImode || mode == TFmode) > return (aarch64_offset_7bit_signed_scaled_p (DImode, offset) > - && (offset_9bit_signed_unscaled_p (mode, offset) > + && (aarch64_offset_9bit_signed_unscaled_p (mode, offset) > || offset_12bit_unsigned_scaled_p (mode, offset))); > > /* A 7bit offset check because OImode will emit a ldp/stp > @@ -5781,7 +5781,8 @@ aarch64_classify_address (struct aarch64_address_info > *info, > ldr/str instructions (only big endian will get here). */ > if (mode == CImode) > return (aarch64_offset_7bit_signed_scaled_p (TImode, offset) > - && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32) > + && (aarch64_offset_9bit_signed_unscaled_p (V16QImode, > + offset + 32) > || offset_12bit_unsigned_scaled_p (V16QImode, > offset + 32))); > > @@ -5821,7 +5822,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > || known_eq (GET_MODE_SIZE (mode), 16)) > && aarch64_offset_7bit_signed_scaled_p (mode, offset)); > else > - return (offset_9bit_signed_unscaled_p (mode, offset) > + return (aarch64_offset_9bit_signed_unscaled_p (mode, offset) > || offset_12bit_unsigned_scaled_p (mode, offset)); > } > > @@ -5874,7 +5875,7 @@ aarch64_classify_address (struct aarch64_address_info > *info, > */ > if (mode == TImode || mode == TFmode) > return (aarch64_offset_7bit_signed_scaled_p (mode, offset) > - && offset_9bit_signed_unscaled_p (mode, offset)); > + && aarch64_offset_9bit_signed_unscaled_p (mode, offset)); > > if (load_store_pair_p) > return ((known_eq (GET_MODE_SIZE (mode), 4) > @@ -5882,7 +5883,7 @@ 
aarch64_classify_address (struct aarch64_address_info > *info, > || known_eq (GET_MODE_SIZE (mode), 16)) > && aarch64_offset_7bit_signed_scaled_p (mode, offset)); > else > - return offset_9bit_signed_unscaled_p (mode, offset); > + return aarch64_offset_9bit_signed_unscaled_p (mode, offset); > } > return false; > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index > 955769a64d2030839cdb337321a808626188190e..7158bf0f2efdfb00763af13ac29c54a6723f19fa > 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -263,7 +263,7 @@ > ;; alternative). This attribute is used to compute attribute "enabled", use > type > ;; "any" to enable an alternative in all cases. > > -(define_enum "arches" [ any fp simd sve fp16]) > +(define_enum "arches" [ any rcpc8_4 fp simd sve fp16]) > > (define_enum_attr "arch" "arches" (const_string "any")) > > @@ -285,6 +285,9 @@ > (ior > (eq_attr "arch" "any") > > + (and (eq_attr "arch" "rcpc8_4") > + (match_test "AARCH64_ISA_RCPC8_4")) > + > (and (eq_attr "arch" "fp") > (match_test "TARGET_FLOAT")) > > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md > index > 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec > 100644 > --- a/gcc/config/aarch64/atomics.md > +++ b/gcc/config/aarch64/atomics.md > @@ -481,9 +481,9 @@ > ) > > (define_insn "atomic_store<mode>" > - [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q") > + [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust") > (unspec_volatile:ALLI > - [(match_operand:ALLI 1 "general_operand" "rZ") > + [(match_operand:ALLI 1 "general_operand" "rZ,rZ") > (match_operand:SI 2 "const_int_operand")] ;; model > UNSPECV_STL))] > "" > @@ -491,9 +491,12 @@ > enum memmodel model = memmodel_from_int (INTVAL (operands[2])); > if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire > (model)) > return "str<atomic_sfx>\t%<w>1, %0"; > - else > + else if 
(which_alternative == 0) > return "stlr<atomic_sfx>\t%<w>1, %0"; > + else > + return "stlur<atomic_sfx>\t%<w>1, %0"; > } > + [(set_attr "arch" "*,rcpc8_4")] > ) > > (define_insn "@aarch64_load_exclusive<mode>" > diff --git a/gcc/config/aarch64/constraints.md > b/gcc/config/aarch64/constraints.md > index > 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..809b35e5fd377a8c6245138e0639c3afc41c7c13 > 100644 > --- a/gcc/config/aarch64/constraints.md > +++ b/gcc/config/aarch64/constraints.md > @@ -218,6 +218,11 @@ > (and (match_code "mem") > (match_test "REG_P (XEXP (op, 0))"))) > > +(define_memory_constraint "Ust" > + "@internal > + A memory address with 9bit unscaled offset." > + (match_operand 0 "aarch64_9bit_offset_memory_operand")) > + > (define_memory_constraint "Ump" > "@internal > A memory address suitable for a load/store pair operation." > diff --git a/gcc/config/aarch64/predicates.md > b/gcc/config/aarch64/predicates.md > index > d8f377b9603e76a29dd92f95e9905121eaf7b800..8016344f0e79bf881bfbe37547f115d094a66d0a > 100644 > --- a/gcc/config/aarch64/predicates.md > +++ b/gcc/config/aarch64/predicates.md > @@ -359,6 +359,31 @@ > (and (match_operand 0 "memory_operand") > (match_code "reg" "0"))) > > +(define_predicate "aarch64_9bit_offset_memory_operand" > + (and (match_operand 0 "memory_operand") > + (ior (match_code "reg" "0") > + (and (match_code "plus" "0") > + (match_code "reg" "00") > + (match_code "const_int" "01")))) > +{ > + rtx mem_op = XEXP (op, 0); > + > + if (REG_P (mem_op)) > + return GET_MODE (mem_op) == DImode; > + > + rtx plus_op0 = XEXP (mem_op, 0); > + rtx plus_op1 = XEXP (mem_op, 1); > + > + if (GET_MODE (plus_op0) != DImode) > + return false; > + > + poly_int64 offset; > + if (!poly_int_rtx_p (plus_op1, &offset)) > + gcc_unreachable (); > + > + return aarch64_offset_9bit_signed_unscaled_p (mode, offset); > +}) > + > ;; Predicates for parallel expanders based on mode. 
> (define_special_predicate "vect_par_cnst_hi_half" > (match_code "parallel") > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c > b/gcc/testsuite/gcc.target/aarch64/atomic-store.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..8cabc05b0d739dbfdcecf681348b62634fcfc9a4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c > @@ -0,0 +1,75 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=armv8.4-a -O2" } */ > + > +#include <stdatomic.h> > + > +typedef __INT8_TYPE__ int8_t; > +typedef __INT16_TYPE__ int16_t; > +typedef __INT32_TYPE__ int32_t; > +typedef __INT64_TYPE__ int64_t; > + > +#define STORE_TESTS(size) \ > + void \ > + foo##size (int##size##_t *atomic_vals) \ > +{ \ > + atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \ > + atomic_store_explicit (atomic_vals, 2, memory_order_release); \ > + atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \ > + atomic_store ((atomic_vals + 2), 2); \ > + atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \ > +} > + > +STORE_TESTS (8); > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > { target { ! 
ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 > { target { ilp32 } } } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 1\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" > 1 } } */ > + > +STORE_TESTS (16); > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 2\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" > 1 } } */ > + > +STORE_TESTS (32); > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 4\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" > 1 } } */ > + > +STORE_TESTS (64); > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } > } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 8\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, > 16\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" > 1 } } */ > + > +void > +foo_toolarge_offset (int64_t *atomic_vals) > +{ > + /* 9bit signed unscaled immediate => > + largest representable value +255. > + smallest representable value -256. 
*/ > + atomic_store_explicit (atomic_vals + 32, 2, memory_order_release); > + atomic_store_explicit (atomic_vals - 33, 2, memory_order_release); > +} > + > +void > +foo_negative (int8_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals - 2, 2, memory_order_release); > +} > +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, > -2\\\]" 1 { target { ! ilp32 } } } } */ > + > +#pragma GCC target ("arch=armv8.3-a") > +void > +foo_older_arch (int64_t *atomic_vals) > +{ > + atomic_store_explicit (atomic_vals + 2, 2, memory_order_release); > +} > + > +/* Four times: once in foo64, twice in foo_toolarge_offset and once in foo_older_arch. */ > +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 > } } */ >