On 18/09/18 10:15, Matthew Malcomson wrote:
> [PATCH][GCC][AARCH64] Use STLUR for atomic_store
>
> Use the STLUR instruction introduced in Armv8.4-a.
> This instruction has the store-release semantic like STLR but can take a
> 9-bit unscaled signed immediate offset.
>
> Example test case:
> ```
> void
> foo ()
> {
> int32_t *atomic_vals = calloc (4, sizeof (int32_t));
> atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
> }
> ```
>
> Before patch generates
> ```
> foo:
> stp x29, x30, [sp, -16]!
> mov x1, 4
> mov x0, x1
> mov x29, sp
> bl calloc
> mov w1, 2
> add x0, x0, 4
> stlr w1, [x0]
> ldp x29, x30, [sp], 16
> ret
> ```
>
> After patch generates
> ```
> foo:
> stp x29, x30, [sp, -16]!
> mov x1, 4
> mov x0, x1
> mov x29, sp
> bl calloc
> mov w1, 2
> stlur w1, [x0, 4]
> ldp x29, x30, [sp], 16
> ret
> ```
>
> We introduce a new feature flag to indicate the presence of this instruction.
> The feature flag is called AARCH64_ISA_RCPC8_4 and is included when targeting
> armv8.4 architecture.
>
> We also introduce a new "arch" attribute value, "rcpc8_4", named after this
> feature flag, which can be checked on instruction alternatives.
>
> Full bootstrap and regression test done on aarch64-none-linux-gnu.
> Ok for trunk?
>
> gcc/
>
> 2018-09-18 Matthew Malcomson <[email protected]>
>
> * config/aarch64/aarch64-protos.h
> (aarch64_offset_9bit_signed_unscaled_p): New declaration.
> * config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
> (arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
> * config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
> (AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
> (AARCH64_FL_PROFILE): Move index so flags are ordered.
> (AARCH64_ISA_RCPC8_4): New flag.
> * config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
> to aarch64_offset_9bit_signed_unscaled_p.
> * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
> and use stlur.
> * config/aarch64/constraints.md (Ust): New constraint.
> * config/aarch64/predicates.md
> (aarch64_9bit_offset_memory_operand): New predicate.
>
> gcc/testsuite/
>
> 2018-09-18 Matthew Malcomson <[email protected]>
>
> * gcc.target/aarch64/atomic-store.c: New.
>
>
> ############### Attachment also inlined for ease of reply
> ###############
>
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> index
> ef95fc829b83886e2ff00e4664e31af916e99b0c..7a6254e46893fb36dc2ae57e7cfe78af67fb0e49
> 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx,
> rtx, rtx, rtx);
> bool aarch64_mov_operand_p (rtx, machine_mode);
> rtx aarch64_reverse_mask (machine_mode, unsigned int);
> bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
> +bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
> char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
> char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
> char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index
> c1218503bab19323eee1cca8b7e4bea8fbfcf573..cc21e1656b75b4ed1e94d0eb4b2b3af0039ba47e
> 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version;
> #define AARCH64_FL_SM4 (1 << 17) /* Has ARMv8.4-A SM3 and SM4.
> */
> #define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and
> SHA512. */
> #define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions.
> */
> +#define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions.
> */
>
> /* Statistical Profiling extensions. */
> -#define AARCH64_FL_PROFILE (1 << 20)
> +#define AARCH64_FL_PROFILE (1 << 21)
>
> /* Has FP and SIMD. */
> #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
> @@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version;
> (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
> #define AARCH64_FL_FOR_ARCH8_4 \
> (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
> - | AARCH64_FL_DOTPROD)
> + | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
>
> /* Macros to test ISA flags. */
>
> @@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version;
> #define AARCH64_ISA_SM4 (aarch64_isa_flags & AARCH64_FL_SM4)
> #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3)
> #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML)
> +#define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
>
> /* Crypto is an optional extension to AdvSIMD. */
> #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index
> 0d7ca9998466d8d4f9e79faf451a281f8d154d7d..b1a963689a35d406bf383ea7f90c8c2087be7c0a
> 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4490,9 +4490,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode,
> poly_int64 offset)
>
> /* Return true if OFFSET is a signed 9-bit value. */
>
> -static inline bool
> -offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
> - poly_int64 offset)
> +bool
> +aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
> + poly_int64 offset)
> {
> HOST_WIDE_INT const_offset;
> return (offset.is_constant (&const_offset)
> @@ -5767,7 +5767,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> instruction memory accesses. */
> if (mode == TImode || mode == TFmode)
> return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
> - && (offset_9bit_signed_unscaled_p (mode, offset)
> + && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
> || offset_12bit_unsigned_scaled_p (mode, offset)));
>
> /* A 7bit offset check because OImode will emit a ldp/stp
> @@ -5781,7 +5781,8 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> ldr/str instructions (only big endian will get here). */
> if (mode == CImode)
> return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
> - && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
> + && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
> + offset + 32)
> || offset_12bit_unsigned_scaled_p (V16QImode,
> offset + 32)));
>
> @@ -5821,7 +5822,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> || known_eq (GET_MODE_SIZE (mode), 16))
> && aarch64_offset_7bit_signed_scaled_p (mode, offset));
> else
> - return (offset_9bit_signed_unscaled_p (mode, offset)
> + return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
> || offset_12bit_unsigned_scaled_p (mode, offset));
> }
>
> @@ -5874,7 +5875,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> */
> if (mode == TImode || mode == TFmode)
> return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
> - && offset_9bit_signed_unscaled_p (mode, offset));
> + && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
>
> if (load_store_pair_p)
> return ((known_eq (GET_MODE_SIZE (mode), 4)
> @@ -5882,7 +5883,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> || known_eq (GET_MODE_SIZE (mode), 16))
> && aarch64_offset_7bit_signed_scaled_p (mode, offset));
> else
> - return offset_9bit_signed_unscaled_p (mode, offset);
> + return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
> }
> return false;
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index
> 955769a64d2030839cdb337321a808626188190e..7158bf0f2efdfb00763af13ac29c54a6723f19fa
> 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -263,7 +263,7 @@
> ;; alternative). This attribute is used to compute attribute "enabled", use
> type
> ;; "any" to enable an alternative in all cases.
>
> -(define_enum "arches" [ any fp simd sve fp16])
> +(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
>
> (define_enum_attr "arch" "arches" (const_string "any"))
>
> @@ -285,6 +285,9 @@
> (ior
> (eq_attr "arch" "any")
>
> + (and (eq_attr "arch" "rcpc8_4")
> + (match_test "AARCH64_ISA_RCPC8_4"))
> +
> (and (eq_attr "arch" "fp")
> (match_test "TARGET_FLOAT"))
>
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index
> 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec
> 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -481,9 +481,9 @@
> )
>
> (define_insn "atomic_store<mode>"
> - [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
> + [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust")
This is less than ideal because on earlier architectures the predicate
will allow the offset variants but register allocation will then have to
undo that to match the first alternative.
I think what we should do is define a wrapped variant of
aarch64_9bit_offset_memory_operand which uses that function but only
allows the offset when RCPC8_4 is enabled.
Something like
aarch64_rcpc_memory_operand (...)
{
if (TARGET_RCPC8_4)
return aarch64_9bit_offset_memory_operand (...);
return aarch64_sync_memory_operand (...);
}
OK with that change.
R.
> (unspec_volatile:ALLI
> - [(match_operand:ALLI 1 "general_operand" "rZ")
> + [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
> (match_operand:SI 2 "const_int_operand")] ;; model
> UNSPECV_STL))]
> ""
> @@ -491,9 +491,12 @@
> enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
> if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire
> (model))
> return "str<atomic_sfx>\t%<w>1, %0";
> - else
> + else if (which_alternative == 0)
> return "stlr<atomic_sfx>\t%<w>1, %0";
> + else
> + return "stlur<atomic_sfx>\t%<w>1, %0";
> }
> + [(set_attr "arch" "*,rcpc8_4")]
> )
>
> (define_insn "@aarch64_load_exclusive<mode>"
> diff --git a/gcc/config/aarch64/constraints.md
> b/gcc/config/aarch64/constraints.md
> index
> 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..809b35e5fd377a8c6245138e0639c3afc41c7c13
> 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -218,6 +218,11 @@
> (and (match_code "mem")
> (match_test "REG_P (XEXP (op, 0))")))
>
> +(define_memory_constraint "Ust"
> + "@internal
> + A memory address with 9bit unscaled offset."
> + (match_operand 0 "aarch64_9bit_offset_memory_operand"))
> +
> (define_memory_constraint "Ump"
> "@internal
> A memory address suitable for a load/store pair operation."
> diff --git a/gcc/config/aarch64/predicates.md
> b/gcc/config/aarch64/predicates.md
> index
> d8f377b9603e76a29dd92f95e9905121eaf7b800..8016344f0e79bf881bfbe37547f115d094a66d0a
> 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -359,6 +359,31 @@
> (and (match_operand 0 "memory_operand")
> (match_code "reg" "0")))
>
> +(define_predicate "aarch64_9bit_offset_memory_operand"
> + (and (match_operand 0 "memory_operand")
> + (ior (match_code "reg" "0")
> + (and (match_code "plus" "0")
> + (match_code "reg" "00")
> + (match_code "const_int" "01"))))
> +{
> + rtx mem_op = XEXP (op, 0);
> +
> + if (REG_P (mem_op))
> + return GET_MODE (mem_op) == DImode;
> +
> + rtx plus_op0 = XEXP (mem_op, 0);
> + rtx plus_op1 = XEXP (mem_op, 1);
> +
> + if (GET_MODE (plus_op0) != DImode)
> + return false;
> +
> + poly_int64 offset;
> + if (!poly_int_rtx_p (plus_op1, &offset))
> + gcc_unreachable ();
> +
> + return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
> +})
> +
> ;; Predicates for parallel expanders based on mode.
> (define_special_predicate "vect_par_cnst_hi_half"
> (match_code "parallel")
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..8cabc05b0d739dbfdcecf681348b62634fcfc9a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> @@ -0,0 +1,75 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv8.4-a -O2" } */
> +
> +#include <stdatomic.h>
> +
> +typedef __INT8_TYPE__ int8_t;
> +typedef __INT16_TYPE__ int16_t;
> +typedef __INT32_TYPE__ int32_t;
> +typedef __INT64_TYPE__ int64_t;
> +
> +#define STORE_TESTS(size) \
> + void \
> + foo##size (int##size##_t *atomic_vals) \
> +{ \
> + atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
> + atomic_store_explicit (atomic_vals, 2, memory_order_release); \
> + atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
> + atomic_store ((atomic_vals + 2), 2); \
> + atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
> +}
> +
> +STORE_TESTS (8);
> +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> { target { ! ilp32 } } } } */
> +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2
> { target { ilp32 } } } } */
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+,
> 1\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+,
> 2\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]"
> 1 } } */
> +
> +STORE_TESTS (16);
> +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+,
> 2\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+,
> 4\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]"
> 1 } } */
> +
> +STORE_TESTS (32);
> +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 }
> } */
> +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+,
> 4\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+,
> 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]"
> 1 } } */
> +
> +STORE_TESTS (64);
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 }
> } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+,
> 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+,
> 16\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]"
> 1 } } */
> +
> +void
> +foo_toolarge_offset (int64_t *atomic_vals)
> +{
> + /* 9bit signed unscaled immediate =>
> + largest representable value +255.
> + smallest representable value -256. */
> + atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
> + atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
> +}
> +
> +void
> +foo_negative (int8_t *atomic_vals)
> +{
> + atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
> +}
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+,
> -2\\\]" 1 { target { ! ilp32 } } } } */
> +
> +#pragma GCC target ("arch=armv8.3-a")
> +void
> +foo_older_arch (int64_t *atomic_vals)
> +{
> + atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
> +}
> +
> +/* Four matches expected: one from foo64 above, two from
> +   foo_toolarge_offset, and one from foo_older_arch.  */
> +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4
> } } */
>
>
> stlur-use.patch
>
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> index
> ef95fc829b83886e2ff00e4664e31af916e99b0c..7a6254e46893fb36dc2ae57e7cfe78af67fb0e49
> 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx,
> rtx, rtx, rtx);
> bool aarch64_mov_operand_p (rtx, machine_mode);
> rtx aarch64_reverse_mask (machine_mode, unsigned int);
> bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
> +bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
> char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
> char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
> char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index
> c1218503bab19323eee1cca8b7e4bea8fbfcf573..cc21e1656b75b4ed1e94d0eb4b2b3af0039ba47e
> 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version;
> #define AARCH64_FL_SM4 (1 << 17) /* Has ARMv8.4-A SM3 and SM4.
> */
> #define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and
> SHA512. */
> #define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions.
> */
> +#define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions.
> */
>
> /* Statistical Profiling extensions. */
> -#define AARCH64_FL_PROFILE (1 << 20)
> +#define AARCH64_FL_PROFILE (1 << 21)
>
> /* Has FP and SIMD. */
> #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
> @@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version;
> (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
> #define AARCH64_FL_FOR_ARCH8_4 \
> (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
> - | AARCH64_FL_DOTPROD)
> + | AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
>
> /* Macros to test ISA flags. */
>
> @@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version;
> #define AARCH64_ISA_SM4 (aarch64_isa_flags & AARCH64_FL_SM4)
> #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3)
> #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML)
> +#define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
>
> /* Crypto is an optional extension to AdvSIMD. */
> #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index
> 0d7ca9998466d8d4f9e79faf451a281f8d154d7d..b1a963689a35d406bf383ea7f90c8c2087be7c0a
> 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4490,9 +4490,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode,
> poly_int64 offset)
>
> /* Return true if OFFSET is a signed 9-bit value. */
>
> -static inline bool
> -offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
> - poly_int64 offset)
> +bool
> +aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
> + poly_int64 offset)
> {
> HOST_WIDE_INT const_offset;
> return (offset.is_constant (&const_offset)
> @@ -5767,7 +5767,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> instruction memory accesses. */
> if (mode == TImode || mode == TFmode)
> return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
> - && (offset_9bit_signed_unscaled_p (mode, offset)
> + && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
> || offset_12bit_unsigned_scaled_p (mode, offset)));
>
> /* A 7bit offset check because OImode will emit a ldp/stp
> @@ -5781,7 +5781,8 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> ldr/str instructions (only big endian will get here). */
> if (mode == CImode)
> return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
> - && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
> + && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
> + offset + 32)
> || offset_12bit_unsigned_scaled_p (V16QImode,
> offset + 32)));
>
> @@ -5821,7 +5822,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> || known_eq (GET_MODE_SIZE (mode), 16))
> && aarch64_offset_7bit_signed_scaled_p (mode, offset));
> else
> - return (offset_9bit_signed_unscaled_p (mode, offset)
> + return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
> || offset_12bit_unsigned_scaled_p (mode, offset));
> }
>
> @@ -5874,7 +5875,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> */
> if (mode == TImode || mode == TFmode)
> return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
> - && offset_9bit_signed_unscaled_p (mode, offset));
> + && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
>
> if (load_store_pair_p)
> return ((known_eq (GET_MODE_SIZE (mode), 4)
> @@ -5882,7 +5883,7 @@ aarch64_classify_address (struct aarch64_address_info
> *info,
> || known_eq (GET_MODE_SIZE (mode), 16))
> && aarch64_offset_7bit_signed_scaled_p (mode, offset));
> else
> - return offset_9bit_signed_unscaled_p (mode, offset);
> + return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
> }
> return false;
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index
> 955769a64d2030839cdb337321a808626188190e..7158bf0f2efdfb00763af13ac29c54a6723f19fa
> 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -263,7 +263,7 @@
> ;; alternative). This attribute is used to compute attribute "enabled", use
> type
> ;; "any" to enable an alternative in all cases.
>
> -(define_enum "arches" [ any fp simd sve fp16])
> +(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
>
> (define_enum_attr "arch" "arches" (const_string "any"))
>
> @@ -285,6 +285,9 @@
> (ior
> (eq_attr "arch" "any")
>
> + (and (eq_attr "arch" "rcpc8_4")
> + (match_test "AARCH64_ISA_RCPC8_4"))
> +
> (and (eq_attr "arch" "fp")
> (match_test "TARGET_FLOAT"))
>
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index
> 36c06756a1f94cadae097b3aad654fbeba1cf2f3..73078e412d01a43c05195f01488b95a2bc7a20ec
> 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -481,9 +481,9 @@
> )
>
> (define_insn "atomic_store<mode>"
> - [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
> + [(set (match_operand:ALLI 0 "aarch64_9bit_offset_memory_operand" "=Q,Ust")
> (unspec_volatile:ALLI
> - [(match_operand:ALLI 1 "general_operand" "rZ")
> + [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
> (match_operand:SI 2 "const_int_operand")] ;; model
> UNSPECV_STL))]
> ""
> @@ -491,9 +491,12 @@
> enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
> if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire
> (model))
> return "str<atomic_sfx>\t%<w>1, %0";
> - else
> + else if (which_alternative == 0)
> return "stlr<atomic_sfx>\t%<w>1, %0";
> + else
> + return "stlur<atomic_sfx>\t%<w>1, %0";
> }
> + [(set_attr "arch" "*,rcpc8_4")]
> )
>
> (define_insn "@aarch64_load_exclusive<mode>"
> diff --git a/gcc/config/aarch64/constraints.md
> b/gcc/config/aarch64/constraints.md
> index
> 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..809b35e5fd377a8c6245138e0639c3afc41c7c13
> 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -218,6 +218,11 @@
> (and (match_code "mem")
> (match_test "REG_P (XEXP (op, 0))")))
>
> +(define_memory_constraint "Ust"
> + "@internal
> + A memory address with 9bit unscaled offset."
> + (match_operand 0 "aarch64_9bit_offset_memory_operand"))
> +
> (define_memory_constraint "Ump"
> "@internal
> A memory address suitable for a load/store pair operation."
> diff --git a/gcc/config/aarch64/predicates.md
> b/gcc/config/aarch64/predicates.md
> index
> d8f377b9603e76a29dd92f95e9905121eaf7b800..8016344f0e79bf881bfbe37547f115d094a66d0a
> 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -359,6 +359,31 @@
> (and (match_operand 0 "memory_operand")
> (match_code "reg" "0")))
>
> +(define_predicate "aarch64_9bit_offset_memory_operand"
> + (and (match_operand 0 "memory_operand")
> + (ior (match_code "reg" "0")
> + (and (match_code "plus" "0")
> + (match_code "reg" "00")
> + (match_code "const_int" "01"))))
> +{
> + rtx mem_op = XEXP (op, 0);
> +
> + if (REG_P (mem_op))
> + return GET_MODE (mem_op) == DImode;
> +
> + rtx plus_op0 = XEXP (mem_op, 0);
> + rtx plus_op1 = XEXP (mem_op, 1);
> +
> + if (GET_MODE (plus_op0) != DImode)
> + return false;
> +
> + poly_int64 offset;
> + if (!poly_int_rtx_p (plus_op1, &offset))
> + gcc_unreachable ();
> +
> + return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
> +})
> +
> ;; Predicates for parallel expanders based on mode.
> (define_special_predicate "vect_par_cnst_hi_half"
> (match_code "parallel")
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..8cabc05b0d739dbfdcecf681348b62634fcfc9a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
> @@ -0,0 +1,75 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv8.4-a -O2" } */
> +
> +#include <stdatomic.h>
> +
> +typedef __INT8_TYPE__ int8_t;
> +typedef __INT16_TYPE__ int16_t;
> +typedef __INT32_TYPE__ int32_t;
> +typedef __INT64_TYPE__ int64_t;
> +
> +#define STORE_TESTS(size) \
> + void \
> + foo##size (int##size##_t *atomic_vals) \
> +{ \
> + atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
> + atomic_store_explicit (atomic_vals, 2, memory_order_release); \
> + atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
> + atomic_store ((atomic_vals + 2), 2); \
> + atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
> +}
> +
> +STORE_TESTS (8);
> +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> { target { ! ilp32 } } } } */
> +/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2
> { target { ilp32 } } } } */
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+,
> 1\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+,
> 2\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]"
> 1 } } */
> +
> +STORE_TESTS (16);
> +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+,
> 2\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+,
> 4\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]"
> 1 } } */
> +
> +STORE_TESTS (32);
> +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 }
> } */
> +/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+,
> 4\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+,
> 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]"
> 1 } } */
> +
> +STORE_TESTS (64);
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 }
> } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+,
> 8\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+,
> 16\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]"
> 1 } } */
> +
> +void
> +foo_toolarge_offset (int64_t *atomic_vals)
> +{
> + /* 9bit signed unscaled immediate =>
> + largest representable value +255.
> + smallest representable value -256. */
> + atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
> + atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
> +}
> +
> +void
> +foo_negative (int8_t *atomic_vals)
> +{
> + atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
> +}
> +/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+,
> -2\\\]" 1 { target { ! ilp32 } } } } */
> +
> +#pragma GCC target ("arch=armv8.3-a")
> +void
> +foo_older_arch (int64_t *atomic_vals)
> +{
> + atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
> +}
> +
> +/* Four matches expected: one from foo64 above, two from
> +   foo_toolarge_offset, and one from foo_older_arch.  */
> +/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4
> } } */
>