Not my specialist subject, but here goes anyway:

Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> ping
>
> From: Wilco Dijkstra
> Sent: 02 June 2023 18:28
> To: GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <richard.sandif...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]
>
>
> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
> existing binaries, gives better performance than locking atomics and is what
> most users expect.
Please add a justification for why it's backwards compatible, rather than
just stating that it's so.

> Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
> This results in an implicit store which is invisible to software as long as the given
> address is writeable (which will be true when using atomics in actual code).

Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
suggests that it's still an open question whether this is a correct thing to
do, but from Joseph's comment it sounds like he isn't sure whether atomic
loads from read-only data are valid.  Linus's comment in
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490 suggests that a reasonable
compromise might be to use a storing implementation but not advertise that it
is lock-free.
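To make the read-only case concrete, here is a minimal sketch (mine, not
something from the patch, and the names are made up):

  /* Hypothetical example of the situation PR95722 worries about: a const
     object with a constant initialiser may be placed in .rodata.  */
  #include <stdint.h>

  static const __int128 magic = 42;

  __int128
  read_magic (void)
  {
    /* Without LSE2 this becomes a call to libatomic's __atomic_load_16.
       If that is implemented with an LDXP/STXP loop, the load stores back
       the value it has just read, and so faults on a read-only mapping
       even though the source contains no store.  */
    return __atomic_load_n (&magic, __ATOMIC_RELAXED);
  }

The open question above is essentially whether code like this is required
to work.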
Also, the comment above libat_is_lock_free says:

  /* Note that this can return that a size/alignment is not lock-free even if
     all the operations that we use to implement the respective accesses provide
     lock-free forward progress as specified in C++14:  Users likely expect
     "lock-free" to also mean "fast", which is why we do not return true if, for
     example, we implement loads with this size/alignment using a CAS.  */

We don't use a CAS for the fallbacks, but like you say, we do use a
load/store exclusive loop.  So did you consider not doing this:

> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16 1

?
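For reference, what a user actually observes is something like the following
sketch (again mine, not from the patch); as far as I can see,
FAST_ATOMIC_LDST_16 is part of what determines the answer:

  /* Hypothetical user code asking whether 16-byte atomics are lock-free.  */
  #include <stdatomic.h>
  #include <stdbool.h>

  bool
  int128_is_lock_free (void)
  {
    _Atomic __int128 dummy;
    /* When the compiler cannot answer this statically, it is resolved at
       run time by libatomic (libat_is_lock_free).  The comment quoted above
       is about what this should report when loads are implemented with an
       LDXP/STXP loop: lock-free in the forward-progress sense, but arguably
       not "fast".  */
    return atomic_is_lock_free (&dummy);
  }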
Otherwise it looks reasonable to me, for whatever that's worth, but:

> A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.
>
> Passes regress, OK for commit?
>
> libatomic/
>         PR target/110061
>         config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
>         config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
>         State we have lock-free atomics.
>
> ---
>
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
> index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
> --- a/libatomic/config/linux/aarch64/atomic_16.S
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -22,6 +22,21 @@
>     <http://www.gnu.org/licenses/>.  */
>
>
> +/* AArch64 128-bit lock-free atomic implementation.
> +
> +   128-bit atomics are now lock-free for all AArch64 architecture versions.
> +   This is backwards compatible with existing binaries and gives better
> +   performance than locking atomics.
> +
> +   128-bit atomic loads use a exclusive loop if LSE2 is not supported.
> +   This results in an implicit store which is invisible to software as long
> +   as the given address is writeable.  Since all other atomics have explicit
> +   writes, this will be true when using atomics in actual code.
> +
> +   The libat_<op>_16 entry points are ARMv8.0.
> +   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
> +
> +
>  .arch armv8-a+lse
>
>  #define ENTRY(name)     \
> @@ -37,6 +52,10 @@ name:     \
>          .cfi_endproc;   \
>          .size name, .-name;
>
> +#define ALIAS(alias,name)       \
> +        .global alias;          \
> +        .set alias, name;
> +
>  #define res0 x0
>  #define res1 x1
>  #define in0  x2
> @@ -70,6 +89,24 @@ name:     \
>  #define SEQ_CST 5
>
>
> +ENTRY (libat_load_16)
> +        mov     x5, x0
> +        cbnz    w1, 2f
> +
> +        /* RELAXED.  */
> +1:      ldxp    res0, res1, [x5]
> +        stxp    w4, res0, res1, [x5]
> +        cbnz    w4, 1b
> +        ret
> +
> +        /* ACQUIRE/CONSUME/SEQ_CST.  */
> +2:      ldaxp   res0, res1, [x5]
> +        stxp    w4, res0, res1, [x5]
> +        cbnz    w4, 2b
> +        ret
> +END (libat_load_16)
> +
> +
>  ENTRY (libat_load_16_i1)
>          cbnz    w1, 1f
>
> @@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
>  END (libat_load_16_i1)
>
>
> +ENTRY (libat_store_16)
> +        cbnz    w4, 2f
> +
> +        /* RELAXED.  */
> +1:      ldxp    xzr, tmp0, [x0]
> +        stxp    w4, in0, in1, [x0]
> +        cbnz    w4, 1b
> +        ret
> +
> +        /* RELEASE/SEQ_CST.  */
> +2:      ldxp    xzr, tmp0, [x0]
> +        stlxp   w4, in0, in1, [x0]
> +        cbnz    w4, 2b
> +        ret
> +END (libat_store_16)
> +
> +
>  ENTRY (libat_store_16_i1)
>          cbnz    w4, 1f
>
> @@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
>          ret
>
>          /* RELEASE/SEQ_CST.  */
> -1:      ldaxp   xzr, tmp0, [x0]
> +1:      ldxp    xzr, tmp0, [x0]
>          stlxp   w4, in0, in1, [x0]
>          cbnz    w4, 1b
>          ret
>  END (libat_store_16_i1)
>
>
> -ENTRY (libat_exchange_16_i1)
> +ENTRY (libat_exchange_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
>          stxp    w4, in0, in1, [x5]
>          cbnz    w4, 3b
>          ret
> -4:
> -        cmp     w4, RELEASE
> -        b.ne    6f
>
> -        /* RELEASE.  */
> -5:      ldxp    res0, res1, [x5]
> +        /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:      ldaxp   res0, res1, [x5]
>          stlxp   w4, in0, in1, [x5]
> -        cbnz    w4, 5b
> +        cbnz    w4, 4b
>          ret
> +END (libat_exchange_16)

Please explain (here and in the commit message) why you're adding acquire
semantics to the RELEASE case.
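To restate the question with a hypothetical caller (the function is made up;
only the mapping to libat_exchange_16 comes from the patch):

  #include <stdint.h>

  __int128
  publish (__int128 *slot, __int128 value)
  {
    /* A release exchange: the C/C++ model only requires release semantics
       for the store side; the read of the old value need not be an acquire.
       On baseline ARMv8.0 this now lands in libat_exchange_16 above, whose
       RELEASE case is implemented with LDAXP+STLXP and therefore gets
       acquire semantics as well.  */
    return __atomic_exchange_n (slot, value, __ATOMIC_RELEASE);
  }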
Thanks,
Richard

>
> -        /* ACQ_REL/SEQ_CST.  */
> -6:      ldaxp   res0, res1, [x5]
> -        stlxp   w4, in0, in1, [x5]
> -        cbnz    w4, 6b
> +
> +ENTRY (libat_compare_exchange_16)
> +        ldp     exp0, exp1, [x1]
> +        cbz     w4, 3f
> +        cmp     w4, RELEASE
> +        b.hs    4f
> +
> +        /* ACQUIRE/CONSUME.  */
> +1:      ldaxp   tmp0, tmp1, [x0]
> +        cmp     tmp0, exp0
> +        ccmp    tmp1, exp1, 0, eq
> +        bne     2f
> +        stxp    w4, in0, in1, [x0]
> +        cbnz    w4, 1b
> +        mov     x0, 1
>          ret
> -END (libat_exchange_16_i1)
> +
> +2:      stp     tmp0, tmp1, [x1]
> +        mov     x0, 0
> +        ret
> +
> +        /* RELAXED.  */
> +3:      ldxp    tmp0, tmp1, [x0]
> +        cmp     tmp0, exp0
> +        ccmp    tmp1, exp1, 0, eq
> +        bne     2b
> +        stxp    w4, in0, in1, [x0]
> +        cbnz    w4, 3b
> +        mov     x0, 1
> +        ret
> +
> +        /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:      ldaxp   tmp0, tmp1, [x0]
> +        cmp     tmp0, exp0
> +        ccmp    tmp1, exp1, 0, eq
> +        bne     2b
> +        stlxp   w4, in0, in1, [x0]
> +        cbnz    w4, 4b
> +        mov     x0, 1
> +        ret
> +END (libat_compare_exchange_16)
>
>
>  ENTRY (libat_compare_exchange_16_i1)
> @@ -180,7 +267,7 @@ ENTRY (libat_compare_exchange_16_i1)
>  END (libat_compare_exchange_16_i1)
>
>
> -ENTRY (libat_fetch_add_16_i1)
> +ENTRY (libat_fetch_add_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -199,10 +286,10 @@ ENTRY (libat_fetch_add_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_add_16_i1)
> +END (libat_fetch_add_16)
>
>
> -ENTRY (libat_add_fetch_16_i1)
> +ENTRY (libat_add_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -221,10 +308,10 @@ ENTRY (libat_add_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_add_fetch_16_i1)
> +END (libat_add_fetch_16)
>
>
> -ENTRY (libat_fetch_sub_16_i1)
> +ENTRY (libat_fetch_sub_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -243,10 +330,10 @@ ENTRY (libat_fetch_sub_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_sub_16_i1)
> +END (libat_fetch_sub_16)
>
>
> -ENTRY (libat_sub_fetch_16_i1)
> +ENTRY (libat_sub_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -265,10 +352,10 @@ ENTRY (libat_sub_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_sub_fetch_16_i1)
> +END (libat_sub_fetch_16)
>
>
> -ENTRY (libat_fetch_or_16_i1)
> +ENTRY (libat_fetch_or_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -287,10 +374,10 @@ ENTRY (libat_fetch_or_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_or_16_i1)
> +END (libat_fetch_or_16)
>
>
> -ENTRY (libat_or_fetch_16_i1)
> +ENTRY (libat_or_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -309,10 +396,10 @@ ENTRY (libat_or_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_or_fetch_16_i1)
> +END (libat_or_fetch_16)
>
>
> -ENTRY (libat_fetch_and_16_i1)
> +ENTRY (libat_fetch_and_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -331,10 +418,10 @@ ENTRY (libat_fetch_and_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_and_16_i1)
> +END (libat_fetch_and_16)
>
>
> -ENTRY (libat_and_fetch_16_i1)
> +ENTRY (libat_and_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -353,10 +440,10 @@ ENTRY (libat_and_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_and_fetch_16_i1)
> +END (libat_and_fetch_16)
>
>
> -ENTRY (libat_fetch_xor_16_i1)
> +ENTRY (libat_fetch_xor_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -375,10 +462,10 @@ ENTRY (libat_fetch_xor_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_xor_16_i1)
> +END (libat_fetch_xor_16)
>
>
> -ENTRY (libat_xor_fetch_16_i1)
> +ENTRY (libat_xor_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -397,10 +484,10 @@ ENTRY (libat_xor_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_xor_fetch_16_i1)
> +END (libat_xor_fetch_16)
>
>
> -ENTRY (libat_fetch_nand_16_i1)
> +ENTRY (libat_fetch_nand_16)
>          mov     x5, x0
>          mvn     in0, in0
>          mvn     in1, in1
> @@ -421,10 +508,10 @@ ENTRY (libat_fetch_nand_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_nand_16_i1)
> +END (libat_fetch_nand_16)
>
>
> -ENTRY (libat_nand_fetch_16_i1)
> +ENTRY (libat_nand_fetch_16)
>          mov     x5, x0
>          mvn     in0, in0
>          mvn     in1, in1
> @@ -445,21 +532,38 @@ ENTRY (libat_nand_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_nand_fetch_16_i1)
> +END (libat_nand_fetch_16)
>
>
> -ENTRY (libat_test_and_set_16_i1)
> -        mov     w2, 1
> -        cbnz    w1, 2f
> -
> -        /* RELAXED.  */
> -        swpb    w0, w2, [x0]
> -        ret
> +/* __atomic_test_and_set is always inlined, so this entry is unused and
> +   only required for completeness.  */
> +ENTRY (libat_test_and_set_16)
>
> -        /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> -2:      swpalb  w0, w2, [x0]
> +        /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> +        mov     x5, x0
> +1:      ldaxrb  w0, [x5]
> +        stlxrb  w4, w2, [x5]
> +        cbnz    w4, 1b
>          ret
> -END (libat_test_and_set_16_i1)
> +END (libat_test_and_set_16)
> +
> +
> +/* Alias entry points which are the same in baseline and LSE2.  */
> +
> +ALIAS (libat_exchange_16_i1, libat_exchange_16)
> +ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
> +ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
> +ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
> +ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
> +ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
> +ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
> +ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
> +ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
> +ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
> +ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
> +ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
> +ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
> +ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
>
>
>  /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
> diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
> index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -35,10 +35,19 @@
>  #endif
>  #define IFUNC_NCOND(N)  (1)
>
> -#if N == 16 && IFUNC_ALT != 0
> +#endif  /* HAVE_IFUNC */
> +
> +/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
> +#if N == 16
>  # define DONE 1
>  #endif
>
> -#endif  /* HAVE_IFUNC */
> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16 1
> +#undef MAYBE_HAVE_ATOMIC_CAS_16
> +#define MAYBE_HAVE_ATOMIC_CAS_16 1
> +#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
> +#define MAYBE_HAVE_ATOMIC_EXCHANGE_16 1
>
>  #include_next <host-config.h>
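For completeness, here is a sketch of the kind of user code the patch affects
(hypothetical names; the behavioural claim is just the patch description
restated, as I understand the status quo):

  #include <stdbool.h>
  #include <stdint.h>

  /* A 16-byte value updated atomically, e.g. a pointer plus a generation
     counter.  Aligned to 16 bytes so the LDXP/STXP sequences can be used.  */
  struct slot { uint64_t ptr; uint64_t gen; } __attribute__ ((aligned (16)));

  bool
  try_update (struct slot *s, struct slot *expected, struct slot desired)
  {
    /* Previously this took libatomic's locking fallback on baseline ARMv8.0;
       with the patch it resolves to libat_compare_exchange_16 (or the _i1
       variant when LSE2 is available) and is lock-free.  */
    return __atomic_compare_exchange (s, expected, &desired, false,
                                      __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
  }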