On 08/01/25 2:30 pm, jeevitha wrote:
> Hi All,
>
> The following patch has been bootstrapped and regtested on powerpc64le-linux.
>
> Changes to amo.h include the addition of the following load atomic operations:
> Compare and Swap Not Equal, Fetch and Increment Bounded, Fetch and Increment
> Equal, and Fetch and Decrement Bounded. Additionally, Store Twin is added for
> store atomic operations.
>
> 2024-01-08 Peter Bergner <berg...@linux.ibm.com>
>
> gcc/:
> * config/rs6000/amo.h: Add missing atomic memory operations.
> * doc/extend.texi (PowerPC Atomic Memory Operation Functions):
> Document new functions.
>
> gcc/testsuite/:
> * gcc.target/powerpc/amo3.c: New test.
> * gcc.target/powerpc/amo4.c: Likewise.
> * gcc.target/powerpc/amo5.c: Likewise.
> * gcc.target/powerpc/amo6.c: Likewise.
> * gcc.target/powerpc/amo7.c: Likewise.
>
> Co-authored-by: Jeevitha Palanisamy <jeevi...@linux.ibm.com>
>
> diff --git a/gcc/config/rs6000/amo.h b/gcc/config/rs6000/amo.h
> index 25ab1c7b4c4..10960208d31 100644
> --- a/gcc/config/rs6000/amo.h
> +++ b/gcc/config/rs6000/amo.h
> @@ -71,6 +71,64 @@ NAME (TYPE *_PTR, TYPE _VALUE)
> \
> return _RET;
> \
> }
>
> +/* Implementation of the LWAT/LDAT operations that take two input registers
> + and modify one word or double-word of memory and return the value that was
> + previously in the memory location. The destination and two source
> + registers are encoded with only one register number, so we need three
> + consecutive GPR registers and there is no C/C++ type that will give
> + us that, so we have to use register asm variables to achieve that.
> +
> + The LWAT/LDAT opcode requires the address to be a single register,
> + and that points to a suitably aligned memory location. Asm volatile
> + is used to prevent the optimizer from moving the operation. */
> +
> +#define _AMO_LD_CMPSWP(NAME, TYPE, OPCODE, FC)
> \
> +static __inline__ TYPE
> \
> +NAME (TYPE *_PTR, TYPE _COND, TYPE _VALUE) \
> +{ \
> + register TYPE _ret asm ("r8"); \
> + register TYPE _cond asm ("r9") = _COND; \
> + register TYPE _value asm ("r10") = _VALUE; \
> + __asm__ __volatile__ (OPCODE " %[ret],%P[addr],%[code]" \
> + : [addr] "+Q" (_PTR[0]), [ret] "=r" (_ret) \
> + : "r" (_cond), "r" (_value), [code] "n" (FC)); \
> + return _ret;
> \
> +}
> +
> +/* Implementation of the LWAT/LDAT fetch and increment operations.
> +
> + The LWAT/LDAT opcode requires the address to be a single register that
> + points to a suitably aligned memory location. Asm volatile is used to
> + prevent the optimizer from moving the operation. */
> +
> +#define _AMO_LD_INCREMENT(NAME, TYPE, OPCODE, FC) \
> +static __inline__ TYPE
> \
> +NAME (TYPE *_PTR) \
> +{ \
> + TYPE _RET; \
> + __asm__ volatile (OPCODE " %[ret],%P[addr],%[code]\n"
> \
> + : [addr] "+Q" (_PTR[0]), [ret] "=r" (_RET) \
> + : "Q" (*(TYPE (*)[2]) _PTR), [code] "n" (FC)); \
> + return _RET;
> \
> +}
> +
> +/* Implementation of the LWAT/LDAT fetch and decrement operations.
> +
> + The LWAT/LDAT opcode requires the address to be a single register that
> + points to a suitably aligned memory location. Asm volatile is used to
> + prevent the optimizer from moving the operation. */
> +
> +#define _AMO_LD_DECREMENT(NAME, TYPE, OPCODE, FC) \
> +static __inline__ TYPE
> \
> +NAME (TYPE *_PTR) \
> +{ \
> + TYPE _RET; \
> + __asm__ volatile (OPCODE " %[ret],%P[addr],%[code]\n"
> \
> + : [addr] "+Q" (_PTR[1]), [ret] "=r" (_RET) \
> + : "Q" (*(TYPE (*)[2]) (_PTR)), [code] "n" (FC)); \
> + return _RET;
> \
> +}
> +
> _AMO_LD_SIMPLE (amo_lwat_add, uint32_t, "lwat", _AMO_LD_ADD)
> _AMO_LD_SIMPLE (amo_lwat_xor, uint32_t, "lwat", _AMO_LD_XOR)
> _AMO_LD_SIMPLE (amo_lwat_ior, uint32_t, "lwat", _AMO_LD_IOR)
> @@ -78,11 +136,19 @@ _AMO_LD_SIMPLE (amo_lwat_and, uint32_t, "lwat",
> _AMO_LD_AND)
> _AMO_LD_SIMPLE (amo_lwat_umax, uint32_t, "lwat", _AMO_LD_UMAX)
> _AMO_LD_SIMPLE (amo_lwat_umin, uint32_t, "lwat", _AMO_LD_UMIN)
> _AMO_LD_SIMPLE (amo_lwat_swap, uint32_t, "lwat", _AMO_LD_SWAP)
> +_AMO_LD_CMPSWP (amo_lwat_cas_neq, uint32_t, "lwat", _AMO_LD_CS_NE)
> +_AMO_LD_INCREMENT (amo_lwat_inc_eq, uint32_t, "lwat", _AMO_LD_INC_EQUAL)
> +_AMO_LD_INCREMENT (amo_lwat_inc_bounded, uint32_t, "lwat",
> _AMO_LD_INC_BOUNDED)
> +_AMO_LD_DECREMENT (amo_lwat_dec_bounded, uint32_t, "lwat",
> _AMO_LD_DEC_BOUNDED)
>
> _AMO_LD_SIMPLE (amo_lwat_sadd, int32_t, "lwat", _AMO_LD_ADD)
> _AMO_LD_SIMPLE (amo_lwat_smax, int32_t, "lwat", _AMO_LD_SMAX)
> _AMO_LD_SIMPLE (amo_lwat_smin, int32_t, "lwat", _AMO_LD_SMIN)
> _AMO_LD_SIMPLE (amo_lwat_sswap, int32_t, "lwat", _AMO_LD_SWAP)
> +_AMO_LD_CMPSWP (amo_lwat_scas_neq, int32_t, "lwat", _AMO_LD_CS_NE)
> +_AMO_LD_INCREMENT (amo_lwat_sinc_eq, int32_t, "lwat", _AMO_LD_INC_EQUAL)
> +_AMO_LD_INCREMENT (amo_lwat_sinc_bounded, int32_t, "lwat",
> _AMO_LD_INC_BOUNDED)
> +_AMO_LD_DECREMENT (amo_lwat_sdec_bounded, int32_t, "lwat",
> _AMO_LD_DEC_BOUNDED)
>
> _AMO_LD_SIMPLE (amo_ldat_add, uint64_t, "ldat", _AMO_LD_ADD)
> _AMO_LD_SIMPLE (amo_ldat_xor, uint64_t, "ldat", _AMO_LD_XOR)
> @@ -91,12 +157,19 @@ _AMO_LD_SIMPLE (amo_ldat_and, uint64_t, "ldat",
> _AMO_LD_AND)
> _AMO_LD_SIMPLE (amo_ldat_umax, uint64_t, "ldat", _AMO_LD_UMAX)
> _AMO_LD_SIMPLE (amo_ldat_umin, uint64_t, "ldat", _AMO_LD_UMIN)
> _AMO_LD_SIMPLE (amo_ldat_swap, uint64_t, "ldat", _AMO_LD_SWAP)
> +_AMO_LD_CMPSWP (amo_ldat_cas_neq, uint64_t, "ldat", _AMO_LD_CS_NE)
> +_AMO_LD_INCREMENT (amo_ldat_inc_eq, uint64_t, "ldat", _AMO_LD_INC_EQUAL)
> +_AMO_LD_INCREMENT (amo_ldat_inc_bounded, uint64_t, "ldat",
> _AMO_LD_INC_BOUNDED)
> +_AMO_LD_DECREMENT (amo_ldat_dec_bounded, uint64_t, "ldat",
> _AMO_LD_DEC_BOUNDED)
>
> _AMO_LD_SIMPLE (amo_ldat_sadd, int64_t, "ldat", _AMO_LD_ADD)
> _AMO_LD_SIMPLE (amo_ldat_smax, int64_t, "ldat", _AMO_LD_SMAX)
> _AMO_LD_SIMPLE (amo_ldat_smin, int64_t, "ldat", _AMO_LD_SMIN)
> _AMO_LD_SIMPLE (amo_ldat_sswap, int64_t, "ldat", _AMO_LD_SWAP)
> -
> +_AMO_LD_CMPSWP (amo_ldat_scas_neq, int64_t, "ldat", _AMO_LD_CS_NE)
> +_AMO_LD_INCREMENT (amo_ldat_sinc_eq, int64_t, "ldat", _AMO_LD_INC_EQUAL)
> +_AMO_LD_INCREMENT (amo_ldat_sinc_bounded, int64_t, "ldat",
> _AMO_LD_INC_BOUNDED)
> +_AMO_LD_DECREMENT (amo_ldat_sdec_bounded, int64_t, "ldat",
> _AMO_LD_DEC_BOUNDED)
> /* Enumeration of the STWAT/STDAT sub-opcodes. */
> enum _AMO_ST {
> _AMO_ST_ADD = 0x00, /* Store Add. */
> @@ -127,16 +200,35 @@ NAME (TYPE *_PTR, TYPE _VALUE)
> \
> return; \
> }
>
> +/* Implementation of the STWAT/STDAT store twin operation that takes
> + one register and modifies two words or double-words of memory.
> + No value is returned.
> +
> + The STWAT/STDAT opcode requires the address to be a single register
> + that points to a suitably aligned memory location. Asm volatile is
> + used to prevent the optimizer from moving the operation. */
> +
> +#define _AMO_ST_TWIN(NAME, TYPE, OPCODE, FC) \
> +static __inline__ void
> \
> +NAME (TYPE *_PTR, TYPE _VALUE)
> \
> +{ \
> + __asm__ volatile (OPCODE " %[src],%P[addr],%[code]"
> \
> + : [addr] "+Q" (*(TYPE (*)[2]) _PTR) \
> + : [src] "r" (_VALUE), [code] "n" (FC)); \
> +}
> +
> _AMO_ST_SIMPLE (amo_stwat_add, uint32_t, "stwat", _AMO_ST_ADD)
> _AMO_ST_SIMPLE (amo_stwat_xor, uint32_t, "stwat", _AMO_ST_XOR)
> _AMO_ST_SIMPLE (amo_stwat_ior, uint32_t, "stwat", _AMO_ST_IOR)
> _AMO_ST_SIMPLE (amo_stwat_and, uint32_t, "stwat", _AMO_ST_AND)
> _AMO_ST_SIMPLE (amo_stwat_umax, uint32_t, "stwat", _AMO_ST_UMAX)
> _AMO_ST_SIMPLE (amo_stwat_umin, uint32_t, "stwat", _AMO_ST_UMIN)
> +_AMO_ST_TWIN (amo_stwat_twin, uint32_t, "stwat", _AMO_ST_TWIN)
>
> _AMO_ST_SIMPLE (amo_stwat_sadd, int32_t, "stwat", _AMO_ST_ADD)
> _AMO_ST_SIMPLE (amo_stwat_smax, int32_t, "stwat", _AMO_ST_SMAX)
> _AMO_ST_SIMPLE (amo_stwat_smin, int32_t, "stwat", _AMO_ST_SMIN)
> +_AMO_ST_TWIN (amo_stwat_stwin, int32_t, "stwat", _AMO_ST_TWIN)
>
> _AMO_ST_SIMPLE (amo_stdat_add, uint64_t, "stdat", _AMO_ST_ADD)
> _AMO_ST_SIMPLE (amo_stdat_xor, uint64_t, "stdat", _AMO_ST_XOR)
> @@ -144,9 +236,11 @@ _AMO_ST_SIMPLE (amo_stdat_ior, uint64_t, "stdat",
> _AMO_ST_IOR)
> _AMO_ST_SIMPLE (amo_stdat_and, uint64_t, "stdat", _AMO_ST_AND)
> _AMO_ST_SIMPLE (amo_stdat_umax, uint64_t, "stdat", _AMO_ST_UMAX)
> _AMO_ST_SIMPLE (amo_stdat_umin, uint64_t, "stdat", _AMO_ST_UMIN)
> +_AMO_ST_TWIN (amo_stdat_twin, uint64_t, "stdat", _AMO_ST_TWIN)
>
> _AMO_ST_SIMPLE (amo_stdat_sadd, int64_t, "stdat", _AMO_ST_ADD)
> _AMO_ST_SIMPLE (amo_stdat_smax, int64_t, "stdat", _AMO_ST_SMAX)
> _AMO_ST_SIMPLE (amo_stdat_smin, int64_t, "stdat", _AMO_ST_SMIN)
> +_AMO_ST_TWIN (amo_stdat_stwin, int64_t, "stdat", _AMO_ST_TWIN)
> #endif /* _ARCH_PWR9 && _ARCH_PPC64. */
> #endif /* _POWERPC_AMO_H. */
> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> index 1e1b4cc837d..4c2db70880d 100644
> --- a/gcc/doc/extend.texi
> +++ b/gcc/doc/extend.texi
> @@ -24707,11 +24707,19 @@ uint32_t amo_lwat_and (uint32_t *, uint32_t);
> uint32_t amo_lwat_umax (uint32_t *, uint32_t);
> uint32_t amo_lwat_umin (uint32_t *, uint32_t);
> uint32_t amo_lwat_swap (uint32_t *, uint32_t);
> +uint32_t amo_lwat_cas_neq (uint32_t *, uint32_t, uint32_t);
> +uint32_t amo_lwat_inc_eq (uint32_t *);
> +uint32_t amo_lwat_inc_bounded (uint32_t *);
> +uint32_t amo_lwat_dec_bounded (uint32_t *);
>
> int32_t amo_lwat_sadd (int32_t *, int32_t);
> int32_t amo_lwat_smax (int32_t *, int32_t);
> int32_t amo_lwat_smin (int32_t *, int32_t);
> int32_t amo_lwat_sswap (int32_t *, int32_t);
> +int32_t amo_lwat_scas_neq (int32_t *, int32_t, int32_t);
> +int32_t amo_lwat_sinc_eq (int32_t *);
> +int32_t amo_lwat_sinc_bounded (int32_t *);
> +int32_t amo_lwat_sdec_bounded (int32_t *);
>
> uint64_t amo_ldat_add (uint64_t *, uint64_t);
> uint64_t amo_ldat_xor (uint64_t *, uint64_t);
> @@ -24720,11 +24728,19 @@ uint64_t amo_ldat_and (uint64_t *, uint64_t);
> uint64_t amo_ldat_umax (uint64_t *, uint64_t);
> uint64_t amo_ldat_umin (uint64_t *, uint64_t);
> uint64_t amo_ldat_swap (uint64_t *, uint64_t);
> +uint64_t amo_ldat_cas_neq (uint64_t *, uint64_t, uint64_t);
> +uint64_t amo_ldat_inc_eq (uint64_t *);
> +uint64_t amo_ldat_inc_bounded (uint64_t *);
> +uint64_t amo_ldat_dec_bounded (uint64_t *);
>
> int64_t amo_ldat_sadd (int64_t *, int64_t);
> int64_t amo_ldat_smax (int64_t *, int64_t);
> int64_t amo_ldat_smin (int64_t *, int64_t);
> int64_t amo_ldat_sswap (int64_t *, int64_t);
> +int64_t amo_ldat_scas_neq (int64_t *, int64_t, int64_t);
> +int64_t amo_ldat_sinc_eq (int64_t *);
> +int64_t amo_ldat_sinc_bounded (int64_t *);
> +int64_t amo_ldat_sdec_bounded (int64_t *);
>
> void amo_stwat_add (uint32_t *, uint32_t);
> void amo_stwat_xor (uint32_t *, uint32_t);
> @@ -24732,10 +24748,12 @@ void amo_stwat_ior (uint32_t *, uint32_t);
> void amo_stwat_and (uint32_t *, uint32_t);
> void amo_stwat_umax (uint32_t *, uint32_t);
> void amo_stwat_umin (uint32_t *, uint32_t);
> +void amo_stwat_twin (uint32_t *, uint32_t);
>
> void amo_stwat_sadd (int32_t *, int32_t);
> void amo_stwat_smax (int32_t *, int32_t);
> void amo_stwat_smin (int32_t *, int32_t);
> +void amo_stwat_stwin (int32_t *, int32_t);
>
> void amo_stdat_add (uint64_t *, uint64_t);
> void amo_stdat_xor (uint64_t *, uint64_t);
> @@ -24743,10 +24761,12 @@ void amo_stdat_ior (uint64_t *, uint64_t);
> void amo_stdat_and (uint64_t *, uint64_t);
> void amo_stdat_umax (uint64_t *, uint64_t);
> void amo_stdat_umin (uint64_t *, uint64_t);
> +void amo_stdat_twin (uint64_t *, uint64_t);
>
> void amo_stdat_sadd (int64_t *, int64_t);
> void amo_stdat_smax (int64_t *, int64_t);
> void amo_stdat_smin (int64_t *, int64_t);
> +void amo_stdat_stwin (int64_t *, int64_t);
> @end smallexample
>
> @node PowerPC Matrix-Multiply Assist Built-in Functions
> diff --git a/gcc/testsuite/gcc.target/powerpc/amo3.c
> b/gcc/testsuite/gcc.target/powerpc/amo3.c
> new file mode 100644
> index 00000000000..27fb962fdec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/amo3.c
> @@ -0,0 +1,131 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
> +/* { dg-require-effective-target powerpc_vsx } */
> +
> +/* Verify P9 atomic memory operations. */
The comments in the other tests mention the ISA level; perhaps we should do the
same here for consistency, i.e. mention the ISA level instead of P9.
> +
> +#include <amo.h>
> +#include <stdint.h>
> +
> +uint32_t
> +do_lw_cs_ne (uint32_t *mem, uint32_t cond, uint32_t value)
> +{
> + return amo_lwat_cas_neq (mem, cond, value);
> +}
> +
> +int32_t
> +do_lw_scs_ne (int32_t *mem, int32_t cond, int32_t value)
> +{
> + return amo_lwat_scas_neq (mem, cond, value);
> +}
> +
> +uint32_t
> +do_lw_inc_equal (uint32_t *mem)
> +{
> + return amo_lwat_inc_eq (mem);
> +}
> +
> +int32_t
> +do_lw_sinc_equal (int32_t *mem)
> +{
> + return amo_lwat_sinc_eq (mem);
> +}
> +
> +uint32_t
> +do_lw_inc_bounded (uint32_t *mem)
> +{
> + return amo_lwat_inc_bounded (mem);
> +}
> +
> +int32_t
> +do_lw_sinc_bounded (int32_t *mem)
> +{
> + return amo_lwat_sinc_bounded (mem);
> +}
> +uint32_t
> +do_lw_dec_bounded (uint32_t *mem)
> +{
> + return amo_lwat_dec_bounded (mem);
> +}
> +
> +int32_t
> +do_lw_sdec_bounded (int32_t *mem)
> +{
> + return amo_lwat_sdec_bounded (mem);
> +}
> +
> +uint64_t
> +do_ld_cs_ne (uint64_t *mem, uint64_t cond, uint64_t value)
> +{
> + return amo_ldat_cas_neq (mem, cond, value);
> +}
> +
> +int64_t
> +do_ld_scs_ne (int64_t *mem, int64_t cond, int64_t value)
> +{
> + return amo_ldat_scas_neq (mem, cond, value);
> +}
> +
> +uint64_t
> +do_ld_inc_equal (uint64_t *mem)
> +{
> + return amo_ldat_inc_eq (mem);
> +}
> +
> +int64_t
> +do_ld_sinc_equal (int64_t *mem)
> +{
> + return amo_ldat_sinc_eq (mem);
> +}
> +
> +uint64_t
> +do_ld_inc_bounded (uint64_t *mem)
> +{
> + return amo_ldat_inc_bounded (mem);
> +}
> +
> +int64_t
> +do_ld_sinc_bounded (int64_t *mem)
> +{
> + return amo_ldat_sinc_bounded (mem);
> +}
> +uint64_t
> +do_ld_dec_bounded (uint64_t *mem)
> +{
> + return amo_ldat_dec_bounded (mem);
> +}
> +
> +int64_t
> +do_ld_sdec_bounded (int64_t *mem)
> +{
> + return amo_ldat_sdec_bounded (mem);
> +}
> +
> +void
> +do_sw_twin (uint32_t *mem, uint32_t value)
> +{
> + amo_stwat_twin (mem, value);
> +}
> +
> +void
> +do_sw_stwin (int32_t *mem, int32_t value)
> +{
> + amo_stwat_stwin (mem, value);
> +}
> +
> +void
> +do_sd_twin (uint64_t *mem, uint64_t value)
> +{
> + amo_stdat_twin (mem, value);
> +}
> +
> +void
> +do_sd_stwin (int64_t *mem, int64_t value)
> +{
> + amo_stdat_stwin (mem, value);
> +}
> +
> +/* { dg-final { scan-assembler-times {\mldat\M} 8 } } */
> +/* { dg-final { scan-assembler-times {\mlwat\M} 8 } } */
> +/* { dg-final { scan-assembler-times {\mstdat\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstwat\M} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/amo4.c
> b/gcc/testsuite/gcc.target/powerpc/amo4.c
> new file mode 100644
> index 00000000000..f354f2caac4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/amo4.c
> @@ -0,0 +1,92 @@
> +/* { dg-do run { target { lp64 && p9vector_hw } } } */
> +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
> +/* { dg-require-effective-target powerpc_vsx } */
> +
> +#include <amo.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <limits.h>
> +
> +/* Test whether the ISA 3.1 amo (atomic memory operations) functions perform
> as
> + expected. */
It should be "ISA 3.0" and not "ISA 3.1". Ditto for other tests.
Regards,
Surya