This is a patch implementing builtins for an atomic exchange with full,
acquire, and release memory barrier semantics. It is similar to
__sync_lock_test_and_set(), but the target does not have the option of
implementing reduced functionality that only stores the value 1.
Also, unlike __sync_lock_test_and_set(), we have all three memory
barrier variants.
The compiler will fall back to a full barrier if the user requests an
acquire/release and it is not available in the target. Also, if no
variant is available, we will fall back to a compare and swap loop with
a full barrier at the end.
The real reason for this patch is to implement atomic stores in the C++
runtime library, which can currently incorrectly move prior stores past
an atomic store, thus invalidating the happens-before promise for the
sequentially consistent model. I am attaching the corresponding patch
to libstdc++ to show how I intend to use the builtin. This is not an
official submission for the C++ library bits, as I have not yet fully
tested the library. I will do so separately.
In a followup patch I will be implementing acq/rel/full variants for all
the __sync_* builtins which we can use for the atomic loads and for some
of the OpenMP atomics Jakub has been working on.
Oh yeah, I would gladly accept patterns/patches for other architectures :).
Tested on x86-64 Linux.
OK for mainline?
* c-family/c-common.c (resolve_overloaded_builtin): Add
BUILT_IN_LOCK_TEST_AND_SET_*_N variants.
* doc/extend.texi: Document __sync_lock_test_and_set_* variants.
* libgcc-std.ver: Add __sync_swap_*.
* optabs.h: Add DOI_sync_swap*.
Define sync_swap*_optab.
* optabs.c (expand_sync_swap): New.
* genopinit.c: Add sync_swap_{acq,rel,full}.
* config/i386/sync.md ("sync_lock_test_and_set_full<mode>"): New.
* config/i386/i386.md: Add UNSPECV_SWAP_FULL.
* tree.h (enum membar_mode): Same.
* builtins.c (expand_builtin_swap): New.
(expand_builtin): Add cases for BUILT_IN_SWAP_*.
* sync-builtins.def (BUILT_IN_SWAP_*): New.
* expr.h (expand_sync_swap): Protoize.
(expand_builtin_synchronize): Same.
Index: doc/extend.texi
===================================================================
--- doc/extend.texi (revision 173831)
+++ doc/extend.texi (working copy)
@@ -6719,6 +6719,22 @@ speculated to) before the builtin, but p
be globally visible yet, and previous memory loads may not yet be
satisfied.
+@item @var{type} __sync_swap_full (@var{type} *ptr, @var{type} value, ...)
+@itemx @var{type} __sync_swap_acq (@var{type} *ptr, @var{type} value, ...)
+@itemx @var{type} __sync_swap_rel (@var{type} *ptr, @var{type} value, ...)
+@findex __sync_swap_full
+@findex __sync_swap_acq
+@findex __sync_swap_rel
+These builtins implement an atomic exchange operation. They write
+@var{value} into @code{*@var{ptr}}, and return the previous contents
+of @code{*@var{ptr}}. The different variants provide a full barrier,
+acquire barrier, or a release barrier respectively depending on the
+suffix.
+
+If the acquire or release variants of these operations are not
+available on the given target, the compiler will fall back to a full
+barrier.
+
@item void __sync_lock_release (@var{type} *ptr, ...)
@findex __sync_lock_release
This builtin releases the lock acquired by @code{__sync_lock_test_and_set}.
Index: c-family/c-common.c
===================================================================
--- c-family/c-common.c (revision 173831)
+++ c-family/c-common.c (working copy)
@@ -9035,6 +9035,9 @@ resolve_overloaded_builtin (location_t l
case BUILT_IN_VAL_COMPARE_AND_SWAP_N:
case BUILT_IN_LOCK_TEST_AND_SET_N:
case BUILT_IN_LOCK_RELEASE_N:
+ case BUILT_IN_SWAP_FULL_N:
+ case BUILT_IN_SWAP_ACQ_N:
+ case BUILT_IN_SWAP_REL_N:
{
int n = sync_resolve_size (function, params);
tree new_function, first_param, result;
Index: optabs.c
===================================================================
--- optabs.c (revision 173831)
+++ optabs.c (working copy)
@@ -6988,6 +6988,70 @@ expand_sync_lock_test_and_set (rtx mem,
return NULL_RTX;
}
+
+/* This function expands an atomic exchange operation: atomically store
+ VAL in MEM and return the previous value in MEM.
+
+ TARGET is an optional place to stick the return value.
+ MBMODE is the memory barrier type to use for the operation. */
+
+rtx
+expand_sync_swap (rtx mem, rtx val, rtx target, enum membar_mode mbmode)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ enum insn_code icode;
+ direct_optab op;
+
+ switch (mbmode)
+ {
+ case MEMBAR_MODE_ACQUIRE:
+ op = sync_swap_acq_optab;
+ break;
+ case MEMBAR_MODE_RELEASE:
+ op = sync_swap_rel_optab;
+ break;
+ case MEMBAR_MODE_FULL:
+ op = sync_swap_full_optab;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ /* If no variant is found, try the full barrier. */
+ if (direct_optab_handler (op, mode) == CODE_FOR_nothing)
+ op = sync_swap_full_optab;
+
+ /* If the target supports the swap directly, great. */
+ icode = direct_optab_handler (op, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ struct expand_operand ops[3];
+
+ create_output_operand (&ops[0], target, mode);
+ create_fixed_operand (&ops[1], mem);
+ /* VAL may have been promoted to a wider mode. Shrink it if so. */
+ create_convert_operand_to (&ops[2], val, mode, true);
+ if (maybe_expand_insn (icode, 3, ops))
+ return ops[0].value;
+ }
+
+ /* Otherwise, use a compare-and-swap loop for the exchange. */
+ if (direct_optab_handler (sync_compare_and_swap_optab, mode)
+ != CODE_FOR_nothing)
+ {
+ if (!target || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+ if (GET_MODE (val) != VOIDmode && GET_MODE (val) != mode)
+ val = convert_modes (mode, GET_MODE (val), val, 1);
+ if (expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
+ {
+ /* Issue a full barrier. */
+ expand_builtin_synchronize ();
+ return target;
+ }
+ }
+
+ return NULL_RTX;
+}
/* Return true if OPERAND is suitable for operand number OPNO of
instruction ICODE. */
Index: optabs.h
===================================================================
--- optabs.h (revision 173831)
+++ optabs.h (working copy)
@@ -669,9 +669,19 @@ enum direct_optab_index
/* Atomic compare and swap. */
DOI_sync_compare_and_swap,
- /* Atomic exchange with acquire semantics. */
+ /* Atomic exchange with acquire semantics. Exchange not fully
+ guaranteed. Some targets may only support a store of 1. */
DOI_sync_lock_test_and_set,
+ /* Atomic exchange with acquire semantics. */
+ DOI_sync_swap_acq,
+
+ /* Atomic exchange with release semantics. */
+ DOI_sync_swap_rel,
+
+ /* Atomic exchange with full barrier semantics. */
+ DOI_sync_swap_full,
+
/* Atomic clear with release semantics. */
DOI_sync_lock_release,
@@ -720,6 +730,12 @@ typedef struct direct_optab_d *direct_op
(&direct_optab_table[(int) DOI_sync_compare_and_swap])
#define sync_lock_test_and_set_optab \
(&direct_optab_table[(int) DOI_sync_lock_test_and_set])
+#define sync_swap_acq_optab \
+ (&direct_optab_table[(int) DOI_sync_swap_acq])
+#define sync_swap_rel_optab \
+ (&direct_optab_table[(int) DOI_sync_swap_rel])
+#define sync_swap_full_optab \
+ (&direct_optab_table[(int) DOI_sync_swap_full])
#define sync_lock_release_optab \
(&direct_optab_table[(int) DOI_sync_lock_release])
Index: genopinit.c
===================================================================
--- genopinit.c (revision 173831)
+++ genopinit.c (working copy)
@@ -239,6 +239,9 @@ static const char * const optabs[] =
"set_direct_optab_handler (sync_new_nand_optab, $A,
CODE_FOR_$(sync_new_nand$I$a$))",
"set_direct_optab_handler (sync_compare_and_swap_optab, $A,
CODE_FOR_$(sync_compare_and_swap$I$a$))",
"set_direct_optab_handler (sync_lock_test_and_set_optab, $A,
CODE_FOR_$(sync_lock_test_and_set$I$a$))",
+ "set_direct_optab_handler (sync_swap_acq_optab, $A,
CODE_FOR_$(sync_swap_acq$I$a$))",
+ "set_direct_optab_handler (sync_swap_rel_optab, $A,
CODE_FOR_$(sync_swap_rel$I$a$))",
+ "set_direct_optab_handler (sync_swap_full_optab, $A,
CODE_FOR_$(sync_swap_full$I$a$))",
"set_direct_optab_handler (sync_lock_release_optab, $A,
CODE_FOR_$(sync_lock_release$I$a$))",
"set_optab_handler (vec_set_optab, $A, CODE_FOR_$(vec_set$a$))",
"set_optab_handler (vec_extract_optab, $A, CODE_FOR_$(vec_extract$a$))",
Index: builtins.c
===================================================================
--- builtins.c (revision 173831)
+++ builtins.c (working copy)
@@ -5682,9 +5682,35 @@ expand_builtin_lock_test_and_set (enum m
return expand_sync_lock_test_and_set (mem, val, target);
}
+/* Expand the __sync_swap_* intrinsics.
+
+ EXP is the CALL_EXPR.
+ TARGET is an optional place for us to store the results.
+ MBMODE is the memory barrier mode to use. */
+
+static rtx
+expand_builtin_swap (enum machine_mode mode, tree exp, rtx target,
+ enum membar_mode mbmode)
+{
+ rtx val, mem;
+ enum machine_mode old_mode;
+
+ /* Expand the operands. */
+ mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
+ val = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, mode, EXPAND_NORMAL);
+ /* If VAL is promoted to a wider mode, convert it back to MODE. Take care
+ of CONST_INTs, where we know the old_mode only from the call argument. */
+ old_mode = GET_MODE (val);
+ if (old_mode == VOIDmode)
+ old_mode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp, 1)));
+ val = convert_modes (mode, old_mode, val, 1);
+
+ return expand_sync_swap (mem, val, target, mbmode);
+}
+
/* Expand the __sync_synchronize intrinsic. */
-static void
+void
expand_builtin_synchronize (void)
{
gimple x;
@@ -6495,6 +6521,39 @@ expand_builtin (tree exp, rtx target, rt
return target;
break;
+ case BUILT_IN_SWAP_ACQ_1:
+ case BUILT_IN_SWAP_ACQ_2:
+ case BUILT_IN_SWAP_ACQ_4:
+ case BUILT_IN_SWAP_ACQ_8:
+ case BUILT_IN_SWAP_ACQ_16:
+ mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_ACQ_1);
+ target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_ACQUIRE);
+ if (target)
+ return target;
+ break;
+
+ case BUILT_IN_SWAP_REL_1:
+ case BUILT_IN_SWAP_REL_2:
+ case BUILT_IN_SWAP_REL_4:
+ case BUILT_IN_SWAP_REL_8:
+ case BUILT_IN_SWAP_REL_16:
+ mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_REL_1);
+ target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_RELEASE);
+ if (target)
+ return target;
+ break;
+
+ case BUILT_IN_SWAP_FULL_1:
+ case BUILT_IN_SWAP_FULL_2:
+ case BUILT_IN_SWAP_FULL_4:
+ case BUILT_IN_SWAP_FULL_8:
+ case BUILT_IN_SWAP_FULL_16:
+ mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_FULL_1);
+ target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_FULL);
+ if (target)
+ return target;
+ break;
+
case BUILT_IN_LOCK_TEST_AND_SET_1:
case BUILT_IN_LOCK_TEST_AND_SET_2:
case BUILT_IN_LOCK_TEST_AND_SET_4:
Index: sync-builtins.def
===================================================================
--- sync-builtins.def (revision 173831)
+++ sync-builtins.def (working copy)
@@ -235,6 +235,63 @@ DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND
DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND_SET_16, "__sync_lock_test_and_set_16",
BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_N,
+ "__sync_swap_acq",
+ BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_1,
+ "__sync_swap_acq_1",
+ BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_2,
+ "__sync_swap_acq_2",
+ BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_4,
+ "__sync_swap_acq_4",
+ BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_8,
+ "__sync_swap_acq_8",
+ BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_16,
+ "__sync_swap_acq_16",
+ BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_N,
+ "__sync_swap_rel",
+ BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_1,
+ "__sync_swap_rel_1",
+ BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_2,
+ "__sync_swap_rel_2",
+ BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_4,
+ "__sync_swap_rel_4",
+ BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_8,
+ "__sync_swap_rel_8",
+ BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_16,
+ "__sync_swap_rel_16",
+ BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_N,
+ "__sync_swap_full",
+ BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_1,
+ "__sync_swap_full_1",
+ BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_2,
+ "__sync_swap_full_2",
+ BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_4,
+ "__sync_swap_full_4",
+ BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_8,
+ "__sync_swap_full_8",
+ BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_16,
+ "__sync_swap_full_16",
+ BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_N, "__sync_lock_release",
BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_1, "__sync_lock_release_1",
Index: expr.h
===================================================================
--- expr.h (revision 173831)
+++ expr.h (working copy)
@@ -161,6 +161,14 @@ enum optab_methods
OPTAB_MUST_WIDEN
};
+/* Memory barrier type. */
+enum membar_mode
+{
+ MEMBAR_MODE_RELEASE,
+ MEMBAR_MODE_ACQUIRE,
+ MEMBAR_MODE_FULL
+};
+
/* Generate code for a simple binary or unary operation. "Simple" in
this case means "can be unambiguously described by a (mode, code)
pair and mapped to a single optab." */
@@ -217,6 +225,7 @@ rtx expand_bool_compare_and_swap (rtx, r
rtx expand_sync_operation (rtx, rtx, enum rtx_code);
rtx expand_sync_fetch_operation (rtx, rtx, enum rtx_code, bool, rtx);
rtx expand_sync_lock_test_and_set (rtx, rtx, rtx);
+rtx expand_sync_swap (rtx, rtx, rtx, enum membar_mode);
/* Functions from expmed.c: */
@@ -248,6 +257,7 @@ extern void expand_builtin_setjmp_receiv
extern rtx expand_builtin_saveregs (void);
extern void expand_builtin_trap (void);
extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, enum machine_mode);
+extern void expand_builtin_synchronize (void);
/* Functions from expr.c: */
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 173831)
+++ config/i386/i386.md (working copy)
@@ -250,6 +250,7 @@
UNSPECV_MWAIT
UNSPECV_CMPXCHG
UNSPECV_XCHG
+ UNSPECV_SWAP_FULL
UNSPECV_LOCK
UNSPECV_PROLOGUE_USE
UNSPECV_CLD
Index: config/i386/sync.md
===================================================================
--- config/i386/sync.md (revision 173831)
+++ config/i386/sync.md (working copy)
@@ -232,6 +232,15 @@
return "lock{%;} add{<imodesuffix>}\t{%1, %0|%0, %1}";
})
+(define_insn "sync_swap_full<mode>"
+ [(set (match_operand:SWI 0 "register_operand" "=<r>")
+ (unspec_volatile:SWI
+ [(match_operand:SWI 1 "memory_operand" "+m")] UNSPECV_SWAP_FULL))
+ (set (match_dup 1)
+ (match_operand:SWI 2 "register_operand" "0"))]
+ ""
+ "xchg{<imodesuffix>}\t{%1, %0|%0, %1}")
+
;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space.
(define_insn "sync_lock_test_and_set<mode>"
[(set (match_operand:SWI 0 "register_operand" "=<r>")
Index: libgcc-std.ver
===================================================================
--- libgcc-std.ver (revision 173831)
+++ libgcc-std.ver (working copy)
@@ -1919,3 +1919,26 @@ GCC_4.6.0 {
__morestack_initial_sp
__splitstack_find
}
+
+%inherit GCC_4.7.0 GCC_4.6.0
+GCC_4.7.0 {
+ __sync_swap_acq_1
+ __sync_swap_rel_1
+ __sync_swap_full_1
+
+ __sync_swap_acq_2
+ __sync_swap_rel_2
+ __sync_swap_full_2
+
+ __sync_swap_acq_4
+ __sync_swap_rel_4
+ __sync_swap_full_4
+
+ __sync_swap_acq_8
+ __sync_swap_rel_8
+ __sync_swap_full_8
+
+ __sync_swap_acq_16
+ __sync_swap_rel_16
+ __sync_swap_full_16
+}
(__sso_string_base<>::_M_assign): Likewise.
* include/bits/atomic_2.h (_ITp<>::store): Use __sync_swap_full.
(_ITp<>::store volatile): Same.
(_PTp<>::store): Same.
(_PTp<>::store volatile): Same.
Index: include/bits/atomic_2.h
===================================================================
--- include/bits/atomic_2.h (revision 173831)
+++ include/bits/atomic_2.h (working copy)
@@ -249,14 +249,12 @@ namespace __atomic2
__glibcxx_assert(__m != memory_order_acq_rel);
__glibcxx_assert(__m != memory_order_consume);
- if (__m == memory_order_relaxed)
- _M_i = __i;
+ if (__m == memory_order_seq_cst)
+ (void)__sync_swap_full (&_M_i, __i);
else
{
// write_mem_barrier();
_M_i = __i;
- if (__m == memory_order_seq_cst)
- __sync_synchronize();
}
}
@@ -267,14 +265,12 @@ namespace __atomic2
__glibcxx_assert(__m != memory_order_acq_rel);
__glibcxx_assert(__m != memory_order_consume);
- if (__m == memory_order_relaxed)
- _M_i = __i;
+ if (__m == memory_order_seq_cst)
+ (void)__sync_swap_full (&_M_i, __i);
else
{
// write_mem_barrier();
_M_i = __i;
- if (__m == memory_order_seq_cst)
- __sync_synchronize();
}
}
@@ -540,14 +536,12 @@ namespace __atomic2
__glibcxx_assert(__m != memory_order_acq_rel);
__glibcxx_assert(__m != memory_order_consume);
- if (__m == memory_order_relaxed)
- _M_p = __p;
+ if (__m == memory_order_seq_cst)
+ (void)__sync_swap_full (&_M_p, __p);
else
{
// write_mem_barrier();
_M_p = __p;
- if (__m == memory_order_seq_cst)
- __sync_synchronize();
}
}
@@ -559,14 +553,12 @@ namespace __atomic2
__glibcxx_assert(__m != memory_order_acq_rel);
__glibcxx_assert(__m != memory_order_consume);
- if (__m == memory_order_relaxed)
- _M_p = __p;
+ if (__m == memory_order_seq_cst)
+ (void)__sync_swap_full (&_M_p, __p);
else
{
// write_mem_barrier();
_M_p = __p;
- if (__m == memory_order_seq_cst)
- __sync_synchronize();
}
}