There are targets that only offer 32-bit atomic operations (for
example 32-bit RISC-V). For these targets, split the 64-bit atomic
bitwise-or operation into two parts.
For this test case
int a(int i);
int b(int i);
int f(int i)
{
if (i) {
return a(i);
} else {
return b(i);
}
}
with options
-O2 -fprofile-update=atomic -fcondition-coverage
the code generated for 64-bit vs. 32-bit RISC-V looks like:
addi a5,a5,%lo(.LANCHOR0)
beq a0,zero,.L2
li a4,1
- amoor.d zero,a4,0(a5)
- addi a5,a5,8
- amoor.d zero,zero,0(a5)
+ amoor.w zero,a4,0(a5)
+ addi a4,a5,4
+ amoor.w zero,zero,0(a4)
+ addi a4,a5,8
+ amoor.w zero,zero,0(a4)
+ addi a5,a5,12
+ amoor.w zero,zero,0(a5)
tail a
.L2:
- amoor.d zero,zero,0(a5)
+ amoor.w zero,zero,0(a5)
+ addi a4,a5,4
+ amoor.w zero,zero,0(a4)
li a4,1
- addi a5,a5,8
- amoor.d zero,a4,0(a5)
+ addi a3,a5,8
+ amoor.w zero,a4,0(a3)
+ addi a5,a5,12
+ amoor.w zero,zero,0(a5)
tail b
Not related to this patch: even with -O2 the compiler generates
no-op atomic operations such as
amoor.d zero,zero,0(a5)
and
amoor.w zero,zero,0(a5)
Would it be possible to filter these out in instrument_decisions()?
gcc/ChangeLog:
* tree-profile.cc (split_update_decision_counter): New.
(instrument_decisions): Use counter_update to determine which
atomic operations are available. Use
split_update_decision_counter() if 64-bit atomic operations can
be split up into two 32-bit atomic operations.
---
gcc/tree-profile.cc | 73 +++++++++++++++++++++++++++++++++++++++++----
1 file changed, 67 insertions(+), 6 deletions(-)
diff --git a/gcc/tree-profile.cc b/gcc/tree-profile.cc
index fe20e84838d..0ac4e826beb 100644
--- a/gcc/tree-profile.cc
+++ b/gcc/tree-profile.cc
@@ -1006,6 +1006,57 @@ resolve_counters (vec<counters>& cands)
}
+/* At edge E, update the decision counter referenced by REF with the
+ COUNTER. Generate two separate 32-bit atomic bitwise-or operations
+ specified by ATOMIC_IOR_32 in the RELAXED memory order. */
+static void
+split_update_decision_counter (edge e, tree ref, tree counter, tree
+ atomic_ior_32, tree relaxed)
+{
+ gimple_stmt_iterator gsi = gsi_last (PENDING_STMT (e));
+ ref = unshare_expr (ref);
+
+ /* Get the low and high address of the referenced counter */
+ tree addr_low = build_addr (ref);
+ tree addr_high = make_temp_ssa_name (TREE_TYPE (addr_low), NULL,
+ "PROF_decision");
+ tree four = build_int_cst (size_type_node, 4);
+ gassign *assign1 = gimple_build_assign (addr_high, POINTER_PLUS_EXPR,
+ addr_low, four);
+ gsi_insert_after (&gsi, assign1, GSI_NEW_STMT);
+ if (WORDS_BIG_ENDIAN)
+ std::swap (addr_low, addr_high);
+
+ /* Get the low 32-bit of the counter */
+ tree counter_low_32 = make_temp_ssa_name (uint32_type_node, NULL,
+ "PROF_decision");
+ gassign *assign2 = gimple_build_assign (counter_low_32, NOP_EXPR, counter);
+ gsi_insert_after (&gsi, assign2, GSI_NEW_STMT);
+
+ /* Get the high 32-bit of the counter */
+ tree shift_32 = build_int_cst (integer_type_node, 32);
+ tree counter_high_64 = make_temp_ssa_name (gcov_type_node, NULL,
+ "PROF_decision");
+ gassign *assign3 = gimple_build_assign (counter_high_64, LSHIFT_EXPR,
+ counter, shift_32);
+ gsi_insert_after (&gsi, assign3, GSI_NEW_STMT);
+ tree counter_high_32 = make_temp_ssa_name (uint32_type_node, NULL,
+ "PROF_decision");
+ gassign *assign4 = gimple_build_assign (counter_high_32, NOP_EXPR,
+ counter_high_64);
+ gsi_insert_after (&gsi, assign4, GSI_NEW_STMT);
+
+ /* Atomically bitwise-or the low 32-bit counter parts */
+ gcall *call1 = gimple_build_call (atomic_ior_32, 3, addr_low,
+ counter_low_32, relaxed);
+ gsi_insert_after (&gsi, call1, GSI_NEW_STMT);
+
+ /* Atomically bitwise-or the high 32-bit counter parts */
+ gcall *call2 = gimple_build_call (atomic_ior_32, 3, addr_high,
+ counter_high_32, relaxed);
+ gsi_insert_after (&gsi, call2, GSI_NEW_STMT);
+}
+
/* Add instrumentation to a decision subgraph. EXPR should be the
(topologically sorted) block of nodes returned by cov_blocks, MAPS the
bitmaps returned by cov_maps, and MASKS the block of bitsets returned by
@@ -1108,11 +1159,16 @@ instrument_decisions (array_slice<basic_block> expr,
size_t condno,
gcc_assert (xi == bitmap_count_bits (core));
const tree relaxed = build_int_cst (integer_type_node, MEMMODEL_RELAXED);
- const bool atomic = flag_profile_update == PROFILE_UPDATE_ATOMIC;
- const tree atomic_ior = builtin_decl_explicit
- (TYPE_PRECISION (gcov_type_node) > 32
- ? BUILT_IN_ATOMIC_FETCH_OR_8
- : BUILT_IN_ATOMIC_FETCH_OR_4);
+ const bool use_atomic_builtin =
+ counter_update == COUNTER_UPDATE_ATOMIC_BUILTIN;
+ const bool use_atomic_split =
+ counter_update == COUNTER_UPDATE_ATOMIC_SPLIT ||
+ counter_update == COUNTER_UPDATE_ATOMIC_PARTIAL;
+ const tree atomic_ior_32 =
+ builtin_decl_explicit (BUILT_IN_ATOMIC_FETCH_OR_4);
+ const tree atomic_ior = TYPE_PRECISION (gcov_type_node) > 32 ?
+ builtin_decl_explicit (BUILT_IN_ATOMIC_FETCH_OR_8) :
+ atomic_ior_32;
/* Flush to the gcov accumulators. */
for (const basic_block b : expr)
@@ -1149,7 +1205,7 @@ instrument_decisions (array_slice<basic_block> expr,
size_t condno,
{
tree ref = tree_coverage_counter_ref (GCOV_COUNTER_CONDS,
2*condno + k);
- if (atomic)
+ if (use_atomic_builtin)
{
ref = unshare_expr (ref);
gcall *flush = gimple_build_call (atomic_ior, 3,
@@ -1157,6 +1213,11 @@ instrument_decisions (array_slice<basic_block> expr,
size_t condno,
next[k], relaxed);
gsi_insert_on_edge (e, flush);
}
+ else if (use_atomic_split)
+ {
+ split_update_decision_counter (e, ref, next[k],
+ atomic_ior_32, relaxed);
+ }
else
{
tree get = emit_assign (e, ref);
--
2.51.0