On 9/23/20 9:28 AM, Richard Biener wrote:
> On Tue, 22 Sep 2020, Tom de Vries wrote:
>
>> [ was: Re: [Patch] [middle-end & nvptx] gcc/tracer.c: Don't split BB
>> with SIMT LANE [PR95654] ]
>>
>> On 9/16/20 8:20 PM, Alexander Monakov wrote:
>>>
>>>
>>> On Wed, 16 Sep 2020, Tom de Vries wrote:
>>>
>>>> [ cc-ing author omp support for nvptx. ]
>>>
>>> The issue looks familiar. I recognized it back in 2017 (and LLVM people
>>> recognized it too for their GPU targets). In an attempt to get agreement
>>> to fix the issue "properly" for GCC I found a similar issue that affects
>>> all targets, not just offloading, and filed it as PR 80053.
>>>
>>> (yes, there are no addressable labels involved in offloading, but
>>> nevertheless the nature of the middle-end issue is related)
>>
>> Hi Alexander,
>>
>> thanks for looking into this.
>>
>> Seeing that the attempt to fix things properly is stalled, for now I'm
>> proposing a point-fix, similar to the original patch proposed by Tobias.
>>
>> Richi, Jakub, OK for trunk?
>
> I notice that we call ignore_bb_p many times in tracer.c but one call
> is conveniently early in tail_duplicate (void):
>
>       int n = count_insns (bb);
>       if (!ignore_bb_p (bb))
>         blocks[bb->index] = heap.insert (-bb->count.to_frequency (cfun),
>                                          bb);
>
> where count_insns already walks all stmts in the block.  It would be
> nice to avoid repeatedly walking all stmts, maybe adjusting the above
> call is enough and/or count_insns can compute this and/or the ignore_bb_p
> result can be cached (optimize_bb_for_size_p might change though,
> but maybe all other ignore_bb_p calls effectively just are that,
> checks for blocks that became optimize_bb_for_size_p).
>
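Ack.  The direction I read in this is: walk the stmts of each block once,
recording both the insn count and a "can this block be duplicated" verdict,
and have ignore_bb_p consult the cached verdict, re-checking only
optimize_bb_for_size_p since that one can still change.  As a toy sketch of
that caching shape (plain C, outside of GCC internals; every name below is
made up for illustration, this is not the patch itself):

/* Toy model only: a per-index verdict computed once, cached in byte
   arrays that double in size on demand, and read back on later queries.
   None of this is GCC code; the names are invented for this sketch.  */

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned char *verdict;   /* cached result per block index  */
static unsigned char *computed;  /* whether the result is valid    */
static size_t cache_size;

static void
cache_set (size_t index, bool val)
{
  if (index >= cache_size)
    {
      /* Grow both arrays, zeroing the new tail.  */
      size_t new_size = cache_size * 2;
      while (new_size <= index)
	new_size *= 2;
      verdict = realloc (verdict, new_size);
      computed = realloc (computed, new_size);
      memset (verdict + cache_size, 0, new_size - cache_size);
      memset (computed + cache_size, 0, new_size - cache_size);
      cache_size = new_size;
    }
  verdict[index] = val;
  computed[index] = true;
}

/* Stand-in for the expensive walk over all stmts of a block.  */
static bool
compute_verdict (size_t index)
{
  printf ("analyzing block %zu\n", index);
  return index % 3 != 0;
}

static bool
cached_verdict (size_t index)
{
  if (index < cache_size && computed[index])
    return verdict[index];
  bool val = compute_verdict (index);
  cache_set (index, val);
  return val;
}

int
main (void)
{
  cache_size = 4;
  verdict = calloc (cache_size, 1);
  computed = calloc (cache_size, 1);

  /* Each block is analyzed only once, however often it is queried;
     querying past the initial size grows the cache.  */
  for (int pass = 0; pass < 2; pass++)
    for (size_t bb = 0; bb < 6; bb++)
      cached_verdict (bb);

  free (verdict);
  free (computed);
  return 0;
}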
This untested follow-up patch tries something in that direction.

Is this what you meant?

Thanks,
- Tom
try

---
 gcc/tracer.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 94 insertions(+), 18 deletions(-)

diff --git a/gcc/tracer.c b/gcc/tracer.c
index de80416f163..63bfbf1e286 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -53,7 +53,6 @@
 #include "fibonacci_heap.h"
 #include "tracer.h"
 
-static int count_insns (basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
@@ -84,58 +83,127 @@ bb_seen_p (basic_block bb)
   return bitmap_bit_p (bb_seen, bb->index);
 }
 
-/* Return true if we should ignore the basic block for purposes of tracing.  */
-bool
-ignore_bb_p (const_basic_block bb)
+static bool
+can_duplicate_bb_p (const_basic_block bb)
 {
   if (bb->index < NUM_FIXED_BLOCKS)
-    return true;
-  if (optimize_bb_for_size_p (bb))
-    return true;
+    return false;
 
   if (gimple *g = last_stmt (CONST_CAST_BB (bb)))
     {
       /* A transaction is a single entry multiple exit region.  It
	 must be duplicated in its entirety or not at all.  */
       if (gimple_code (g) == GIMPLE_TRANSACTION)
-	return true;
+	return false;
 
       /* An IFN_UNIQUE call must be duplicated as part of its group,
	 or not at all.  */
       if (is_gimple_call (g)
	  && gimple_call_internal_p (g)
	  && gimple_call_internal_unique_p (g))
-	return true;
+	return false;
+    }
+
+  return true;
+}
+
+static bool
+can_duplicate_insn_p (gimple *g)
+{
+  if (is_gimple_call (g)
+      && (gimple_call_internal_p (g, IFN_GOMP_SIMT_ENTER_ALLOC)
+	  || gimple_call_internal_p (g, IFN_GOMP_SIMT_EXIT)))
+    return false;
+
+  return true;
+}
+
+static sbitmap can_duplicate_bb;
+static sbitmap can_duplicate_bb_computed;
+
+static inline void
+cache_can_duplicate_bb_p (const_basic_block bb, bool val)
+{
+  unsigned int size = SBITMAP_SIZE (can_duplicate_bb);
+
+  if ((unsigned int)bb->index >= size)
+    {
+      can_duplicate_bb = sbitmap_resize (can_duplicate_bb, size * 2, 0);
+      can_duplicate_bb_computed
+	= sbitmap_resize (can_duplicate_bb_computed, size * 2, 0);
     }
 
+  if (val)
+    bitmap_set_bit (can_duplicate_bb, bb->index);
+  bitmap_set_bit (can_duplicate_bb_computed, bb->index);
+}
+
+static bool
+compute_can_duplicate_bb_p (const_basic_block bb)
+{
+  if (!can_duplicate_bb_p (bb))
+    return false;
+
   for (gimple_stmt_iterator gsi = gsi_start_bb (CONST_CAST_BB (bb));
        !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gimple *g = gsi_stmt (gsi);
-      if (is_gimple_call (g)
-	  && (gimple_call_internal_p (g, IFN_GOMP_SIMT_ENTER_ALLOC)
-	      || gimple_call_internal_p (g, IFN_GOMP_SIMT_EXIT)))
-	return true;
+      if (!can_duplicate_insn_p (g))
+	return false;
+    }
+
+  return true;
+}
+
+static bool
+cached_can_duplicate_bb_p (const_basic_block bb)
+{
+  if (can_duplicate_bb)
+    {
+      unsigned int size = SBITMAP_SIZE (can_duplicate_bb);
+      /* Assume added bb's should be ignored.  */
+      if ((unsigned int)bb->index < size
+	  && bitmap_bit_p (can_duplicate_bb_computed, bb->index))
+	return bitmap_bit_p (can_duplicate_bb, bb->index);
     }
 
-  return false;
+  bool val = compute_can_duplicate_bb_p (bb);
+  if (can_duplicate_bb)
+    cache_can_duplicate_bb_p (bb, val);
+
+  return val;
+}
+
+/* Return true if we should ignore the basic block for purposes of tracing.  */
+bool
+ignore_bb_p (const_basic_block bb)
+{
+  if (optimize_bb_for_size_p (bb))
+    return true;
+
+  return !cached_can_duplicate_bb_p (bb);
 }
 
 /* Return number of instructions in the block.  */
-static int
-count_insns (basic_block bb)
+static void
+analyze_bb (basic_block bb, int *count)
 {
   gimple_stmt_iterator gsi;
   gimple *stmt;
   int n = 0;
+  bool can_duplicate = can_duplicate_bb_p (bb);
 
   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       stmt = gsi_stmt (gsi);
       n += estimate_num_insns (stmt, &eni_size_weights);
+      if (can_duplicate)
+	can_duplicate = can_duplicate_insn_p (stmt);
     }
-  return n;
+
+  *count = n;
+  cache_can_duplicate_bb_p (bb, can_duplicate);
 }
 
 /* Return true if E1 is more frequent than E2.  */
@@ -283,6 +351,10 @@ tail_duplicate (void)
      resize it.  */
   bb_seen = sbitmap_alloc (last_basic_block_for_fn (cfun) * 2);
   bitmap_clear (bb_seen);
+  can_duplicate_bb = sbitmap_alloc (last_basic_block_for_fn (cfun));
+  bitmap_clear (can_duplicate_bb);
+  can_duplicate_bb_computed = sbitmap_alloc (last_basic_block_for_fn (cfun));
+  bitmap_clear (can_duplicate_bb_computed);
   initialize_original_copy_tables ();
 
   if (profile_info && profile_status_for_fn (cfun) == PROFILE_READ)
@@ -296,7 +368,8 @@ tail_duplicate (void)
 
   FOR_EACH_BB_FN (bb, cfun)
     {
-      int n = count_insns (bb);
+      int n;
+      analyze_bb (bb, &n);
       if (!ignore_bb_p (bb))
	blocks[bb->index] = heap.insert (-bb->count.to_frequency (cfun),
					 bb);
@@ -386,6 +459,9 @@ tail_duplicate (void)
 
   free_original_copy_tables ();
   sbitmap_free (bb_seen);
+  sbitmap_free (can_duplicate_bb);
+  sbitmap_free (can_duplicate_bb_computed);
+  can_duplicate_bb = NULL;
   free (trace);
   free (counts);
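
For reference, the shape of user code behind the SIMT checks is an offloaded
loop whose body contains a conditional that tracer may want to tail-duplicate.
A rough, illustrative sketch only -- this is not the PR95654 testcase, and the
actual pragmas and values there differ; it assumes -O2 -fopenmp with nvptx
offloading configured:

/* Illustrative only; not the PR95654 testcase.  */
#include <stdlib.h>

int
main (void)
{
  int sum = 0;

  /* Offloaded SIMD loop with a conditional in its body.  */
#pragma omp target map(tofrom: sum)
#pragma omp parallel for simd reduction(+: sum)
  for (int i = 0; i < 1024; i++)
    sum += (i & 1) ? 3 : 1;

  if (sum != 512 * 3 + 512 * 1)
    abort ();

  return 0;
}

With nvptx offloading, the loop body ends up bracketed by the
IFN_GOMP_SIMT_ENTER_ALLOC / IFN_GOMP_SIMT_EXIT markers that
can_duplicate_insn_p now refuses to duplicate.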