On 9/23/20 9:28 AM, Richard Biener wrote:
> On Tue, 22 Sep 2020, Tom de Vries wrote:
> 
>> [ was: Re: [Patch] [middle-end & nvptx] gcc/tracer.c: Don't split BB
>> with SIMT LANE [PR95654] ]
>>
>> On 9/16/20 8:20 PM, Alexander Monakov wrote:
>>>
>>>
>>> On Wed, 16 Sep 2020, Tom de Vries wrote:
>>>
>>>> [ cc-ing the author of omp support for nvptx. ]
>>>
>>> The issue looks familiar. I recognized it back in 2017 (and LLVM people
>>> recognized it too for their GPU targets). In an attempt to get agreement
>>> to fix the issue "properly" for GCC I found a similar issue that affects
>>> all targets, not just offloading, and filed it as PR 80053.
>>>
>>> (yes, there are no addressable labels involved in offloading, but 
>>> nevertheless
>>> the nature of the middle-end issue is related)
>>
>> Hi Alexander,
>>
>> thanks for looking into this.
>>
>> Seeing that the attempt to fix things properly is stalled, for now I'm
>> proposing a point-fix, similar to the original patch proposed by Tobias.
>>
>> Richi, Jakub, OK for trunk?
> 
> I notice that we call ignore_bb_p many times in tracer.c but one call
> is conveniently early in tail_duplicate (void):
> 
>       int n = count_insns (bb);
>       if (!ignore_bb_p (bb))
>         blocks[bb->index] = heap.insert (-bb->count.to_frequency (cfun), bb);
> 
> where count_insns already walks all stmts in the block.  It would be
> nice to avoid repeatedly walking all stmts: maybe adjusting the above
> call is enough, or count_insns could compute this, or the ignore_bb_p
> result could be cached (optimize_bb_for_size_p might change, though,
> but maybe all the other ignore_bb_p calls are effectively just that:
> checks for blocks that became optimize_bb_for_size_p).
> 

This untested follow-up patch tries something in that direction.

Is this what you meant?

Thanks,
- Tom

---
 gcc/tracer.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 94 insertions(+), 18 deletions(-)

diff --git a/gcc/tracer.c b/gcc/tracer.c
index de80416f163..63bfbf1e286 100644
--- a/gcc/tracer.c
+++ b/gcc/tracer.c
@@ -53,7 +53,6 @@
 #include "fibonacci_heap.h"
 #include "tracer.h"
 
-static int count_insns (basic_block);
 static bool better_p (const_edge, const_edge);
 static edge find_best_successor (basic_block);
 static edge find_best_predecessor (basic_block);
@@ -84,58 +83,127 @@ bb_seen_p (basic_block bb)
   return bitmap_bit_p (bb_seen, bb->index);
 }
 
-/* Return true if we should ignore the basic block for purposes of tracing.  */
-bool
-ignore_bb_p (const_basic_block bb)
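+/* Return true if BB as a whole may be duplicated for tracing purposes;
+   return false for blocks that must be kept intact, such as those ending
+   a transaction or an IFN_UNIQUE group.  */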
+static bool
+can_duplicate_bb_p (const_basic_block bb)
 {
   if (bb->index < NUM_FIXED_BLOCKS)
-    return true;
-  if (optimize_bb_for_size_p (bb))
-    return true;
+    return false;
 
   if (gimple *g = last_stmt (CONST_CAST_BB (bb)))
     {
       /* A transaction is a single entry multiple exit region.  It
 	 must be duplicated in its entirety or not at all.  */
       if (gimple_code (g) == GIMPLE_TRANSACTION)
-	return true;
+	return false;
 
       /* An IFN_UNIQUE call must be duplicated as part of its group,
 	 or not at all.  */
       if (is_gimple_call (g)
 	  && gimple_call_internal_p (g)
 	  && gimple_call_internal_unique_p (g))
-	return true;
+	return false;
+    }
+
+  return true;
+}
+
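+/* Return true if statement G may be duplicated; SIMT region entry and
+   exit markers must stay unique.  */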
+static bool
+can_duplicate_insn_p (gimple *g)
+{
+  if (is_gimple_call (g)
+      && (gimple_call_internal_p (g, IFN_GOMP_SIMT_ENTER_ALLOC)
+	  || gimple_call_internal_p (g, IFN_GOMP_SIMT_EXIT)))
+    return false;
+
+  return true;
+}
+
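+/* Cache of can_duplicate results, indexed by bb->index: a set bit in
+   can_duplicate_bb_computed means the result for that block is known,
+   and can_duplicate_bb then holds that result.  */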
+static sbitmap can_duplicate_bb;
+static sbitmap can_duplicate_bb_computed;
+
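+/* Record VAL as the can_duplicate result for BB, growing the bitmaps if
+   needed.  */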
+static inline void
+cache_can_duplicate_bb_p (const_basic_block bb, bool val)
+{
+  unsigned int size = SBITMAP_SIZE (can_duplicate_bb);
+
+  if ((unsigned int) bb->index >= size)
+    {
+      can_duplicate_bb = sbitmap_resize (can_duplicate_bb, size * 2, 0);
+      can_duplicate_bb_computed
+	= sbitmap_resize (can_duplicate_bb_computed, size * 2, 0);
     }
 
+  if (val)
+    bitmap_set_bit (can_duplicate_bb, bb->index);
+  bitmap_set_bit (can_duplicate_bb_computed, bb->index);
+}
+
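+/* Compute whether BB and all of its statements can be duplicated.  */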
+static bool
+compute_can_duplicate_bb_p (const_basic_block bb)
+{
+  if (!can_duplicate_bb_p (bb))
+    return false;
+
   for (gimple_stmt_iterator gsi = gsi_start_bb (CONST_CAST_BB (bb));
        !gsi_end_p (gsi); gsi_next (&gsi))
     {
       gimple *g = gsi_stmt (gsi);
-      if (is_gimple_call (g)
-	  && (gimple_call_internal_p (g, IFN_GOMP_SIMT_ENTER_ALLOC)
-	      || gimple_call_internal_p (g, IFN_GOMP_SIMT_EXIT)))
-	return true;
+      if (!can_duplicate_insn_p (g))
+	return false;
+    }
+
+  return true;
+}
+
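+/* Return the cached can_duplicate result for BB, computing (and, if the
+   cache has been allocated, recording) it when not yet known.  */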
+static bool
+cached_can_duplicate_bb_p (const_basic_block bb)
+{
+  if (can_duplicate_bb)
+    {
+      unsigned int size = SBITMAP_SIZE (can_duplicate_bb);
+      /* Blocks not yet analyzed, or added after the bitmaps were
+	 allocated, fall through and are computed below.  */
+      if ((unsigned int) bb->index < size
+	  && bitmap_bit_p (can_duplicate_bb_computed, bb->index))
+	return bitmap_bit_p (can_duplicate_bb, bb->index);
     }
 
-  return false;
+  bool val = compute_can_duplicate_bb_p (bb);
+  if (can_duplicate_bb)
+    cache_can_duplicate_bb_p (bb, val);
+
+  return val;
+}
+
+/* Return true if we should ignore the basic block for purposes of tracing.  */
+bool
+ignore_bb_p (const_basic_block bb)
+{
+  if (optimize_bb_for_size_p (bb))
+    return true;
+
+  return !cached_can_duplicate_bb_p (bb);
 }
 
-/* Return number of instructions in the block.  */
+/* Determine the number of instructions in BB, store it in *COUNT, and
+   cache whether BB can be duplicated.  */
 
-static int
-count_insns (basic_block bb)
+static void
+analyze_bb (basic_block bb, int *count)
 {
   gimple_stmt_iterator gsi;
   gimple *stmt;
   int n = 0;
+  bool can_duplicate = can_duplicate_bb_p (bb);
 
   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       stmt = gsi_stmt (gsi);
       n += estimate_num_insns (stmt, &eni_size_weights);
+      if (can_duplicate)
+	can_duplicate = can_duplicate_insn_p (stmt);
     }
-  return n;
+
+  *count = n;
+  cache_can_duplicate_bb_p (bb, can_duplicate);
 }
 
 /* Return true if E1 is more frequent than E2.  */
@@ -283,6 +351,10 @@ tail_duplicate (void)
      resize it.  */
   bb_seen = sbitmap_alloc (last_basic_block_for_fn (cfun) * 2);
   bitmap_clear (bb_seen);
+  can_duplicate_bb = sbitmap_alloc (last_basic_block_for_fn (cfun));
+  bitmap_clear (can_duplicate_bb);
+  can_duplicate_bb_computed = sbitmap_alloc (last_basic_block_for_fn (cfun));
+  bitmap_clear (can_duplicate_bb_computed);
   initialize_original_copy_tables ();
 
   if (profile_info && profile_status_for_fn (cfun) == PROFILE_READ)
@@ -296,7 +368,8 @@ tail_duplicate (void)
 
   FOR_EACH_BB_FN (bb, cfun)
     {
-      int n = count_insns (bb);
+      int n;
+      analyze_bb (bb, &n);
       if (!ignore_bb_p (bb))
 	blocks[bb->index] = heap.insert (-bb->count.to_frequency (cfun), bb);
 
@@ -386,6 +459,9 @@ tail_duplicate (void)
 
   free_original_copy_tables ();
   sbitmap_free (bb_seen);
+  sbitmap_free (can_duplicate_bb);
+  sbitmap_free (can_duplicate_bb_computed);
+  can_duplicate_bb = NULL;
+  can_duplicate_bb_computed = NULL;
   free (trace);
   free (counts);
 
