[gomp4] Frame propagation for routines

Nathan Sidwell Wed, 24 Aug 2016 07:35:07 -0700

I've committed this to the gomp4 branch. It addresses an issue I was puzzledwe'd not met, but then I realized I'd been turning the optimizer on and thusinlining things, which hid the problem.

If we pass a reference (i.e. addressof) a frame object to an openecc routinethat itself contains partitioned execution, the partitioned instances willinterpret the address as referring to their own .local stack frame -- eventhough the address has been 'globalized'.

The openacc std doesn't say whether the other threads should refer to theoriginal unique instance, or clone that object. However, for non-references,the object is cloned, and I have taken that approach as it's the simplest.


nathan

2016-08-24  Nathan Sidwell  <nat...@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
	Emit insns for calls too.
	(nvptx_find_par): Always look for worker-level predecessor insn.
	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
	calls.
	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
	(nvptx_process_pars): Propagate frames for calls.

	libgomp/
	* testsuite/libgomp.oacc-c++/ref-1.C: New.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 239735)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -335,8 +335,7 @@ nvptx_emit_forking (unsigned mask, bool
 	 it creates a block with a single successor before entering a
 	 partitooned region.  That is a good candidate for the end of
 	 an SESE region.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_fork (op));
       emit_insn (gen_nvptx_forked (op));
     }
 }
@@ -355,8 +354,7 @@ nvptx_emit_joining (unsigned mask, bool
       /* Emit joining for all non-call pars to ensure there's a single
 	 predecessor for the block the join insn ends up in.  This is
 	 needed for skipping entire loops.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_joining (op));
       emit_insn (gen_nvptx_join (op));
     }
 }
@@ -2489,8 +2487,7 @@ nvptx_find_par (bb_insn_map_t *map, para
 	    par = new parallel (par, mask);
 	    par->forked_block = block;
 	    par->forked_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->fork_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	  }
@@ -2505,8 +2502,7 @@ nvptx_find_par (bb_insn_map_t *map, para
 	    gcc_assert (par->mask == mask);
 	    par->join_block = block;
 	    par->join_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->joining_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 	    par = par->parent;
@@ -3191,29 +3187,34 @@ nvptx_find_sese (auto_vec<basic_block> &
 #undef BB_SET_SESE
 #undef BB_GET_SESE
 
-/* Propagate live state at the start of a partitioned region.  BLOCK
-   provides the live register information, and might not contain
-   INSN. Propagation is inserted just after INSN. RW indicates whether
-   we are reading and/or writing state.  This
+/* Propagate live state at the start of a partitioned region.  IS_CALL
+   indicates whether the propagation is for a (partitioned) call
+   instruction.  BLOCK provides the live register information, and
+   might not contain INSN. Propagation is inserted just after INSN. RW
+   indicates whether we are reading and/or writing state.  This
    separation is needed for worker-level proppagation where we
    essentially do a spill & fill.  FN is the underlying worker
    function to generate the propagation instructions for single
    register.  DATA is user data.
 
-   We propagate the live register set and the entire frame.  We could
-   do better by (a) propagating just the live set that is used within
-   the partitioned regions and (b) only propagating stack entries that
-   are used.  The latter might be quite hard to determine.  */
+   Returns true if we didn't emit any instructions.
+
+   We propagate the live register set for non-calls and the entire
+   frame for calls and non-calls.  We could do better by (a)
+   propagating just the live set that is used within the partitioned
+   regions and (b) only propagating stack entries that are used.  The
+   latter might be quite hard to determine.  */
 
 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
 
-static void
-nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
-		 propagator_fn fn, void *data)
+static bool
+nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
+		 propagate_mask rw, propagator_fn fn, void *data)
 {
   bitmap live = DF_LIVE_IN (block);
   bitmap_iterator iterator;
   unsigned ix;
+  bool empty = true;
 
   /* Copy the frame array.  */
   HOST_WIDE_INT fs = get_frame_size ();
@@ -3225,6 +3226,7 @@ nvptx_propagate (basic_block block, rtx_
       rtx pred = NULL_RTX;
       rtx_code_label *label = NULL;
 
+      empty = false;
       /* The frame size might not be DImode compatible, but the frame
 	 array's declaration will be.  So it's ok to round up here.  */
       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
@@ -3271,18 +3273,21 @@ nvptx_propagate (basic_block block, rtx_
       insn = emit_insn_after (cpy, insn);
     }
 
-  /* Copy live registers.  */
-  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
-    {
-      rtx reg = regno_reg_rtx[ix];
+  if (!is_call)
+    /* Copy live registers.  */
+    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+      {
+	rtx reg = regno_reg_rtx[ix];
 
-      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
-	{
-	  rtx bcast = fn (reg, rw, 0, data);
+	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+	  {
+	    rtx bcast = fn (reg, rw, 0, data);
 
-	  insn = emit_insn_after (bcast, insn);
-	}
-    }
+	    insn = emit_insn_after (bcast, insn);
+	    empty = false;
+	  }
+      }
+  return empty;
 }
 
 /* Worker for nvptx_vpropagate.  */
@@ -3298,12 +3303,13 @@ vprop_gen (rtx reg, propagate_mask pm,
 }
 
 /* Propagate state that is live at start of BLOCK across the vectors
-   of a single warp.  Propagation is inserted just after INSN.   */
+   of a single warp.  Propagation is inserted just after INSN.
+   IS_CALL and return as for nvptx_propagate.  */
 
-static void
-nvptx_vpropagate (basic_block block, rtx_insn *insn)
+static bool
+nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
 {
-  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
 }
 
 /* Worker for nvptx_wpropagate.  */
@@ -3339,10 +3345,10 @@ wprop_gen (rtx reg, propagate_mask pm, u
 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
    indicates if this is just before partitioned mode (do spill), or
    just after it starts (do fill). Sequence is inserted just after
-   INSN.  */
+   INSN.  IS_CALL and return as for nvptx_propagate.  */
 
-static void
-nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+static bool
+nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
 {
   wcast_data_t data;
 
@@ -3350,7 +3356,9 @@ nvptx_wpropagate (bool pre_p, basic_bloc
   data.offset = 0;
   data.ptr = NULL_RTX;
 
-  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  bool empty = nvptx_propagate (is_call, block, insn,
+				pre_p ? PM_read : PM_write, wprop_gen, &data);
+  gcc_assert (empty == !data.offset);
   if (data.offset)
     {
       /* Stuff was emitted, initialize the base pointer now.  */
@@ -3360,6 +3368,7 @@ nvptx_wpropagate (bool pre_p, basic_bloc
       if (worker_bcast_size < data.offset)
 	worker_bcast_size = data.offset;
     }
+  return empty;
 }
 
 /* Emit a worker-level synchronization barrier.  We use different
@@ -3617,18 +3626,23 @@ nvptx_process_pars (parallel *par)
       inner_mask |= par->inner_mask;
     }
 
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-    /* No propagation needed for a call.  */;
-  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-    {
-      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
-      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
-      /* Insert begin and end synchronizations.  */
-      emit_insn_after (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+  
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+    {
+      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
+      bool empty = nvptx_wpropagate (true, is_call,
+				     par->forked_block, par->fork_insn);
+
+      if (!empty || !is_call)
+	{
+	  /* Insert begin and end synchronizations.  */
+	  emit_insn_after (nvptx_wsync (false), par->forked_insn);
+	  emit_insn_before (nvptx_wsync (true), par->joining_insn);
+	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (par->forked_block, par->forked_insn);
+    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
 
   /* Now do siblings.  */
   if (par->next)
Index: libgomp/testsuite/libgomp.oacc-c++/ref-1.C
===================================================================
--- libgomp/testsuite/libgomp.oacc-c++/ref-1.C	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c++/ref-1.C	(working copy)
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+
+extern "C" int printf (char const *, ...);
+
+#pragma acc routine vector
+void Vector (int *ptr,int n,const int &inc)
+{
+  #pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+#pragma acc routine worker
+void Worker (int *ptr, int m, int n, const int &inc)
+{
+  #pragma acc loop worker
+  for (unsigned ix = 0; ix < m; ix++)
+    Vector(ptr + ix * n, n, inc);
+}
+
+int main ()
+{
+  const int n = 32, m=32;
+  
+  int ary[m][n];
+  unsigned ix,  iy;
+  
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (ix << 8) + iy;
+
+#pragma acc parallel copy(ary)
+  {
+    Worker (&ary[0][0], m, n, 1<<16);
+  }
+
+  int err = 0;
+  
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+     }
+
+#pragma acc parallel copy(ary)
+  {
+    Vector (&ary[0][0], m * n, (1<<24) - (1<<16));
+  }
+  
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+     }
+
+  return 0;
+}

[gomp4] Frame propagation for routines

Reply via email to