I've committed this to the gomp4 branch. It addresses an issue I was puzzled
we'd not met, but then I realized I'd been turning the optimizer on and thus
inlining things, which hid the problem.
If we pass a reference (i.e. addressof) a frame object to an openecc routine
that itself contains partitioned execution, the partitioned instances will
interpret the address as referring to their own .local stack frame -- even
though the address has been 'globalized'.
The openacc std doesn't say whether the other threads should refer to the
original unique instance, or clone that object. However, for non-references,
the object is cloned, and I have taken that approach as it's the simplest.
nathan
2016-08-24 Nathan Sidwell <nat...@codesourcery.com>
gcc/
* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
Emit insns for calls too.
(nvptx_find_par): Always look for worker-level predecessor insn.
(nvptx_propagate): Add is_call parm, return bool. Copy frame for
calls.
(nvptx_vpropagate, nvptx_wpropagate): Adjust.
(nvptx_process_pars): Propagate frames for calls.
libgomp/
* testsuite/libgomp.oacc-c++/ref-1.C: New.
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c (revision 239735)
+++ gcc/config/nvptx/nvptx.c (working copy)
@@ -335,8 +335,7 @@ nvptx_emit_forking (unsigned mask, bool
it creates a block with a single successor before entering a
partitooned region. That is a good candidate for the end of
an SESE region. */
- if (!is_call)
- emit_insn (gen_nvptx_fork (op));
+ emit_insn (gen_nvptx_fork (op));
emit_insn (gen_nvptx_forked (op));
}
}
@@ -355,8 +354,7 @@ nvptx_emit_joining (unsigned mask, bool
/* Emit joining for all non-call pars to ensure there's a single
predecessor for the block the join insn ends up in. This is
needed for skipping entire loops. */
- if (!is_call)
- emit_insn (gen_nvptx_joining (op));
+ emit_insn (gen_nvptx_joining (op));
emit_insn (gen_nvptx_join (op));
}
}
@@ -2489,8 +2487,7 @@ nvptx_find_par (bb_insn_map_t *map, para
par = new parallel (par, mask);
par->forked_block = block;
par->forked_insn = end;
- if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
- && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
}
@@ -2505,8 +2502,7 @@ nvptx_find_par (bb_insn_map_t *map, para
gcc_assert (par->mask == mask);
par->join_block = block;
par->join_insn = end;
- if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
- && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent;
@@ -3191,29 +3187,34 @@ nvptx_find_sese (auto_vec<basic_block> &
#undef BB_SET_SESE
#undef BB_GET_SESE
-/* Propagate live state at the start of a partitioned region. BLOCK
- provides the live register information, and might not contain
- INSN. Propagation is inserted just after INSN. RW indicates whether
- we are reading and/or writing state. This
+/* Propagate live state at the start of a partitioned region. IS_CALL
+ indicates whether the propagation is for a (partitioned) call
+ instruction. BLOCK provides the live register information, and
+ might not contain INSN. Propagation is inserted just after INSN. RW
+ indicates whether we are reading and/or writing state. This
separation is needed for worker-level proppagation where we
essentially do a spill & fill. FN is the underlying worker
function to generate the propagation instructions for single
register. DATA is user data.
- We propagate the live register set and the entire frame. We could
- do better by (a) propagating just the live set that is used within
- the partitioned regions and (b) only propagating stack entries that
- are used. The latter might be quite hard to determine. */
+ Returns true if we didn't emit any instructions.
+
+ We propagate the live register set for non-calls and the entire
+ frame for calls and non-calls. We could do better by (a)
+ propagating just the live set that is used within the partitioned
+ regions and (b) only propagating stack entries that are used. The
+ latter might be quite hard to determine. */
typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
-static void
-nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
- propagator_fn fn, void *data)
+static bool
+nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
+ propagate_mask rw, propagator_fn fn, void *data)
{
bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator;
unsigned ix;
+ bool empty = true;
/* Copy the frame array. */
HOST_WIDE_INT fs = get_frame_size ();
@@ -3225,6 +3226,7 @@ nvptx_propagate (basic_block block, rtx_
rtx pred = NULL_RTX;
rtx_code_label *label = NULL;
+ empty = false;
/* The frame size might not be DImode compatible, but the frame
array's declaration will be. So it's ok to round up here. */
fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
@@ -3271,18 +3273,21 @@ nvptx_propagate (basic_block block, rtx_
insn = emit_insn_after (cpy, insn);
}
- /* Copy live registers. */
- EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
- {
- rtx reg = regno_reg_rtx[ix];
+ if (!is_call)
+ /* Copy live registers. */
+ EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+ {
+ rtx reg = regno_reg_rtx[ix];
- if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
- {
- rtx bcast = fn (reg, rw, 0, data);
+ if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+ {
+ rtx bcast = fn (reg, rw, 0, data);
- insn = emit_insn_after (bcast, insn);
- }
- }
+ insn = emit_insn_after (bcast, insn);
+ empty = false;
+ }
+ }
+ return empty;
}
/* Worker for nvptx_vpropagate. */
@@ -3298,12 +3303,13 @@ vprop_gen (rtx reg, propagate_mask pm,
}
/* Propagate state that is live at start of BLOCK across the vectors
- of a single warp. Propagation is inserted just after INSN. */
+ of a single warp. Propagation is inserted just after INSN.
+ IS_CALL and return as for nvptx_propagate. */
-static void
-nvptx_vpropagate (basic_block block, rtx_insn *insn)
+static bool
+nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
{
- nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+ return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
}
/* Worker for nvptx_wpropagate. */
@@ -3339,10 +3345,10 @@ wprop_gen (rtx reg, propagate_mask pm, u
/* Spill or fill live state that is live at start of BLOCK. PRE_P
indicates if this is just before partitioned mode (do spill), or
just after it starts (do fill). Sequence is inserted just after
- INSN. */
+ INSN. IS_CALL and return as for nvptx_propagate. */
-static void
-nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+static bool
+nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
{
wcast_data_t data;
@@ -3350,7 +3356,9 @@ nvptx_wpropagate (bool pre_p, basic_bloc
data.offset = 0;
data.ptr = NULL_RTX;
- nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+ bool empty = nvptx_propagate (is_call, block, insn,
+ pre_p ? PM_read : PM_write, wprop_gen, &data);
+ gcc_assert (empty == !data.offset);
if (data.offset)
{
/* Stuff was emitted, initialize the base pointer now. */
@@ -3360,6 +3368,7 @@ nvptx_wpropagate (bool pre_p, basic_bloc
if (worker_bcast_size < data.offset)
worker_bcast_size = data.offset;
}
+ return empty;
}
/* Emit a worker-level synchronization barrier. We use different
@@ -3617,18 +3626,23 @@ nvptx_process_pars (parallel *par)
inner_mask |= par->inner_mask;
}
- if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
- /* No propagation needed for a call. */;
- else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
- {
- nvptx_wpropagate (false, par->forked_block, par->forked_insn);
- nvptx_wpropagate (true, par->forked_block, par->fork_insn);
- /* Insert begin and end synchronizations. */
- emit_insn_after (nvptx_wsync (false), par->forked_insn);
- emit_insn_before (nvptx_wsync (true), par->joining_insn);
+ bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+
+ if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ {
+ nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
+ bool empty = nvptx_wpropagate (true, is_call,
+ par->forked_block, par->fork_insn);
+
+ if (!empty || !is_call)
+ {
+ /* Insert begin and end synchronizations. */
+ emit_insn_after (nvptx_wsync (false), par->forked_insn);
+ emit_insn_before (nvptx_wsync (true), par->joining_insn);
+ }
}
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
- nvptx_vpropagate (par->forked_block, par->forked_insn);
+ nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
/* Now do siblings. */
if (par->next)
Index: libgomp/testsuite/libgomp.oacc-c++/ref-1.C
===================================================================
--- libgomp/testsuite/libgomp.oacc-c++/ref-1.C (nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c++/ref-1.C (working copy)
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+
+extern "C" int printf (char const *, ...);
+
+#pragma acc routine vector
+void Vector (int *ptr,int n,const int &inc)
+{
+ #pragma acc loop vector
+ for (unsigned ix = 0; ix < n; ix++)
+ ptr[ix] += inc;
+}
+
+#pragma acc routine worker
+void Worker (int *ptr, int m, int n, const int &inc)
+{
+ #pragma acc loop worker
+ for (unsigned ix = 0; ix < m; ix++)
+ Vector(ptr + ix * n, n, inc);
+}
+
+int main ()
+{
+ const int n = 32, m=32;
+
+ int ary[m][n];
+ unsigned ix, iy;
+
+ for (ix = m; ix--;)
+ for (iy = n; iy--;)
+ ary[ix][iy] = (ix << 8) + iy;
+
+#pragma acc parallel copy(ary)
+ {
+ Worker (&ary[0][0], m, n, 1<<16);
+ }
+
+ int err = 0;
+
+ for (ix = m; ix--;)
+ for (iy = n; iy--;)
+ if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
+ {
+ printf ("ary[%u][%u] = %x expected %x\n",
+ ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
+ err++;
+ }
+
+ if (err)
+ {
+ printf ("%d failed\n", err);
+ return 1;
+ }
+
+#pragma acc parallel copy(ary)
+ {
+ Vector (&ary[0][0], m * n, (1<<24) - (1<<16));
+ }
+
+ for (ix = m; ix--;)
+ for (iy = n; iy--;)
+ if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+ {
+ printf ("ary[%u][%u] = %x expected %x\n",
+ ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+ err++;
+ }
+
+ if (err)
+ {
+ printf ("%d failed\n", err);
+ return 1;
+ }
+
+ return 0;
+}