Hi Jakub,
Per the discussion on the Bugzilla PR page, reductions on OpenACC loop
directives will now automatically get a copy clause mapping on the
enclosing parallel construct (unless the variable is local to it or is
named in an explicit firstprivate clause).
There is also a patch for the libgomp testsuite cases. Aside from the
Fortran case, which now needs explicit firstprivate clauses to work,
the other C/C++ cases have been adjusted to remove explicit copy clauses.
(I have not exhaustively searched everywhere to eliminate them though)
This has been tested using gomp-4_0-branch, which is based on GCC 6,
which is what this PR was originally filed for.
I will be committing this to gomp-4_0-branch soon.
Is this okay for gcc-6-branch and trunk as well?
Thanks,
Chung-Lin
2016-08-15 Chung-Lin Tang <[email protected]>
PR middle-end/70895
gcc/
* gimplify.c (omp_add_variable): Adjust/add variable mapping on
enclosing parallel construct for reduction variables on OpenACC loop
directives.
libgomp/
* testsuite/libgomp.oacc-fortran/reduction-7.f90: Add explicit
firstprivate clauses.
* testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Remove explicit
copy clauses.
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-flt.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/collapse-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/collapse-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c: Likewise.
Index: gcc/gimplify.c
===================================================================
--- gcc/gimplify.c (revision 239471)
+++ gcc/gimplify.c (working copy)
@@ -5897,6 +5897,37 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tr
n->value |= flags;
else
splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags);
+
+ /* For reduction clauses in OpenACC loop directives, by default create a
+ copy clause on the enclosing parallel construct for carrying back the
+ results. */
+ if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
+ {
+ struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
+ while (outer_ctx)
+ {
+ n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
+ if (n != NULL)
+ {
+ /* Ignore local variables and explicitly declared clauses. */
+ if (n->value & (GOVD_LOCAL | GOVD_EXPLICIT))
+ break;
+ else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
+ {
+ /* Remove firstprivate and make it a copy map. */
+ n->value &= ~GOVD_FIRSTPRIVATE;
+ n->value |= GOVD_MAP;
+ }
+ }
+ else if (outer_ctx->region_type == ORT_ACC_PARALLEL)
+ {
+ splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl,
+ GOVD_MAP | GOVD_SEEN);
+ break;
+ }
+ outer_ctx = outer_ctx->outer_context;
+ }
+ }
}
/* Notice a threadprivate variable DECL used in OMP context CTX.
Index: libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90 (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-fortran/reduction-7.f90 (working copy)
@@ -50,7 +50,7 @@ subroutine redsub_private(sum, n, arr)
end subroutine redsub_private
-! Bogus reduction on an impliclitly firstprivate variable. The results do
+! Bogus reduction on a firstprivate variable. The results do
! survive the parallel region. The goal here is to ensure that gfortran
! doesn't ICE.
@@ -58,7 +58,7 @@ subroutine redsub_bogus(sum, n)
integer :: sum, n, arr(n)
integer :: i
- !$acc parallel
+ !$acc parallel firstprivate(sum)
!$acc loop gang worker vector reduction (+:sum)
do i = 1, n
sum = sum + 1
@@ -72,7 +72,7 @@ subroutine redsub_combined(sum, n, arr)
integer :: sum, n, arr(n)
integer :: i, j
- !$acc parallel copy (arr)
+ !$acc parallel copy (arr) firstprivate(sum)
!$acc loop gang
do i = 1, n
sum = i;
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c (working copy)
@@ -12,7 +12,7 @@ int main ()
int ondev = 0;
int t = 0, h = 0;
-#pragma acc parallel vector_length(32) copy(t) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ondev)
{
#pragma acc loop vector reduction (+:t)
for (unsigned ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c (working copy)
@@ -11,7 +11,7 @@ int main ()
int ondev = 0;
int t = 0, h = 0;
-#pragma acc parallel num_workers(32) vector_length(32) copy(t) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
{
#pragma acc loop worker vector reduction (+:t)
for (unsigned ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c (working copy)
@@ -13,8 +13,7 @@ void g_np_1()
for (i = 0; i < 1024; i++)
arr[i] = i;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res)
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
/* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 16 } */
/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 16 } */
{
@@ -30,10 +29,9 @@ void g_np_1()
res = hres = 1;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res)
- /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 33 } */
- /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 33 } */
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+ /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 32 } */
+ /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 32 } */
{
#pragma acc loop gang reduction(*:res)
for (i = 0; i < 12; i++)
@@ -57,9 +55,8 @@ void gv_np_1()
for (i = 0; i < 1024; i++)
arr[i] = i;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res)
- /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 60 } */
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+ /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 58 } */
{
#pragma acc loop gang vector reduction(+:res)
for (i = 0; i < 1024; i++)
@@ -83,9 +80,8 @@ void gw_np_1()
for (i = 0; i < 1024; i++)
arr[i] = i;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res)
- /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 86 } */
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+ /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 83 } */
{
#pragma acc loop gang worker reduction(+:res)
for (i = 0; i < 1024; i++)
@@ -109,8 +105,7 @@ void gwv_np_1()
for (i = 0; i < 1024; i++)
arr[i] = i;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res)
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
{
#pragma acc loop gang worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
@@ -134,8 +129,7 @@ void gwv_np_2()
for (i = 0; i < 32768; i++)
arr[i] = i;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res)
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
{
#pragma acc loop gang reduction(+:res)
for (j = 0; j < 32; j++)
@@ -167,7 +161,7 @@ void gwv_np_3()
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copyin(arr) copy(res)
+ copyin(arr)
{
#pragma acc loop gang reduction(+:res)
for (j = 0; j < 32; j++)
@@ -197,8 +191,7 @@ void gwv_np_4()
for (i = 0; i < 32768; i++)
arr[i] = i;
- #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
- copy(res, mres)
+ #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
{
#pragma acc loop gang reduction(+:res) reduction(max:mres)
for (j = 0; j < 32; j++)
@@ -249,7 +242,7 @@ void v_p_1()
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyout(out)
- /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 250 } */
+ /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 243 } */
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
@@ -326,7 +319,7 @@ void w_p_1()
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyout(out)
- /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 327 } */
+ /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 320 } */
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c (working copy)
@@ -12,7 +12,7 @@ int main ()
int ondev = 0;
int t = 0, h = 0;
-#pragma acc parallel num_gangs(32) vector_length(32) copy(t) copy(ondev)
+#pragma acc parallel num_gangs(32) vector_length(32) copy(ondev)
{
#pragma acc loop gang reduction (+:t)
for (unsigned ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c (working copy)
@@ -12,7 +12,7 @@ int main ()
int ondev = 0;
int t = 0, h = 0;
-#pragma acc parallel num_workers(32) vector_length(32) copy(t) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
{
#pragma acc loop worker reduction(+:t)
for (unsigned ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (working copy)
@@ -22,7 +22,7 @@ vector (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N])
{
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -46,7 +46,7 @@ worker (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_workers(32) copyin(ary[0:N])
{
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -70,7 +70,7 @@ gang (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_gangs (32) copyin(ary[0:N])
{
#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-2.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-2.c (working copy)
@@ -8,7 +8,7 @@ main (void)
int i, j, k, l = 0, f = 0, x = 0;
int m1 = 4, m2 = -5, m3 = 17;
-#pragma acc parallel copy(l)
+#pragma acc parallel
#pragma acc loop seq collapse(3) reduction(+:l)
for (i = -2; i < m1; i++)
for (j = m2; j < -2; j++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c (working copy)
@@ -19,7 +19,7 @@ vector (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N])
{
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -43,7 +43,7 @@ worker (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_workers(32) copyin(ary[0:N])
{
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -67,7 +67,7 @@ gang (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_gangs (32) copyin(ary[0:N])
{
#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-4.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-4.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/collapse-4.c (working copy)
@@ -11,7 +11,7 @@ main (void)
memset (b, '\0', sizeof (b));
-#pragma acc parallel copy(b[0:3][0:3]) copy(l)
+#pragma acc parallel copy(b[0:3][0:3])
{
#pragma acc loop collapse(2) reduction(+:l)
for (i = 0; i < 2; i++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c (working copy)
@@ -11,7 +11,7 @@ int main ()
int ondev = 0;
int t = 0, h = 0;
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(t) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ondev)
{
#pragma acc loop gang worker vector reduction(+:t)
for (unsigned ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (working copy)
@@ -22,7 +22,7 @@ vector (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N])
{
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -46,7 +46,7 @@ worker (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_workers(32) copyin(ary[0:N])
{
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -70,7 +70,7 @@ gang (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_gangs (32) copyin(ary[0:N])
{
#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-flt.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-flt.c (revision 239471)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-flt.c (working copy)
@@ -19,7 +19,7 @@ vector (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N])
{
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -43,7 +43,7 @@ worker (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_workers(32) copyin(ary[0:N])
{
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
@@ -67,7 +67,7 @@ gang (Type ary[N], Type sum, Type prod)
{
Type tsum = 0, tprod = 1;
-#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+#pragma acc parallel num_gangs (32) copyin(ary[0:N])
{
#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)