This patch makes the c, c++ and fortran FEs duplicate the reduction clauses in a combined 'acc parallel loop' directive when it splits that directive into separate parallel and loop directives. So given something like
#pragma acc parallel loop reduction(+:var) for (i = 0; i < 10; i++) var++; all of the front ends will split the original directive into #pragma acc parallel reduction(+:var) #pragma acc loop reduction(+:var) for (i = 0; i < 10; i++) var++; Later on, the gimplifier will and an implicit copy(var) clause so that the original variable gets updated. There are still a couple of loose ends with this implementation. For instance, consider #pragma acc parallel loop reduction(+:var) vector num_gangs(10) for (i = 0; i < 10; i++) var++; This will get expanded into #pragma acc parallel reduction(+:var) num_gangs(10) #pragma acc loop vector reduction(+:var) for (i = 0; i < 10; i++) var++; And because OpenACC permits gang to operate in a redundant mode, the output of this loop can be either 10 or 100 depending if the compiler wants to assign gangs to the acc loop (which is permissible because vector clause implies independence). I'm still working out these details with the acc technical committee, but there is a consensus that the reduction clause needs to be duplicated in both the parallel and loop directives. Depending on who I talk to, some people consider the num_gangs example as invalid while others think that the compiler should add an implicit gang clause. By the way, I had to adjust template-reduction.C because of a parser bug that I found in PR70688. I didn't want to make this patch too big by including multiple bug fixes, so I modified that test case temporarily. Later on, once I fix PR70626, I re-enable that test coverage. Is this patch ok for gcc-6 and trunk? I bootstrapped and regression tested on x86_64-none-linux-gnu. Cesar
2016-04-15 Cesar Philippidis <ce...@codesourcery.com> gcc/c-family/ PR middle-end/70626 * c-common.h (c_oacc_split_loop_clauses): Add boolean argument. * c-omp.c (c_oacc_split_loop_clauses): Use it to duplicate reduction clauses in acc parallel loops. gcc/c/ PR middle-end/70626 * c-parser.c (c_parser_oacc_loop): Don't augment mask with OACC_LOOP_CLAUSE_MASK. (c_parser_oacc_kernels_parallel): Update call to c_oacc_split_loop_clauses. gcc/cp/ PR middle-end/70626 * parser.c (cp_parser_oacc_loop): Don't augment mask with OACC_LOOP_CLAUSE_MASK. (cp_parser_oacc_kernels_parallel): Update call to c_oacc_split_loop_clauses. gcc/fortran/ PR middle-end/70626 * trans-openmp.c (gfc_trans_oacc_combined_directive): Duplicate the reduction clause in both parallel and loop directives. gcc/testsuite/ PR middle-end/70626 * c-c++-common/goacc/combined-reduction.c: New test. * gfortran.dg/goacc/reduction-2.f95: Add check for kernels reductions. libgomp/ PR middle-end/70626 * testsuite/libgomp.oacc-c++/template-reduction.C: Adjust test. * testsuite/libgomp.oacc-c-c++-common/combined-reduction.c: New test. * testsuite/libgomp.oacc-fortran/combined-reduction.f90: New test. diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h index fa3746c..dd74d0d 100644 --- a/gcc/c-family/c-common.h +++ b/gcc/c-family/c-common.h @@ -1276,7 +1276,7 @@ extern bool c_omp_check_loop_iv (tree, tree, walk_tree_lh); extern bool c_omp_check_loop_iv_exprs (location_t, tree, tree, tree, tree, walk_tree_lh); extern tree c_finish_oacc_wait (location_t, tree, tree); -extern tree c_oacc_split_loop_clauses (tree, tree *); +extern tree c_oacc_split_loop_clauses (tree, tree *, bool); extern void c_omp_split_clauses (location_t, enum tree_code, omp_clause_mask, tree, tree *); extern tree c_omp_declare_simd_clauses_to_numbers (tree, tree); diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c index 5469d0d5..be401bb 100644 --- a/gcc/c-family/c-omp.c +++ b/gcc/c-family/c-omp.c @@ -861,9 +861,10 @@ c_omp_check_loop_iv_exprs (location_t stmt_loc, tree declv, tree decl, #pragma acc parallel loop */ tree -c_oacc_split_loop_clauses (tree clauses, tree *not_loop_clauses) +c_oacc_split_loop_clauses (tree clauses, tree *not_loop_clauses, + bool is_parallel) { - tree next, loop_clauses; + tree next, loop_clauses, nc; loop_clauses = *not_loop_clauses = NULL_TREE; for (; clauses ; clauses = next) @@ -882,7 +883,23 @@ c_oacc_split_loop_clauses (tree clauses, tree *not_loop_clauses) case OMP_CLAUSE_SEQ: case OMP_CLAUSE_INDEPENDENT: case OMP_CLAUSE_PRIVATE: + OMP_CLAUSE_CHAIN (clauses) = loop_clauses; + loop_clauses = clauses; + break; + + /* Reductions must be duplicated on both constructs. */ case OMP_CLAUSE_REDUCTION: + if (is_parallel) + { + nc = build_omp_clause (OMP_CLAUSE_LOCATION (clauses), + OMP_CLAUSE_REDUCTION); + OMP_CLAUSE_DECL (nc) = OMP_CLAUSE_DECL (clauses); + OMP_CLAUSE_REDUCTION_CODE (nc) + = OMP_CLAUSE_REDUCTION_CODE (clauses); + OMP_CLAUSE_CHAIN (nc) = *not_loop_clauses; + *not_loop_clauses = nc; + } + OMP_CLAUSE_CHAIN (clauses) = loop_clauses; loop_clauses = clauses; break; diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 1b6bacd..17df493 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -13822,6 +13822,8 @@ static tree c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name, omp_clause_mask mask, tree *cclauses, bool *if_p) { + bool is_parallel = ((mask >> PRAGMA_OACC_CLAUSE_REDUCTION) & 1) == 1; + strcat (p_name, " loop"); mask |= OACC_LOOP_CLAUSE_MASK; @@ -13829,7 +13831,7 @@ c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name, cclauses == NULL); if (cclauses) { - clauses = c_oacc_split_loop_clauses (clauses, cclauses); + clauses = c_oacc_split_loop_clauses (clauses, cclauses, is_parallel); if (*cclauses) *cclauses = c_finish_omp_clauses (*cclauses, false); if (clauses) @@ -13924,8 +13926,6 @@ c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser, if (strcmp (p, "loop") == 0) { c_parser_consume_token (parser); - mask |= OACC_LOOP_CLAUSE_MASK; - tree block = c_begin_omp_parallel (); tree clauses; c_parser_oacc_loop (loc, parser, p_name, mask, &clauses, if_p); diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 5486129..c1ed5ef 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -35396,6 +35396,8 @@ static tree cp_parser_oacc_loop (cp_parser *parser, cp_token *pragma_tok, char *p_name, omp_clause_mask mask, tree *cclauses, bool *if_p) { + bool is_parallel = ((mask >> PRAGMA_OACC_CLAUSE_REDUCTION) & 1) == 1; + strcat (p_name, " loop"); mask |= OACC_LOOP_CLAUSE_MASK; @@ -35403,7 +35405,7 @@ cp_parser_oacc_loop (cp_parser *parser, cp_token *pragma_tok, char *p_name, cclauses == NULL); if (cclauses) { - clauses = c_oacc_split_loop_clauses (clauses, cclauses); + clauses = c_oacc_split_loop_clauses (clauses, cclauses, is_parallel); if (*cclauses) *cclauses = finish_omp_clauses (*cclauses, false); if (clauses) @@ -35496,8 +35498,6 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok, if (strcmp (p, "loop") == 0) { cp_lexer_consume_token (parser->lexer); - mask |= OACC_LOOP_CLAUSE_MASK; - tree block = begin_omp_parallel (); tree clauses; cp_parser_oacc_loop (parser, pragma_tok, p_name, mask, &clauses, diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index a905ca6..c2d89eb 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -3497,7 +3497,8 @@ gfc_trans_oacc_combined_directive (gfc_code *code) construct_clauses.independent = false; construct_clauses.tile_list = NULL; construct_clauses.lists[OMP_LIST_PRIVATE] = NULL; - construct_clauses.lists[OMP_LIST_REDUCTION] = NULL; + if (construct_code == OACC_KERNELS) + construct_clauses.lists[OMP_LIST_REDUCTION] = NULL; oacc_clauses = gfc_trans_omp_clauses (&block, &construct_clauses, code->loc); } diff --git a/gcc/testsuite/c-c++-common/goacc/combined-reduction.c b/gcc/testsuite/c-c++-common/goacc/combined-reduction.c new file mode 100644 index 0000000..ecf23f5 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/combined-reduction.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-fopenacc -fdump-tree-gimple" } */ + +#include <assert.h> + +int +main () +{ + int i, v1 = 0, n = 100; + +#pragma acc parallel loop reduction(+:v1) + for (i = 0; i < n; i++) + v1++; + + assert (v1 == n); + +#pragma acc kernels loop reduction(+:v1) + for (i = 0; i < n; i++) + v1++; + + assert (v1 == n); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "omp target oacc_parallel reduction.+:v1. map.tofrom:v1" 1 "gimple" } } */ +/* { dg-final { scan-tree-dump-times "acc loop reduction.+:v1. private.i." 1 "gimple" } } */ +/* { dg-final { scan-tree-dump-times "omp target oacc_kernels map.force_tofrom:n .len: 4.. map.force_tofrom:v1 .len: 4.." 1 "gimple" } } */ +/* { dg-final { scan-tree-dump-times "acc loop reduction.+:v1. private.i." 1 "gimple" } } */ diff --git a/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 b/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 index 929fb0e..4d40998 100644 --- a/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 +++ b/gcc/testsuite/gfortran.dg/goacc/reduction-2.f95 @@ -15,7 +15,7 @@ subroutine foo () !$acc end kernels loop end subroutine -! { dg-final { scan-tree-dump-times "target oacc_parallel firstprivate.a." 1 "gimple" } } +! { dg-final { scan-tree-dump-times "target oacc_parallel reduction..:a. map.tofrom.a." 1 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop private.p. reduction..:a." 1 "gimple" } } ! { dg-final { scan-tree-dump-times "target oacc_kernels map.force_tofrom:a .len: 4.." 1 "gimple" } } ! { dg-final { scan-tree-dump-times "acc loop private.k. reduction..:a." 1 "gimple" } } diff --git a/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C b/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C index fb5924c..6c85fba 100644 --- a/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C +++ b/libgomp/testsuite/libgomp.oacc-c++/template-reduction.C @@ -7,7 +7,7 @@ sum (T array[]) { T s = 0; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (s, array[0:n]) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (array[0:n]) for (int i = 0; i < n; i++) s += array[i]; @@ -25,7 +25,7 @@ sum () for (int i = 0; i < n; i++) array[i] = i+1; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (s) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) for (int i = 0; i < n; i++) s += array[i]; @@ -43,7 +43,7 @@ async_sum (T array[]) for (int i = 0; i < n; i++) array[i] = i+1; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) present (array[0:n]) copy (s) async wait (1) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) present (array[0:n]) async wait (1) for (int i = 0; i < n; i++) s += array[i]; @@ -59,7 +59,7 @@ async_sum (int c) { T s = 0; -#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy(s) firstprivate (c) async wait (1) +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) firstprivate (c) async wait (1) for (int i = 0; i < n; i++) s += i+c; diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-reduction.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-reduction.c new file mode 100644 index 0000000..b5ce4ed --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-reduction.c @@ -0,0 +1,23 @@ +/* Test a combined acc parallel loop reduction. */ + +/* { dg-do run } */ + +#include <assert.h> + +int +main () +{ + int i, v1 = 0, v2 = 0, n = 100; + +#pragma acc parallel loop reduction(+:v1, v2) + for (i = 0; i < n; i++) + { + v1++; + v2++; + } + + assert (v1 == n); + assert (v2 == n); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-fortran/combined-reduction.f90 b/libgomp/testsuite/libgomp.oacc-fortran/combined-reduction.f90 new file mode 100644 index 0000000..d3a61b5 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/combined-reduction.f90 @@ -0,0 +1,19 @@ +! Test a combined acc parallel loop reduction. + +! { dg-do run } + +program test + implicit none + integer i, n, var + + n = 100 + var = 0 + + !$acc parallel loop reduction(+:var) + do i = 1, 100 + var = var + 1 + end do + !$acc end parallel loop + + if (var .ne. n) call abort +end program test