Kernels regions are decomposed into one or more smaller regions that are to be
executed in sequence. With this patch, all of these regions are launched
asynchronously, and a wait directive is added after them. This means that the
host only waits once for the kernels to complete, not once per kernel. If the
original kernels region was marked async, that asynchronous behavior is
preserved, and no wait is added.
2019-07-16 Gergö Barany <ge...@codesourcery.com>
gcc/
* omp-oacc-kernels.c (add_async_clauses_and_wait): New function...
(decompose_kernels_region_body): ... called from here.
gcc/testsuite/
* c-c++-common/goacc/kernels-conversion.c: Test automatically generated
async clauses.
* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
---
gcc/omp-oacc-kernels.c | 56 ++++++++++++++++++++--
.../c-c++-common/goacc/kernels-conversion.c | 5 ++
.../gfortran.dg/goacc/kernels-conversion.f95 | 5 ++
3 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index 11a960c..0fae74a 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -66,7 +66,13 @@ along with GCC; see the file COPYING3. If not see
gang-parallelizable loop inside an if statement is "gang-serialized" by
the transformation.
The transformation visits loops inside such new gang-single-regions and
- removes and warns about any gang annotations. */
+ removes and warns about any gang annotations.
+ - In order to make the host wait only once for the whole region instead
+ of once per kernel launch, the new parallel and serial regions are
+ annotated async. Unless the original kernels region was marked async,
+ the entire region ends with a wait construct. If the original kernels
+ region was marked async, the generated async statements use the async
+ queue the kernels region was annotated with (possibly implicitly). */
/* Helper function for decompose_kernels_region_body. If STMT contains a
"top-level" OMP_FOR statement, returns a pointer to that statement;
@@ -676,6 +682,38 @@ maybe_build_inner_data_region (location_t loc, gimple
*body,
return body;
}
+/* Helper function of decompose_kernels_region_body. The statements in
+ REGION_BODY are expected to be decomposed parallel regions; add an
+ "async" clause to each. Also add a "wait" pragma at the end of the
+ sequence. */
+
+static void
+add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
+{
+ tree default_async_queue
+ = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+ for (gimple_stmt_iterator gsi = gsi_start (*region_body);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ tree target_clauses = gimple_omp_target_clauses (stmt);
+ tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
+ OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
+ OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
+ target_clauses = new_async_clause;
+ gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
+ target_clauses);
+ }
+ /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0). */
+ tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+ tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+ gimple *wait_call = gimple_build_call (wait_fn, 2,
+ sync_arg, integer_zero_node);
+ gimple_set_location (wait_call, loc);
+ gimple_seq_add_stmt (region_body, wait_call);
+}
+
/* Auxiliary analysis of the body of a kernels region, to determine for each
OpenACC loop whether it is control-dependent (i.e., not necessarily
executed every time the kernels region is entered) or not.
@@ -890,10 +928,12 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
except that the num_gangs, num_workers, and vector_length clauses will
only be added to loop regions. The other regions are "gang-single" and
get an explicit num_gangs(1) clause. So separate out the num_gangs,
- num_workers, and vector_length clauses here. */
+ num_workers, and vector_length clauses here.
+ Also check for the presence of an async clause but do not remove it
+ from the kernels clauses. */
tree num_gangs_clause = NULL, num_workers_clause = NULL,
vector_length_clause = NULL;
- tree prev_clause = NULL, next_clause = NULL;
+ tree prev_clause = NULL, next_clause = NULL, async_clause = NULL;
tree parallel_clauses = kernels_clauses;
for (tree c = parallel_clauses; c; c = next_clause)
{
@@ -927,6 +967,8 @@ decompose_kernels_region_body (gimple *kernels_region, tree
kernels_clauses)
}
else
prev_clause = c;
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+ async_clause = c;
}
gimple *kernels_body = gimple_omp_body (kernels_region);
@@ -1090,6 +1132,14 @@ decompose_kernels_region_body (gimple *kernels_region,
tree kernels_clauses)
gimple_seq_add_stmt (®ion_body, single_region);
}
+ /* We want to launch these kernels asynchronously. If the original
+ kernels region had an async clause, this is done automatically because
+ that async clause was copied to the individual regions we created.
+ Otherwise, add an async clause to each newly created region, as well as
+ a wait directive at the end. */
+ if (async_clause == NULL)
+ add_async_clauses_and_wait (loc, ®ion_body);
+
tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
gimple *body = gimple_build_bind (kernels_locals, region_body,
make_node (BLOCK));
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
index ed4d642..3e52ec4 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -49,5 +49,10 @@ main (void)
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3
"convert_oacc_kernels" } } */
/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2
"convert_oacc_kernels" } } */
+/* Each of the parallel regions is async, and there is a final call to
+ __builtin_GOACC_wait. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1
"convert_oacc_kernels" } } */
+
/* Check that the original kernels region is removed. */
/* { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
index f89e46b..559916c 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -47,5 +47,10 @@ end program main
! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3
"convert_oacc_kernels" } }
! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2
"convert_oacc_kernels" } }
+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1
"convert_oacc_kernels" } }
+
! Check that the original kernels region is removed.
! { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } }
--
2.8.1