Hi,
This patch makes parloops use the loop's marked_independent field: for a
loop in an oacc kernels region that is marked independent, the dependence
analysis is skipped and the loop is treated as parallelizable.
Bootstrapped and reg-tested on x86_64.
Committed to gomp-4_0-branch.
Thanks,
- Tom
Use marked_independent in oacc kernels region
2015-07-14 Tom de Vries <t...@codesourcery.com>
* tree-parloops.c (parallelize_loops): Use marked_independent flag in
oacc kernels region.
* c-c++-common/goacc/kernels-independent.c: New test.
* testsuite/libgomp.oacc-c-c++-common/kernels-independent.c: New test.
---
.../c-c++-common/goacc/kernels-independent.c | 40 +++++++++++++++++++
gcc/tree-parloops.c | 21 ++++++++--
.../kernels-independent.c | 45 ++++++++++++++++++++++
3 files changed, 103 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-independent.c
create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-independent.c b/gcc/testsuite/c-c++-common/goacc/kernels-independent.c
new file mode 100644
index 0000000..2f086b6
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-independent.c
@@ -0,0 +1,40 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+void
+foo (unsigned int *a, unsigned int *b, unsigned int *c)
+{
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ #pragma acc loop independent
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+}
+
+/* Check that exactly one loop is reported as parallelizable because it is marked independent, and that no loop is rejected. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized, marked independent" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index f27dfa9..149c336 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -2797,9 +2797,24 @@ parallelize_loops (bool oacc_kernels_p)
if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
continue;
- if (!flag_loop_parallelize_all
- && !loop_parallel_p (loop, &parloop_obstack))
- continue;
+ if (!flag_loop_parallelize_all)
+ {
+ bool independent = (oacc_kernels_p
+ && loop->marked_independent);
+
+ if (independent)
+ {
+ if (dump_file
+ && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ " SUCCESS: may be parallelized, marked independent\n");
+ }
+ else
+ independent = loop_parallel_p (loop, &parloop_obstack);
+
+ if (!independent)
+ continue;
+ }
changed = true;
if (dump_file && (dump_flags & TDF_DETAILS))
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c
new file mode 100644
index 0000000..d169a5f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-independent.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+void __attribute__((noinline,noclone))
+foo (unsigned int *a, unsigned int *b, unsigned int *c)
+{
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ a[i] = i * 2;
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+ {
+ #pragma acc loop independent
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = a[ii] + b[ii];
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != a[i] + b[i])
+ abort ();
+}
+
+int
+main (void)
+{
+ unsigned int *__restrict a;
+ unsigned int *__restrict b;
+ unsigned int *__restrict c;
+
+ a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ foo (a, b, c);
+
+ return 0;
+}
--
1.9.1