On 03/02/2018 09:47 PM, Cesar Philippidis wrote:
        libgomp/
        * plugin/plugin-nvptx.c (nvptx_exec): Adjust calculations of
        workers and vectors.

I wrote a test case that triggers this code, and added it to this patch.

Built on x86_64 with the nvptx accelerator and tested libgomp.

Committed.

Thanks,
- Tom

[nvptx] Handle large vectors in libgomp

2018-04-05  Cesar Philippidis  <ce...@codesourcery.com>
	    Tom de Vries  <t...@codesourcery.com>

	* plugin/plugin-nvptx.c (nvptx_exec): Adjust calculations of
	workers and vectors.
	* testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c: New test.

---
 libgomp/plugin/plugin-nvptx.c                      | 10 +++---
 .../vector-length-128-7.c                          | 41 ++++++++++++++++++++++
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bdc0c30..9b4768f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -734,8 +734,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   int threads_per_block = threads_per_sm > block_size
     ? block_size : threads_per_sm;
 
-  threads_per_block /= warp_size;
-
   if (threads_per_sm > cpu_size)
     threads_per_sm = cpu_size;
 
@@ -802,6 +800,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
   if (seen_zero)
     {
+      int vectors = dims[GOMP_DIM_VECTOR] > 0
+	? dims[GOMP_DIM_VECTOR] : warp_size;
+      int workers = threads_per_block / vectors;
+
       for (i = 0; i != GOMP_DIM_MAX; i++)
 	if (!dims[i])
 	  {
@@ -819,10 +821,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 		  : 2 * dev_size;
 		break;
 	      case GOMP_DIM_WORKER:
-		dims[i] = threads_per_block;
+		dims[i] = workers;
 		break;
 	      case GOMP_DIM_VECTOR:
-		dims[i] = warp_size;
+		dims[i] = vectors;
 		break;
 	      default:
 		abort ();
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c
new file mode 100644
index 0000000..60c264c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c
@@ -0,0 +1,41 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+  for (unsigned int i = 0; i < n; ++i)
+    {
+      a[i] = i % 3;
+      b[i] = i % 5;
+    }
+
+#pragma acc parallel vector_length (128) copyin (a,b) copyout (c)
+  {
+#pragma acc loop worker
+    for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+      for (unsigned int j = 0; j < n / 4; j++)
+	c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+  }
+
+  for (unsigned int i = 0; i < n; ++i)
+    if (c[i] != (i % 3) + (i % 5))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 0, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=8, vectors=128" } */

Reply via email to