Hi,

we've recently added the new feature allowing vector length larger than 32.

But when we compile a test-case like this:
...
#pragma acc routine vector
void __attribute__((noinline, noclone))
Vector (int *ptr, int n, const int inc)
{
  #pragma acc loop vector
  for (unsigned ix = 0; ix < n; ix++)
    ptr[ix] += inc;
}

int
main (void)
{
  const int n = 32, m=32;

  int ary[m][n];
  unsigned ix,  iy;

#pragma acc parallel copy(ary) vector_length(128)
  {
    Vector (&ary[0][0], m * n, (1<<24) - (1<<16));
  }

  return 0;
}
...
the offloading region is compiled with vector length 128, but the routine is compiled with vector length 32, which leads to runtime failures.

The code for the routine assumes that ntid.x == 32 (because state propagation is done using inter-warp shuffle instructions), and calling the routine from the offloading region where ntid.x == 128 breaks that assumption.

An easy fix would be to make vector_length > 32 the default in routines, but for now we don't want to switch it on by default anywhere.

This patch fixes the runtime failure by forcing vector length 32 if an offloading function contains calls to vector-partitionable routines.

Build x86_64 with nvptx accelerator, tested libgomp.

Committed to og7.

Thanks,
- Tom
[nvptx] Force vl32 if calling vector-partitionable routines

2018-04-23  Tom de Vries  <t...@codesourcery.com>

	PR target/85486
	* omp-offload.c (oacc_fn_attrib_level): Remove static.
	* omp-offload.h (oacc_fn_attrib_level): Declare.
	* config/nvptx/nvptx.c (has_vector_partitionable_routine_calls_p): New
	function.
	(nvptx_goacc_validate_dims): Force vector length 32 if offloading
	function calls vector-partitionable routines.

	* testsuite/libgomp.oacc-c-c++-common/pr85486-2.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85486-3.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr85486.c: New test.

---
 gcc/config/nvptx/nvptx.c                           | 90 ++++++++++++++++++++--
 gcc/omp-offload.c                                  |  2 +-
 gcc/omp-offload.h                                  |  1 +
 .../libgomp.oacc-c-c++-common/pr85486-2.c          | 53 +++++++++++++
 .../libgomp.oacc-c-c++-common/pr85486-3.c          | 56 ++++++++++++++
 .../testsuite/libgomp.oacc-c-c++-common/pr85486.c  | 52 +++++++++++++
 6 files changed, 247 insertions(+), 7 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 3aee9cc..77c4d71 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5194,6 +5194,40 @@ nvptx_goacc_needs_vl_warp ()
   return attr != NULL_TREE;
 }
 
+/* Return true if FNDECL contains calls to vector-partitionable routines.  */
+
+static bool
+has_vector_partitionable_routine_calls_p (tree fndecl)
+{
+  if (!fndecl)
+    return false;
+
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
+    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
+	 gsi_next_nondebug (&i))
+      {
+	gimple *stmt = gsi_stmt (i);
+	if (gimple_code (stmt) != GIMPLE_CALL)
+	  continue;
+
+	tree callee = gimple_call_fndecl (stmt);
+	if (!callee)
+	  continue;
+
+	tree attrs  = oacc_get_fn_attrib (callee);
+	if (attrs == NULL_TREE)
+	  return false;
+
+	int partition_level = oacc_fn_attrib_level (attrs);
+	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
+	if (!seq_routine_p)
+	  return true;
+      }
+
+  return false;
+}
+
 /* Validate compute dimensions of an OpenACC offload or routine, fill
    in non-unity defaults.  FN_LEVEL indicates the level at which a
    routine might spawn a loop.  It is negative for non-routines.  If
@@ -5202,13 +5236,45 @@ nvptx_goacc_needs_vl_warp ()
 static bool
 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
 {
-  int default_vector_length = PTX_VECTOR_LENGTH;
+  bool oacc_default_dims_p ATTRIBUTE_UNUSED = false;
+  bool oacc_min_dims_p ATTRIBUTE_UNUSED = false;
+  bool offload_region_p = false;
+  bool routine_p = false;
+  bool routine_seq_p = false;
+
+  if (decl == NULL_TREE)
+    {
+      if (fn_level == -1)
+	oacc_default_dims_p = true;
+      else if (fn_level == -2)
+	oacc_min_dims_p = true;
+      else
+	gcc_unreachable ();
+    }
+  else if (fn_level == -1)
+    offload_region_p = true;
+  else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
+    {
+      routine_p = true;
+      if (fn_level == GOMP_DIM_MAX)
+	routine_seq_p = true;
+    }
+  else
+    gcc_unreachable ();
 
+  int default_vector_length = PTX_VECTOR_LENGTH;
   /* For capability reasons, fallback to vl = 32 for runtime values.  */
   if (dims[GOMP_DIM_VECTOR] == 0)
     default_vector_length = PTX_WARP_SIZE;
   else if (decl)
-    default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
+    {
+      default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
+      if ((offload_region_p
+	   || (routine_p && !routine_seq_p))
+	  && default_vector_length > PTX_WARP_SIZE
+	  && has_vector_partitionable_routine_calls_p (decl))
+	default_vector_length = PTX_WARP_SIZE;
+    }
 
   /* Detect if a function is unsuitable for offloading.  */
   if (!flag_offload_force && decl)
@@ -5234,12 +5300,24 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
 
   bool changed = false;
 
+  if ((offload_region_p
+       || (routine_p && !routine_seq_p))
+      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE
+      && has_vector_partitionable_routine_calls_p (decl))
+    {
+	warning_at (DECL_SOURCE_LOCATION (decl), 0,
+		    G_("using vector_length (%d) due to call to"
+		       " vector-partitionable routine, ignoring %d"),
+		    PTX_WARP_SIZE, dims[GOMP_DIM_VECTOR]);
+      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+      changed = true;
+    }
   /* The vector size must be a positive multiple of the warp size,
      unless this is a SEQ routine.  */
-  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
-      && dims[GOMP_DIM_VECTOR] >= 0
-      && (dims[GOMP_DIM_VECTOR] % 32 != 0
-	  || dims[GOMP_DIM_VECTOR] == 0))
+  else if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
+	   && dims[GOMP_DIM_VECTOR] >= 0
+	   && (dims[GOMP_DIM_VECTOR] % 32 != 0
+	       || dims[GOMP_DIM_VECTOR] == 0))
     {
       if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
 	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 66c6212..dcd7a87 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -85,7 +85,7 @@ vec<tree, va_gc> *offload_funcs, *offload_vars;
 /* Return level at which oacc routine may spawn a partitioned loop, or
    -1 if it is not a routine (i.e. is an offload fn).  */
 
-static int
+int
 oacc_fn_attrib_level (tree attr)
 {
   tree pos = TREE_VALUE (attr);
diff --git a/gcc/omp-offload.h b/gcc/omp-offload.h
index 014ee52..7507338 100644
--- a/gcc/omp-offload.h
+++ b/gcc/omp-offload.h
@@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see
 #define GCC_OMP_DEVICE_H
 
 extern int oacc_get_default_dim (int dim);
+extern int oacc_fn_attrib_level (tree attr);
 
 extern GTY(()) vec<tree, va_gc> *offload_funcs;
 extern GTY(()) vec<tree, va_gc> *offload_vars;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486-2.c
new file mode 100644
index 0000000..a92b5dd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486-2.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1" } } */
+/* { dg-additional-options "-fopenacc-dim=-:-:128" } */
+
+/* Minimized from ref-1.C.  */
+
+#include <stdio.h>
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+Vector (int *ptr, int n, const int inc)
+{
+  #pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+int
+main (void)
+{
+  const int n = 32, m=32;
+
+  int ary[m][n];
+  unsigned ix,  iy;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (1 << 16) + (ix << 8) + iy;
+
+  int err = 0;
+
+#pragma acc parallel copy (ary)
+  {
+    Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
+  }
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486-3.c
new file mode 100644
index 0000000..ae62206
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486-3.c
@@ -0,0 +1,56 @@
+/* { dg-do run } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1" } } */
+/* { dg-additional-options "-fopenacc-dim=-:-:-" } */
+/* { dg-set-target-env-var "GOMP_OPENACC_DIM" "-:-:128" } */
+
+/* Minimized from ref-1.C.  */
+
+#include <stdio.h>
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+Vector (int *ptr, int n, const int inc)
+{
+  #pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+int
+main (void)
+{
+  const int n = 32, m=32;
+
+  int ary[m][n];
+  unsigned ix,  iy;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (1 << 16) + (ix << 8) + iy;
+
+  int err = 0;
+
+#pragma acc parallel copy (ary)
+  {
+    Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
+  }
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+  return 0;
+}
+
+/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486.c
new file mode 100644
index 0000000..f91dee0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr85486.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1" } } */
+
+/* Minimized from ref-1.C.  */
+
+#include <stdio.h>
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+Vector (int *ptr, int n, const int inc)
+{
+  #pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+int
+main (void)
+{
+  const int n = 32, m=32;
+
+  int ary[m][n];
+  unsigned ix,  iy;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (1 << 16) + (ix << 8) + iy;
+
+  int err = 0;
+
+#pragma acc parallel copy (ary) vector_length (128) /* { dg-warning "using vector_length \\(32\\) due to call to vector-partitionable routine, ignoring 128" } */
+  {
+    Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
+  }
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+  return 0;
+}

Reply via email to