On 03/22/2018 06:24 PM, Cesar Philippidis wrote:
On 03/22/2018 09:18 AM, Tom de Vries wrote:

That's obviously not good enough.

When I compile this test-case:
...
int
main (void)
{
   int a[10];
#pragma acc parallel num_workers (16)
#pragma acc loop worker
   for (int i = 0; i < 10; i++)
     a[i] = i;

   return 0;
}
...

I get:
...
  .maxntid 32, 16, 1
...

That's the change you need to isolate.

I attached an updated patch which incorporates the
cfun->machine->axis_dim changes. It now generates more precise arguments
for maxntid.

I'll try this out.

Still, this doesn't address my request: "Also, list in the comment a JIT driver version, and sm_ version and a testcase for which this is required"

Thanks,
- Tom


Cesar


0001-emit-.maxntid-hint.patch


 From 11035dc92884146dc4d974156adcb260568db785 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis <ce...@codesourcery.com>
Date: Thu, 22 Mar 2018 08:05:53 -0700
Subject: [PATCH] emit .maxntid hint

---
  gcc/config/nvptx/nvptx.c | 19 +++++++++++++++++++
  gcc/config/nvptx/nvptx.h |  2 ++
  2 files changed, 21 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index eff87732c4b..3958f71e995 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -76,6 +76,7 @@
  #include "target-def.h"
#define WORKAROUND_PTXJIT_BUG 1
+#define WORKAROUND_PTXJIT_BUG_3 1
/* Define dimension sizes for known hardware. */
  #define PTX_VECTOR_LENGTH 32
@@ -1219,6 +1220,16 @@ nvptx_declare_function_name (FILE *file, const char 
*name, const_tree decl)
       stream, in order to share the prototype writing code.  */
    std::stringstream s;
    write_fn_proto (s, true, name, decl);
+
+#if WORKAROUND_PTXJIT_BUG_3
+  /* Emitting a .maxntid seems to have the effect of encouraging the
+     PTX JIT emit SYNC branches.  */
+  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+      && lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+      s << ".maxntid " << cfun->machine->axis_dim[0] << ", "
+       << cfun->machine->axis_dim[1] << ", 1\n";
+#endif
+
    s << "{\n";
bool return_in_mem = write_return_type (s, false, result_type);
@@ -2831,6 +2842,11 @@ struct offload_attrs
    int max_workers;
  };
+/* Define entries for cfun->machine->axis_dim. */
+
+#define MACH_VECTOR_LENGTH 0
+#define MACH_MAX_WORKERS 1
+
  struct parallel
  {
    /* Parent parallel.  */
@@ -4525,6 +4541,9 @@ nvptx_reorg (void)
populate_offload_attrs (&oa); + cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
+      cfun->machine->axis_dim[MACH_MAX_WORKERS] = oa.max_workers;
+
        /* If there is worker neutering, there must be vector
         neutering.  Otherwise the hardware will fail.  */
        gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 8a14507c88a..958516da604 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -226,6 +226,8 @@ struct GTY(()) machine_function
    int return_mode; /* Return mode of current fn.
                      (machine_mode not defined yet.) */
    rtx axis_predicate[2]; /* Neutering predicates.  */
+  int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is
+                     vector_length, dim[1] is num_workers.   */
    rtx unisimt_master; /* 'Master lane index' for -muniform-simt.  */
    rtx unisimt_predicate; /* Predicate for -muniform-simt.  */
    rtx unisimt_location; /* Mask location for -muniform-simt.  */


Reply via email to