On 03/22/2018 06:24 PM, Cesar Philippidis wrote:
On 03/22/2018 09:18 AM, Tom de Vries wrote:
That's obviously not good enough.
When I compile this test-case:
...
int
main (void)
{
int a[10];
#pragma acc parallel num_workers (16)
#pragma acc loop worker
for (int i = 0; i < 10; i++)
a[i] = i;
return 0;
}
...
I get:
...
.maxntid 32, 16, 1
...
That's the change you need to isolate.
I attached an updated patch which incorporates the
cfun->machine->axis_dim changes. It now generates more precise arguments
for maxntid.
I'll try this out.
Still, this doesn't address my request: "Also, list in the comment a JIT
driver version, and sm_ version and a testcase for which this is required"
Thanks,
- Tom
Cesar
0001-emit-.maxntid-hint.patch
From 11035dc92884146dc4d974156adcb260568db785 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis <ce...@codesourcery.com>
Date: Thu, 22 Mar 2018 08:05:53 -0700
Subject: [PATCH] emit .maxntid hint
---
gcc/config/nvptx/nvptx.c | 19 +++++++++++++++++++
gcc/config/nvptx/nvptx.h | 2 ++
2 files changed, 21 insertions(+)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index eff87732c4b..3958f71e995 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -76,6 +76,7 @@
#include "target-def.h"
#define WORKAROUND_PTXJIT_BUG 1
+#define WORKAROUND_PTXJIT_BUG_3 1
/* Define dimension sizes for known hardware. */
#define PTX_VECTOR_LENGTH 32
@@ -1219,6 +1220,16 @@ nvptx_declare_function_name (FILE *file, const char
*name, const_tree decl)
stream, in order to share the prototype writing code. */
std::stringstream s;
write_fn_proto (s, true, name, decl);
+
+#if WORKAROUND_PTXJIT_BUG_3
+ /* Emitting a .maxntid seems to have the effect of encouraging the
+ PTX JIT emit SYNC branches. */
+ if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+ && lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+ s << ".maxntid " << cfun->machine->axis_dim[0] << ", "
+ << cfun->machine->axis_dim[1] << ", 1\n";
+#endif
+
s << "{\n";
bool return_in_mem = write_return_type (s, false, result_type);
@@ -2831,6 +2842,11 @@ struct offload_attrs
int max_workers;
};
+/* Define entries for cfun->machine->axis_dim. */
+
+#define MACH_VECTOR_LENGTH 0
+#define MACH_MAX_WORKERS 1
+
struct parallel
{
/* Parent parallel. */
@@ -4525,6 +4541,9 @@ nvptx_reorg (void)
populate_offload_attrs (&oa);
+ cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
+ cfun->machine->axis_dim[MACH_MAX_WORKERS] = oa.max_workers;
+
/* If there is worker neutering, there must be vector
neutering. Otherwise the hardware will fail. */
gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 8a14507c88a..958516da604 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -226,6 +226,8 @@ struct GTY(()) machine_function
int return_mode; /* Return mode of current fn.
(machine_mode not defined yet.) */
rtx axis_predicate[2]; /* Neutering predicates. */
+ int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is
+ vector_length, dim[1] is num_workers. */
rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
rtx unisimt_predicate; /* Predicate for -muniform-simt. */
rtx unisimt_location; /* Mask location for -muniform-simt. */