On 03/02/2018 08:18 PM, Cesar Philippidis wrote:
The attached patch adjusts the existing goacc validate_dims target hook and introduces a new goacc adjust_parallelism target hook.
The attached patch now just introduces the nvptx_adjust_parallelism target hook implementation, which enables test-cases to start using the feature.
Build x86_64 with nvptx accelerator and tested libgomp. Committed. Thanks, - Tom
[nvptx] Enable large vectors 2018-04-05 Cesar Philippidis <ce...@codesourcery.com> Tom de Vries <t...@codesourcery.com> * omp-offload.c (oacc_get_default_dim): New function. * omp-offload.h (oacc_get_default_dim): Declare. * config/nvptx/nvptx.c (NVPTX_GOACC_VL_WARP): Define. (nvptx_goacc_needs_vl_warp): New function. (nvptx_goacc_validate_dims): Take larger vector lengths into account. (nvptx_adjust_parallelism): New function. (TARGET_GOACC_ADJUST_PARALLELISM): Define. (populate_offload_attrs): Handle the situation where the default runtime geometry has not been initialized yet for reductions. * testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c: Expect vector length to be 128. * testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c: Same. * testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c: Same. * testsuite/libgomp.oacc-c-c++-common/vred2d-128.c: Same. * testsuite/libgomp.oacc-fortran/gemm.f90: Same. --- gcc/config/nvptx/nvptx.c | 148 +++++++++++++++++++-- gcc/omp-offload.c | 7 + gcc/omp-offload.h | 2 + .../vector-length-128-1.c | 5 +- .../vector-length-128-10.c | 1 - .../vector-length-128-2.c | 5 +- .../libgomp.oacc-c-c++-common/vred2d-128.c | 2 - libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 | 1 - 8 files changed, 153 insertions(+), 18 deletions(-) diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 51bd69d..595413a 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -71,6 +71,7 @@ #include "fold-const.h" #include "intl.h" #include "tree-hash-traits.h" +#include "omp-offload.h" /* This file should be included last. */ #include "target-def.h" @@ -4634,15 +4635,20 @@ populate_offload_attrs (offload_attrs *oa) if (oa->vector_length == 0) { /* FIXME: Need a more graceful way to handle large vector - lengths in OpenACC routines. */ + lengths in OpenACC routines and also -fopenacc-dims. */ if (!lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (current_function_decl))) oa->vector_length = PTX_WARP_SIZE; - else + else if (PTX_VECTOR_LENGTH != PTX_WARP_SIZE) oa->vector_length = PTX_VECTOR_LENGTH; } if (oa->num_workers == 0) - oa->max_workers = PTX_CTA_SIZE / oa->vector_length; + { + if (oa->vector_length == 0) + oa->max_workers = PTX_WORKER_LENGTH; + else + oa->max_workers = PTX_CTA_SIZE / oa->vector_length; + } else oa->max_workers = oa->num_workers; } @@ -5193,6 +5199,19 @@ nvptx_simt_vf () return PTX_WARP_SIZE; } +#define NVPTX_GOACC_VL_WARP "nvptx vl warp" + +/* Return true of the offloaded function needs a vector_length of + PTX_WARP_SIZE. */ + +static bool +nvptx_goacc_needs_vl_warp () +{ + tree attr = lookup_attribute (NVPTX_GOACC_VL_WARP, + DECL_ATTRIBUTES (current_function_decl)); + return attr != NULL_TREE; +} + /* Validate compute dimensions of an OpenACC offload or routine, fill in non-unity defaults. FN_LEVEL indicates the level at which a routine might spawn a loop. It is negative for non-routines. If @@ -5201,6 +5220,14 @@ nvptx_simt_vf () static bool nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) { + int default_vector_length = PTX_VECTOR_LENGTH; + + /* For capability reasons, fallback to vl = 32 for runtime values. */ + if (dims[GOMP_DIM_VECTOR] == 0) + default_vector_length = PTX_WARP_SIZE; + else if (decl) + default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR); + /* Detect if a function is unsuitable for offloading. */ if (!flag_offload_force && decl) { @@ -5225,18 +5252,20 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) bool changed = false; - /* The vector size must be 32, unless this is a SEQ routine. */ + /* The vector size must be a positive multiple of the warp size, + unless this is a SEQ routine. */ if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1 && dims[GOMP_DIM_VECTOR] >= 0 - && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH) + && (dims[GOMP_DIM_VECTOR] % 32 != 0 + || dims[GOMP_DIM_VECTOR] == 0)) { if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0) warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, dims[GOMP_DIM_VECTOR] ? G_("using vector_length (%d), ignoring %d") : G_("using vector_length (%d), ignoring runtime setting"), - PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]); - dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; + default_vector_length, dims[GOMP_DIM_VECTOR]); + dims[GOMP_DIM_VECTOR] = default_vector_length; changed = true; } @@ -5250,16 +5279,77 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) changed = true; } + /* Ensure that num_worker * vector_length < cta size. */ + if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE) + { + warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, + G_("using vector_length (%d), ignoring %d"), + default_vector_length, dims[GOMP_DIM_VECTOR]); + dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; + changed = true; + } + + /* vector_length must not exceed PTX_CTA_SIZE. */ + if (dims[GOMP_DIM_VECTOR] >= PTX_CTA_SIZE) + { + int new_vector = PTX_CTA_SIZE; + if (decl) + new_vector = default_vector_length; + warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, + G_("using vector_length (%d), ignoring %d"), + new_vector, dims[GOMP_DIM_VECTOR]); + dims[GOMP_DIM_VECTOR] = new_vector; + changed = true; + } + + /* Set vector_length to default_vector_length if there are a sufficient + number of free threads in the CTA. */ + if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] <= 0) + { + if (dims[GOMP_DIM_WORKER] * default_vector_length <= PTX_CTA_SIZE) + dims[GOMP_DIM_VECTOR] = default_vector_length; + else if (dims[GOMP_DIM_WORKER] * PTX_WARP_SIZE <= PTX_CTA_SIZE) + dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; + else + error_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, + "vector_length must be at least 32"); + changed = true; + } + + /* Specify a default vector_length. */ + if (dims[GOMP_DIM_VECTOR] < 0) + { + dims[GOMP_DIM_VECTOR] = default_vector_length; + changed = true; + } + + if (nvptx_goacc_needs_vl_warp () && dims[GOMP_DIM_VECTOR] != PTX_WARP_SIZE) + { + dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; + changed = true; + } + if (!decl) { - dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; + bool new_vector = false; + if (dims[GOMP_DIM_VECTOR] <= 1) + { + dims[GOMP_DIM_VECTOR] = default_vector_length; + new_vector = true; + } if (dims[GOMP_DIM_WORKER] < 0) dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; if (dims[GOMP_DIM_GANG] < 0) dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; + if (new_vector + && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE) + dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; changed = true; } + gcc_assert (dims[GOMP_DIM_VECTOR] != 0); + gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE); + return changed; } @@ -5279,6 +5369,45 @@ nvptx_dim_limit (int axis) return 0; } +/* Adjust the parallelism available to a loop given vector_length + associated with the offloaded function. */ + +static unsigned +nvptx_adjust_parallelism (unsigned inner_mask, unsigned outer_mask) +{ + if (nvptx_goacc_needs_vl_warp ()) + return inner_mask; + + bool wv = (inner_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + && (inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)); + offload_attrs oa; + + populate_offload_attrs (&oa); + + if (oa.vector_length == PTX_WARP_SIZE) + return inner_mask; + + /* FIXME: This is overly conservative; worker and vector loop will + eventually be combined. */ + if (wv) + return inner_mask & ~GOMP_DIM_MASK (GOMP_DIM_WORKER); + + /* It's difficult to guarantee that warps in large vector_lengths + will remain convergent when a vector loop is nested inside a + worker loop. Therefore, fallback to setting vector_length to + PTX_WARP_SIZE. Hopefully this condition may be relaxed for + sm_70+ targets. */ + if ((inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) + && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) + { + tree attr = tree_cons (get_identifier (NVPTX_GOACC_VL_WARP), NULL_TREE, + DECL_ATTRIBUTES (current_function_decl)); + DECL_ATTRIBUTES (current_function_decl) = attr; + } + + return inner_mask; +} + /* Determine whether fork & joins are needed. */ static bool @@ -6169,6 +6298,9 @@ nvptx_set_current_function (tree fndecl) #undef TARGET_GOACC_DIM_LIMIT #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit +#undef TARGET_GOACC_ADJUST_PARALLELISM +#define TARGET_GOACC_ADJUST_PARALLELISM nvptx_adjust_parallelism + #undef TARGET_GOACC_FORK_JOIN #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index ed17160..66c6212 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -551,6 +551,13 @@ oacc_xform_tile (gcall *call) static int oacc_default_dims[GOMP_DIM_MAX]; static int oacc_min_dims[GOMP_DIM_MAX]; +int +oacc_get_default_dim (int dim) +{ + gcc_assert (0 <= dim && dim < GOMP_DIM_MAX); + return oacc_default_dims[dim]; +} + /* Parse the default dimension parameter. This is a set of :-separated optional compute dimensions. Each dimension is either a positive integer, or '-' for a dynamic value computed at diff --git a/gcc/omp-offload.h b/gcc/omp-offload.h index 528448b..014ee52 100644 --- a/gcc/omp-offload.h +++ b/gcc/omp-offload.h @@ -22,6 +22,8 @@ along with GCC; see the file COPYING3. If not see #ifndef GCC_OMP_DEVICE_H #define GCC_OMP_DEVICE_H +extern int oacc_get_default_dim (int dim); + extern GTY(()) vec<tree, va_gc> *offload_funcs; extern GTY(()) vec<tree, va_gc> *offload_vars; diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c index fab5b0d..18d77cc 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-1.c @@ -33,7 +33,6 @@ main (void) return 0; } -/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */ -/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 32\\)" "oaccdevlow" } } */ -/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=32" } */ +/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 128\\)" "oaccdevlow" } } */ +/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=128" } */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c index e46b5cf..0658cfd 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-10.c @@ -37,4 +37,3 @@ main (void) return 0; } -/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c index cc6fd55..2ab6499 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-2.c @@ -34,7 +34,6 @@ main (void) return 0; } -/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */ -/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 32\\)" "oaccdevlow" } } */ -/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=32" } */ +/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 1, 128\\)" "oaccdevlow" } } */ +/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=1, vectors=128" } */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c index 1dc5fe0..318c0e6 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vred2d-128.c @@ -42,8 +42,6 @@ gentest (test3, "acc parallel loop gang worker vector_length (128)", gentest (test4, "acc parallel loop", "acc loop reduction(+:t1) reduction(-:t2)") -/* { dg-prune-output "using vector_length \\(32\\), ignoring 128" } */ - int main () diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 index 62b8a45..ad67dce 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 +++ b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 @@ -39,7 +39,6 @@ subroutine openacc_sgemm_128 (m, n, k, alpha, a, b, beta, c) real :: temp !$acc parallel loop copy(c(1:m,1:n)) copyin(a(1:k,1:m),b(1:k,1:n)) vector_length (128) - ! { dg-prune-output "using vector_length \\(32\\), ignoring 128" } do j = 1, n !$acc loop do i = 1, m