On 03/02/2018 09:47 PM, Cesar Philippidis wrote:
libgomp/ * plugin/plugin-nvptx.c (nvptx_exec): Adjust calculations of workers and vectors.
I wrote a test case that triggers this code, and added it to this patch. Built x86_64 with nvptx accelerator and tested libgomp. Committed. Thanks, - Tom
[nvptx] Handle large vectors in libgomp 2018-04-05 Cesar Philippidis <ce...@codesourcery.com> Tom de Vries <t...@codesourcery.com> * plugin/plugin-nvptx.c (nvptx_exec): Adjust calculations of workers and vectors. * testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c: New test. --- libgomp/plugin/plugin-nvptx.c | 10 +++--- .../vector-length-128-7.c | 41 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index bdc0c30..9b4768f 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -734,8 +734,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int threads_per_block = threads_per_sm > block_size ? block_size : threads_per_sm; - threads_per_block /= warp_size; - if (threads_per_sm > cpu_size) threads_per_sm = cpu_size; @@ -802,6 +800,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, if (seen_zero) { + int vectors = dims[GOMP_DIM_VECTOR] > 0 + ? 
dims[GOMP_DIM_VECTOR] : warp_size; + int workers = threads_per_block / vectors; + for (i = 0; i != GOMP_DIM_MAX; i++) if (!dims[i]) { @@ -819,10 +821,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, : 2 * dev_size; break; case GOMP_DIM_WORKER: - dims[i] = threads_per_block; + dims[i] = workers; break; case GOMP_DIM_VECTOR: - dims[i] = warp_size; + dims[i] = vectors; break; default: abort (); diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c new file mode 100644 index 0000000..60c264c --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c @@ -0,0 +1,41 @@ +/* { dg-do run { target openacc_nvidia_accel_selected } } */ +/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */ +/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */ +/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */ + +#include <stdlib.h> + +#define N 1024 + +unsigned int a[N]; +unsigned int b[N]; +unsigned int c[N]; +unsigned int n = N; + +int +main (void) +{ + for (unsigned int i = 0; i < n; ++i) + { + a[i] = i % 3; + b[i] = i % 5; + } + +#pragma acc parallel vector_length (128) copyin (a,b) copyout (c) + { +#pragma acc loop worker + for (unsigned int i = 0; i < 4; i++) +#pragma acc loop vector + for (unsigned int j = 0; j < n / 4; j++) + c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j]; + } + + for (unsigned int i = 0; i < n; ++i) + if (c[i] != (i % 3) + (i % 5)) + abort (); + + return 0; +} + +/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 0, 128\\)" "oaccdevlow" } } */ +/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=8, vectors=128" } */