gomp-4_0-branch already contains the default nvptx runtime enhancements that were recently applied to trunk <https://gcc.gnu.org/ml/gcc-patches/2016-11/msg00252.html>. However, the trunk version includes some tweaks I made involving both pthread locking and formatting changes, which this patch backports.
I've applied this patch to gomp-4_0-branch. Cesar
2016-11-07  Cesar Philippidis  <ce...@codesourcery.com>

	libgomp/
	Backport from trunk
	2016-11-02  Cesar Philippidis  <ce...@codesourcery.com>
		    Nathan Sidwell  <nat...@acm.org>

	* plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
	to determine default geometry.

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bd18418..2e7b020 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -910,10 +910,11 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
          variable to specify runtime defaults. */
       static int default_dims[GOMP_DIM_MAX];
 
+      pthread_mutex_lock (&ptx_dev_lock);
       if (!default_dims[0])
         {
           /* We only read the environment variable once. You can't
-             change it in the middle of execution. The sytntax is
+             change it in the middle of execution. The syntax is
              the same as for the -fopenacc-dim compilation option. */
           const char *env_var = getenv ("GOMP_OPENACC_DIM");
           if (env_var)
@@ -942,15 +943,17 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
           CUdevice dev = nvptx_thread()->ptx_dev->dev;
           /* 32 is the default for known hardware. */
           int gang = 0, worker = 32, vector = 32;
+          CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
 
-          if (CUDA_SUCCESS == cuDeviceGetAttribute
-              (&block_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev)
-              && CUDA_SUCCESS == cuDeviceGetAttribute
-              (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)
-              && CUDA_SUCCESS == cuDeviceGetAttribute
-              (&dev_size, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)
-              && CUDA_SUCCESS == cuDeviceGetAttribute
-              (&cpu_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev))
+          cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
+          cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
+          cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
+          cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
+
+          if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
+              && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
+              && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
+              && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
             {
               GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
                                  " dev_size=%d, cpu_size=%d\n",
@@ -980,6 +983,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
                              default_dims[GOMP_DIM_WORKER],
                              default_dims[GOMP_DIM_VECTOR]);
         }
+      pthread_mutex_unlock (&ptx_dev_lock);
 
       for (i = 0; i != GOMP_DIM_MAX; i++)
         if (!dims[i])