On 08/08/2018 08:19 AM, Tom de Vries wrote:
> On Wed, Aug 08, 2018 at 07:09:16AM -0700, Cesar Philippidis wrote:
>> On 08/07/2018 06:52 AM, Cesar Philippidis wrote:
Thanks for the review. This version should address all of the following
remarks. However, one thing to note ...

>> [nvptx] Use CUDA driver API to select default runtime launch geometry
>>
>> 2018-08-YY  Cesar Philippidis  <ce...@codesourcery.com>
>>
>> libgomp/
>> plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
>> (cuDriverGetVersion): Declare.
>> (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
>> plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
>> cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
>> (ptx_device): Add driver_version member.
>> (nvptx_open_device): Initialize it.
>> (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
>> default num_gangs and num_workers when the driver supports it.
>> ---
>>  libgomp/plugin/cuda-lib.def   |  2 ++
>>  libgomp/plugin/cuda/cuda.h    |  4 ++++
>>  libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 45 insertions(+), 1 deletion(-)
>>
>> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
>> index be8e3b3..f2433e1 100644
>> --- a/libgomp/plugin/cuda-lib.def
>> +++ b/libgomp/plugin/cuda-lib.def
>> @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
>>  CUDA_ONE_CALL (cuCtxDestroy)
>>  CUDA_ONE_CALL (cuCtxGetCurrent)
>>  CUDA_ONE_CALL (cuCtxGetDevice)
>> +CUDA_ONE_CALL (cuDriverGetVersion)
>
> Don't use cuDriverGetVersion.
>
>>  CUDA_ONE_CALL (cuCtxPopCurrent)
>>  CUDA_ONE_CALL (cuCtxPushCurrent)
>>  CUDA_ONE_CALL (cuCtxSynchronize)
>> @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
>>  CUDA_ONE_CALL (cuModuleLoad)
>>  CUDA_ONE_CALL (cuModuleLoadData)
>>  CUDA_ONE_CALL (cuModuleUnload)
>> +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
>
> Use CUDA_ONE_CALL_MAYBE_NULL.
>
>>  CUDA_ONE_CALL (cuStreamCreate)
>>  CUDA_ONE_CALL (cuStreamDestroy)
>>  CUDA_ONE_CALL (cuStreamQuery)
>> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
>> index 4799825..3a790e6 100644
>> --- a/libgomp/plugin/cuda/cuda.h
>> +++ b/libgomp/plugin/cuda/cuda.h
>> @@ -44,6 +44,7 @@ typedef void *CUevent;
>>  typedef void *CUfunction;
>>  typedef void *CUlinkState;
>>  typedef void *CUmodule;
>> +typedef size_t (*CUoccupancyB2DSize)(int);
>>  typedef void *CUstream;
>>
>>  typedef enum {
>> @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
>>  CUresult cuDeviceGet (CUdevice *, int);
>>  CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
>>  CUresult cuDeviceGetCount (int *);
>> +CUresult cuDriverGetVersion(int *);
>>  CUresult cuEventCreate (CUevent *, unsigned);
>>  #define cuEventDestroy cuEventDestroy_v2
>>  CUresult cuEventDestroy (CUevent);
>> @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *,
>> CUmodule, const char *);
>>  CUresult cuModuleLoad (CUmodule *, const char *);
>>  CUresult cuModuleLoadData (CUmodule *, const void *);
>>  CUresult cuModuleUnload (CUmodule);
>> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
>> +                                          CUoccupancyB2DSize, size_t, int);
>>  CUresult cuStreamCreate (CUstream *, unsigned);
>>  #define cuStreamDestroy cuStreamDestroy_v2
>>  CUresult cuStreamDestroy (CUstream);
>> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
>> index 825470a..b0ccf0b 100644
>> --- a/libgomp/plugin/plugin-nvptx.c
>> +++ b/libgomp/plugin/plugin-nvptx.c
>> @@ -376,6 +376,7 @@ struct ptx_device
>>    int max_threads_per_block;
>>    int max_threads_per_multiprocessor;
>>    int default_dims[GOMP_DIM_MAX];
>> +  int driver_version;
>>
>>    struct ptx_image_data *images;  /* Images loaded on device.  */
>>    pthread_mutex_t image_lock;     /* Lock for above list.  */
>> @@ -687,6 +688,7 @@ nvptx_open_device (int n)
>>    ptx_dev->ord = n;
>>    ptx_dev->dev = dev;
>>    ptx_dev->ctx_shared = false;
>> +  ptx_dev->driver_version = 0;
>>
>>    r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
>>    if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
>> @@ -780,6 +782,9 @@ nvptx_open_device (int n)
>>    for (int i = 0; i != GOMP_DIM_MAX; i++)
>>      ptx_dev->default_dims[i] = 0;
>>
>> +  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
>> +  ptx_dev->driver_version = pi;
>> +
>>    ptx_dev->images = NULL;
>>    pthread_mutex_init (&ptx_dev->image_lock, NULL);
>>
>> @@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void
>> **hostaddrs, void **devaddrs,
>>
>>    {
>>      bool default_dim_p[GOMP_DIM_MAX];
>> +    int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
>> +    int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
>> +    int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
>> +

is that I modified the default value for vectors as follows

+        int vectors = default_dim_p[GOMP_DIM_VECTOR]
+          ? 0 : dims[GOMP_DIM_VECTOR];

Technically, trunk only supports warp-sized vectors, but the fallback
code is already checking for the presence of vectors like so

+        if (default_dim_p[GOMP_DIM_VECTOR])
+          dims[GOMP_DIM_VECTOR]
+            = MIN (dims[GOMP_DIM_VECTOR],
+                   (targ_fn->max_threads_per_block / warp_size
+                    * warp_size));

therefore, I had the cuOccupancyMaxPotentialBlockSize code path behave
the same. If you want, I can resubmit a patch without that change
though.

>> +      /* The CUDA driver occupancy calculator is only available on
>> +         CUDA version 6.5 (6050) and newer.  */
>> +#if (CUDA_VERSION >= 6050)
>> +      if (nvthd->ptx_dev->driver_version > 6050)
>
> Use CUDA_CALL_EXISTS.
>
>> +        {
>> +          int grids, blocks;
>> +          CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
>> +                            &blocks, function, NULL, 0,
>> +                            dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
>> +          GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
>> +                             "grid = %d, block = %d\n", grids, blocks);
>> +
>> +          /* Keep the num_gangs proportional to the block size.  The
>> +             constant factor 2 is there to prevent threads from
>> +             idling when there is sufficient work for them.  */
>
> typo: sufficient -> insufficient
>
> Also, reformulate the first part of rationale comment to say: "Keep the
> num_gangs proportional to the block size, in order to ..." or some such, along
> the lines of what you mentioned here (
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00289.html ).
>
>> +          if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
>> +            gangs = 2 * grids * (blocks / warp_size);
>> +
>> +          if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
>> +            workers = blocks / vectors;
>> +        }
>> +#endif
>> +
>>      for (i = 0; i != GOMP_DIM_MAX; i++)
>>        {
>>          default_dim_p[i] = !dims[i];
>>          if (default_dim_p[i])
>> -          dims[i] = nvthd->ptx_dev->default_dims[i];
>> +          switch (i)
>> +            {
>> +            case GOMP_DIM_GANG: dims[i] = gangs; break;
>> +            case GOMP_DIM_WORKER: dims[i] = workers; break;
>> +            case GOMP_DIM_VECTOR: dims[i] = vectors; break;
>> +            default: GOMP_PLUGIN_fatal ("invalid dim");
>> +            }
>>        }
>>
>
> The new default calculation is not cleanly separated from the fallback default
> calculation. I've already shown you how that should be done: (
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00027.html ).

Is this patch OK for trunk? I tested it with CUDA 5.5, 8.0 and 9.0, with
and without --without-cuda-driver.

Thanks,
Cesar
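For illustration only (not part of the patch), here is a minimal
standalone sketch of how the occupancy-driven defaults work out,
assuming hypothetical values for the grid and block sizes that
cuOccupancyMaxPotentialBlockSize might return, and a warp size of 32:

/* Illustration only, not part of the patch: how the occupancy-driven
   default launch geometry works out for hypothetical results from
   cuOccupancyMaxPotentialBlockSize.  */
#include <stdio.h>

int
main (void)
{
  int warp_size = 32;   /* Typical NVIDIA warp size.  */
  int grids = 8;        /* Hypothetical minimum grid size for full occupancy.  */
  int blocks = 1024;    /* Hypothetical occupancy-maximizing block size.  */

  /* num_gangs is kept proportional to the block size; the factor 2
     helps avoid idle threads when there is insufficient work.  */
  int gangs = 2 * grids * (blocks / warp_size);

  /* Trunk only supports warp-sized vectors, so an unset vector
     length falls back to the warp size.  */
  int vectors = warp_size;

  /* The remaining threads of the block become workers.  */
  int workers = blocks / vectors;

  printf ("num_gangs = %d, num_workers = %d, vector_length = %d\n",
          gangs, workers, vectors);
  return 0;
}

With these assumed numbers the defaults come out as 512 gangs,
32 workers and a vector length of 32.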
[nvptx] Use CUDA driver API to select default runtime launch geometry

        PR target/85590

        libgomp/
        * plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
        (cuOccupancyMaxPotentialBlockSize): Declare.
        * plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
        CUDA_ONE_CALL_MAYBE_NULL.
        * plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
        CUoccupancyB2DSize and declare
        cuOccupancyMaxPotentialBlockSize.
        (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
        default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda-lib.def   |  1 +
 libgomp/plugin/cuda/cuda.h    |  3 ++
 libgomp/plugin/plugin-nvptx.c | 77 +++++++++++++++++++++++++++++------
 3 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 29028b504a0..b2a4c2154eb 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..b4c1b29c5d8 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;

 typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6799a264976..9a4bc11e5fe 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
                        const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
 #else
+typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
                            const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 #endif

 #define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,71 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,

   {
     bool default_dim_p[GOMP_DIM_MAX];
     for (i = 0; i != GOMP_DIM_MAX; i++)
+      default_dim_p[i] = !dims[i];
+
+    if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
       {
-        default_dim_p[i] = !dims[i];
-        if (default_dim_p[i])
-          dims[i] = nvthd->ptx_dev->default_dims[i];
-      }
+        for (i = 0; i != GOMP_DIM_MAX; i++)
+          {
+            if (default_dim_p[i])
+              dims[i] = nvthd->ptx_dev->default_dims[i];
+          }

-    if (default_dim_p[GOMP_DIM_VECTOR])
-      dims[GOMP_DIM_VECTOR]
-        = MIN (dims[GOMP_DIM_VECTOR],
-               (targ_fn->max_threads_per_block / warp_size * warp_size));
+        if (default_dim_p[GOMP_DIM_VECTOR])
+          dims[GOMP_DIM_VECTOR]
+            = MIN (dims[GOMP_DIM_VECTOR],
+                   (targ_fn->max_threads_per_block / warp_size
+                    * warp_size));

-    if (default_dim_p[GOMP_DIM_WORKER])
-      dims[GOMP_DIM_WORKER]
-        = MIN (dims[GOMP_DIM_WORKER],
-               targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+        if (default_dim_p[GOMP_DIM_WORKER])
+          dims[GOMP_DIM_WORKER]
+            = MIN (dims[GOMP_DIM_WORKER],
+                   targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+      }
+    else
+      {
+        int vectors = default_dim_p[GOMP_DIM_VECTOR]
+          ? 0 : dims[GOMP_DIM_VECTOR];
+        int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+        int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+        int grids, blocks;
+
+        CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+                          &blocks, function, NULL, 0,
+                          dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+        GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+                           "grid = %d, block = %d\n", grids, blocks);
+
+        /* Keep num_gangs proportional to the block size.  In the
+           case where the block size is limited by shared memory
+           or register file capacity, this keeps the runtime from
+           over-assigning gangs to multiprocessor units whose
+           state would otherwise be swapped out more often than
+           necessary.  The constant factor 2 is there to prevent
+           threads from idling when there is insufficient work
+           for them.  */
+        if (gangs == 0)
+          gangs = 2 * grids * (blocks / warp_size);
+
+        if (vectors == 0)
+          vectors = warp_size;
+
+        if (workers == 0)
+          workers = blocks / vectors;
+
+        for (i = 0; i != GOMP_DIM_MAX; i++)
+          {
+            default_dim_p[i] = !dims[i];
+            if (default_dim_p[i])
+              switch (i)
+                {
+                case GOMP_DIM_GANG: dims[i] = gangs; break;
+                case GOMP_DIM_WORKER: dims[i] = workers; break;
+                case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+                default: GOMP_PLUGIN_fatal ("invalid dim");
+                }
+          }
+      }
   }
 }
-- 
2.17.1
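For reference, the same occupancy query can be exercised directly
against the CUDA driver API outside of libgomp. The sketch below is
only illustrative: the module file "kernel.ptx" and kernel name
"my_kernel" are hypothetical, and error handling is reduced to a bare
exit.

/* Illustrative only: a direct cuOccupancyMaxPotentialBlockSize query
   through the CUDA driver API, independent of libgomp.  "kernel.ptx"
   and "my_kernel" are hypothetical; build with something like
   "gcc occupancy.c -lcuda".  */
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

static void
cu_check (CUresult r, const char *what)
{
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "%s failed with error %d\n", what, (int) r);
      exit (EXIT_FAILURE);
    }
}

int
main (void)
{
  CUdevice dev;
  CUcontext ctx;
  CUmodule mod;
  CUfunction fn;
  int min_grid_size, block_size;

  cu_check (cuInit (0), "cuInit");
  cu_check (cuDeviceGet (&dev, 0), "cuDeviceGet");
  cu_check (cuCtxCreate (&ctx, 0, dev), "cuCtxCreate");
  /* Hypothetical PTX module and kernel name.  */
  cu_check (cuModuleLoad (&mod, "kernel.ptx"), "cuModuleLoad");
  cu_check (cuModuleGetFunction (&fn, mod, "my_kernel"),
            "cuModuleGetFunction");

  /* No dynamic shared-memory callback, no dynamic shared memory,
     no upper bound on the block size.  */
  cu_check (cuOccupancyMaxPotentialBlockSize (&min_grid_size, &block_size,
                                              fn, NULL, 0, 0),
            "cuOccupancyMaxPotentialBlockSize");
  printf ("grid = %d, block = %d\n", min_grid_size, block_size);

  cu_check (cuModuleUnload (mod), "cuModuleUnload");
  cu_check (cuCtxDestroy (ctx), "cuCtxDestroy");
  return 0;
}

The printed grid/block pair is the kind of value the plugin's
heuristic above turns into default num_gangs and num_workers.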