On 08/07/2018 06:52 AM, Cesar Philippidis wrote: > I attached an updated version of the CUDA driver patch, although I > haven't rebased it against your changes yet. It still needs to be tested > against CUDA 5.5 using the system's/Nvidia's cuda.h. But I wanted to give > you an update. > > Does this patch look OK, at least after testing completes? I removed the > tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't > supported in the older drivers.
I've finally finished testing this patch. Aside from a couple of regressions with CUDA 5.5 in libgomp.oacc-c-c++-common/lib-75.c, lib-76.c and lib-79.c, the results came back clean. This patch has been tested in the following ways using a K40 GPU: * Using GCC's cuda.h with CUDA 9.2 drivers. * Using cuda.h from CUDA 5.5 and Nvidia drivers 331.133 (supports CUDA 6.0) and the driver from CUDA 8.0. * Using cuda.h from CUDA 8.0. As mentioned before, because GCC's cuda.h defines CUDA_VERSION as 8000, there was a conflict with using it against CUDA 5.5, because of the missing cuLinkAddData_v2 symbol. Note how the usage of cuOccupancyMaxPotentialBlockSize is guarded by a check on CUDA_VERSION. I don't really like this, but it's a necessary evil of maintaining backwards compatibility. Is this patch OK for trunk? Thanks, Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry 2018-08-YY Cesar Philippidis <ce...@codesourcery.com> libgomp/ plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef. (cuDriverGetVersion): Declare. (cuOccupancyMaxPotentialBlockSize): Declare. plugin/cuda-lib.def (cuDriverGetVersion) (cuOccupancyMaxPotentialBlockSize): New CUDA_ONE_CALL entries. plugin/plugin-nvptx.c (ptx_device): Add driver_version member. (nvptx_open_device): Initialize it. (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the default num_gangs and num_workers when the driver supports it. --- libgomp/plugin/cuda-lib.def | 2 ++ libgomp/plugin/cuda/cuda.h | 4 ++++ libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index be8e3b3..f2433e1 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate) CUDA_ONE_CALL (cuCtxDestroy) CUDA_ONE_CALL (cuCtxGetCurrent) CUDA_ONE_CALL (cuCtxGetDevice) +CUDA_ONE_CALL (cuDriverGetVersion) CUDA_ONE_CALL (cuCtxPopCurrent) CUDA_ONE_CALL (cuCtxPushCurrent) CUDA_ONE_CALL (cuCtxSynchronize) @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) CUDA_ONE_CALL (cuModuleLoad) CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleUnload) +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamDestroy) CUDA_ONE_CALL (cuStreamQuery) diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 4799825..3a790e6 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -44,6 +44,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; typedef enum { @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void); CUresult cuDeviceGet (CUdevice *, int); CUresult 
cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice); CUresult cuDeviceGetCount (int *); +CUresult cuDriverGetVersion(int *); CUresult cuEventCreate (CUevent *, unsigned); #define cuEventDestroy cuEventDestroy_v2 CUresult cuEventDestroy (CUevent); @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleLoad (CUmodule *, const char *); CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); CUresult cuStreamCreate (CUstream *, unsigned); #define cuStreamDestroy cuStreamDestroy_v2 CUresult cuStreamDestroy (CUstream); diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 825470a..b0ccf0b 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -376,6 +376,7 @@ struct ptx_device int max_threads_per_block; int max_threads_per_multiprocessor; int default_dims[GOMP_DIM_MAX]; + int driver_version; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. 
*/ @@ -687,6 +688,7 @@ nvptx_open_device (int n) ptx_dev->ord = n; ptx_dev->dev = dev; ptx_dev->ctx_shared = false; + ptx_dev->driver_version = 0; r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev); if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) @@ -780,6 +782,9 @@ nvptx_open_device (int n) for (int i = 0; i != GOMP_DIM_MAX; i++) ptx_dev->default_dims[i] = 0; + CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi); + ptx_dev->driver_version = pi; + ptx_dev->images = NULL; pthread_mutex_init (&ptx_dev->image_lock, NULL); @@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, { bool default_dim_p[GOMP_DIM_MAX]; + int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR]; + int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER]; + int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG]; + + /* The CUDA driver occupancy calculator is only available on + CUDA version 6.5 (6050) and newer. */ +#if (CUDA_VERSION >= 6050) + if (nvthd->ptx_dev->driver_version > 6050) + { + int grids, blocks; + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, + &blocks, function, NULL, 0, + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " + "grid = %d, block = %d\n", grids, blocks); + + /* Keep the num_gangs proportional to the block size. The + constant factor 2 is there to prevent threads from + idling when there is sufficient work for them. 
*/ + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0) + gangs = 2 * grids * (blocks / warp_size); + + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0) + workers = blocks / vectors; + } +#endif + for (i = 0; i != GOMP_DIM_MAX; i++) { default_dim_p[i] = !dims[i]; if (default_dim_p[i]) - dims[i] = nvthd->ptx_dev->default_dims[i]; + switch (i) + { + case GOMP_DIM_GANG: dims[i] = gangs; break; + case GOMP_DIM_WORKER: dims[i] = workers; break; + case GOMP_DIM_VECTOR: dims[i] = vectors; break; + default: GOMP_PLUGIN_fatal ("invalid dim"); + } } if (default_dim_p[GOMP_DIM_VECTOR]) -- 2.7.4