On 08/07/2018 06:52 AM, Cesar Philippidis wrote:

> I attached an updated version of the CUDA driver patch, although I
> haven't rebased it against your changes yet. It still needs to be tested
> against CUDA 5.5 using the systems/Nvidia's cuda.h. But I wanted to give
> you an update.
> 
> Does this patch look OK, at least after testing completes? I removed the
> tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't
> supported in the older drivers.

I've finally finished testing this patch. Apart from a couple of
regressions with CUDA 5.5 in libgomp.oacc-c-c++-common/lib-75.c,
lib-76.c and lib-79.c, the results came back clean.

This patch has been tested in the following ways using a K40 GPU:

  * Using GCC's cuda.h with CUDA 9.2 drivers.
  * Using cuda.h from CUDA 5.5 with Nvidia driver 331.133 (which supports
    CUDA 6.0) and with the driver from CUDA 8.0.
  * Using cuda.h from CUDA 8.0.

As mentioned before, because GCC's cuda.h defines CUDA_VERSION as 8000,
there was a conflict with using it against CUDA 5.5, because of the
missing cuLinkAddData_v2 symbol.

Note how the usage of cuOccupancyMaxPotentialBlockSize is guarded by
a check on CUDA_VERSION. I don't really like this, but it's a necessary
evil for maintaining backwards compatibility.

Is this patch OK for trunk?

Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry

2018-08-YY  Cesar Philippidis  <ce...@codesourcery.com>

	libgomp/
	plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuDriverGetVersion): Declare.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	plugin/cuda-lib.def (cuDriverGetVersion): New CUDA_ONE_CALL entry.
	(cuOccupancyMaxPotentialBlockSize): Likewise.
	plugin/plugin-nvptx.c (ptx_device): Add driver_version member.
	(nvptx_open_device): Initialize it.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda-lib.def   |  2 ++
 libgomp/plugin/cuda/cuda.h    |  4 ++++
 libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index be8e3b3..f2433e1 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
 CUDA_ONE_CALL (cuCtxDestroy)
 CUDA_ONE_CALL (cuCtxGetCurrent)
 CUDA_ONE_CALL (cuCtxGetDevice)
+CUDA_ONE_CALL (cuDriverGetVersion)
 CUDA_ONE_CALL (cuCtxPopCurrent)
 CUDA_ONE_CALL (cuCtxPushCurrent)
 CUDA_ONE_CALL (cuCtxSynchronize)
@@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825..3a790e6 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
 CUresult cuDeviceGet (CUdevice *, int);
 CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
 CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion(int *);
 CUresult cuEventCreate (CUevent *, unsigned);
 #define cuEventDestroy cuEventDestroy_v2
 CUresult cuEventDestroy (CUevent);
@@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 825470a..b0ccf0b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -376,6 +376,7 @@ struct ptx_device
   int max_threads_per_block;
   int max_threads_per_multiprocessor;
   int default_dims[GOMP_DIM_MAX];
+  int driver_version;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -687,6 +688,7 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->driver_version = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -780,6 +782,9 @@ nvptx_open_device (int n)
   for (int i = 0; i != GOMP_DIM_MAX; i++)
     ptx_dev->default_dims[i] = 0;
 
+  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
+  ptx_dev->driver_version = pi;
+
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
@@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
       {
 	bool default_dim_p[GOMP_DIM_MAX];
+	int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
+	int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
+	int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
+
+	/* The CUDA driver occupancy calculator is only available on
+	   CUDA version 6.5 (6050) and newer.  */
+#if (CUDA_VERSION >= 6050)
+	if (nvthd->ptx_dev->driver_version > 6050)
+	  {
+	    int grids, blocks;
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    /* Keep the num_gangs proportional to the block size.  The
+	       constant factor 2 is there to prevent threads from
+	       idling when there is sufficient work for them.  */
+	    if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
+	      gangs = 2 * grids * (blocks / warp_size);
+
+	    if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
+	      workers = blocks / vectors;
+	  }
+#endif
+
 	for (i = 0; i != GOMP_DIM_MAX; i++)
 	  {
 	    default_dim_p[i] = !dims[i];
 	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	      switch (i)
+		{
+		case GOMP_DIM_GANG: dims[i] = gangs; break;
+		case GOMP_DIM_WORKER: dims[i] = workers; break;
+		case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		default: GOMP_PLUGIN_fatal ("invalid dim");
+		}
 	  }
 
 	if (default_dim_p[GOMP_DIM_VECTOR])
-- 
2.7.4

Reply via email to