On 08/06/2018 11:08 PM, Tom de Vries wrote:
> On 08/01/2018 12:18 PM, Tom de Vries wrote:
> 
>> I think we need to add and handle:
>> ...
>>   CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
>> ...
>>
> 
> I realized that the patch I posted introducing CUDA_ONE_CALL_MAYBE_NULL
> was incomplete, and needed to use the weak attribute in case of linking
> against a concrete libcuda.so.
> 
> So, I've now committed a patch implementing just CUDA_ONE_CALL_MAYBE_NULL:
> "[libgomp, nvptx] Handle CUDA_ONE_CALL_MAYBE_NULL" @
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00447.html . You can use
> "CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)" to test for
> existence of the function in the cuda driver API.
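
For reference, my understanding is that such a guard would look roughly
like this; just a sketch, assuming the CUDA_CALL_EXISTS /
CUDA_CALL_ASSERT macros from plugin-nvptx.c and hypothetical
min_grid_size / block_size locals:

  /* Only use the occupancy calculator if the libcuda.so we loaded
     actually provides it; the entry point is missing in drivers older
     than CUDA 6.5.  */
  if (CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &min_grid_size,
                      &block_size, function, NULL, 0, 0);
  else
    {
      /* Fall back to the precomputed default dimensions.  */
    }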

Sorry for taking so long to get this patch updated. It's a slow build
and test cycle getting older versions of CUDA to play nicely. So far,
I've managed to get CUDA 5.5 partially working with Nvidia driver
331.113 (which supports CUDA 6.0), and in the process I spotted an
error in the patch: the cuda.h that ships with libgomp emulates CUDA
version 8.0. That led to problems using cuLinkAddData, because that
function gets remapped to cuLinkAddData_v2 in CUDA 6.5 and newer.
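
To illustrate (this is my reading of the CUDA headers, not something
the patch below changes): a cuda.h that advertises CUDA 6.5 or newer
remaps the plain name to the _v2 entry point, so the plugin ends up
looking for cuLinkAddData_v2, which a CUDA 5.5 libcuda.so does not
export:

  /* From a CUDA >= 6.5 style cuda.h; libgomp's bundled header emulates
     CUDA 8.0 and so behaves the same way.  */
  #define cuLinkAddData cuLinkAddData_v2
  CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
                          const char *, unsigned, CUjit_option *, void **);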

That leads me to a question: do we really want to support older
versions of CUDA without using the system's CUDA header files?

>> The patch doesn't build in a setup with
>> --enable-offload-targets=nvptx-none and without cuda, that enables usage
>> of plugin/cuda/cuda.h:
>> ...
>> /data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error:
>> ‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function);
>> did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’?
>>  CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
>> ...
>>
> 
> I've committed a patch "[libgomp, nvptx, --without-cuda-driver] Don't
> use system cuda driver" @
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00348.html .
> 
> Using --without-cuda-driver should make it easy to build using the
> dlopen interface without having to de-install the system libcuda.so.

I've attached an updated version of the CUDA driver patch, although I
haven't rebased it against your changes yet. It still needs to be
tested against CUDA 5.5 using the system's/Nvidia's cuda.h, but I
wanted to give you an update in the meantime.

Does this patch look OK, at least once testing completes? I removed
the tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API
isn't supported in the older drivers.
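
For reference, here is roughly how the default launch geometry falls
out of the occupancy numbers with the patch below (the values are made
up, purely to illustrate the heuristic):

  /* Hypothetical values: warp_size comes from the device, grids and
     blocks from cuOccupancyMaxPotentialBlockSize.  */
  int warp_size = 32;
  int grids = 16, blocks = 1024;
  int vectors = warp_size;                       /* default vector length  */
  int gangs = 2 * grids * (blocks / warp_size);  /* 2 * 16 * 32 = 1024  */
  int workers = blocks / vectors;                /* 1024 / 32 = 32  */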

Cesar

From 7fc093da173543b43e1d83dd5fb9e00e2b92eb09 Mon Sep 17 00:00:00 2001
From: Cesar Philippidis <ce...@codesourcery.com>
Date: Thu, 26 Jul 2018 11:47:35 -0700
Subject: [PATCH] [nvptx] Use CUDA driver API to select default runtime launch
 geometry

	libgomp/
	* plugin/cuda-lib.def (cuDriverGetVersion): Add.
	(cuOccupancyMaxPotentialBlockSize): Likewise.
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuDriverGetVersion): Declare.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	* plugin/plugin-nvptx.c (CUDA_ONE_CALL_MAYBE_NULL): Define.
	(ptx_device): Add driver_version member.
	(nvptx_open_device): Initialize it.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda-lib.def   |  2 ++
 libgomp/plugin/cuda/cuda.h    |  4 ++++
 libgomp/plugin/plugin-nvptx.c | 41 +++++++++++++++++++++++++++++++++--
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index be8e3b3ec4d..f2433e1f0a9 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
 CUDA_ONE_CALL (cuCtxDestroy)
 CUDA_ONE_CALL (cuCtxGetCurrent)
 CUDA_ONE_CALL (cuCtxGetDevice)
+CUDA_ONE_CALL (cuDriverGetVersion)
 CUDA_ONE_CALL (cuCtxPopCurrent)
 CUDA_ONE_CALL (cuCtxPushCurrent)
 CUDA_ONE_CALL (cuCtxSynchronize)
@@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..3a790e688e0 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
 CUresult cuDeviceGet (CUdevice *, int);
 CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
 CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion(int *);
 CUresult cuEventCreate (CUevent *, unsigned);
 #define cuEventDestroy cuEventDestroy_v2
 CUresult cuEventDestroy (CUevent);
@@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index cc465b4addb..f88c571a38e 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -54,6 +54,8 @@
 
 # define CUDA_ONE_CALL(call) \
   __typeof (call) *call;
+# define CUDA_ONE_CALL_MAYBE_NULL(call) \
+  __typeof (call) *call;
 struct cuda_lib_s {
 #include "cuda-lib.def"
 } cuda_lib;
@@ -80,7 +82,6 @@ init_cuda_lib (void)
   cuda_lib.call = dlsym (h, #call);	\
   if (cuda_lib.call == NULL)		\
     return false;
-#include "cuda-lib.def"
   cuda_lib_inited = true;
   return true;
 }
@@ -355,6 +356,7 @@ struct ptx_device
   int max_threads_per_block;
   int max_threads_per_multiprocessor;
   int default_dims[GOMP_DIM_MAX];
+  int driver_version;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -666,6 +668,7 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->driver_version = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -759,6 +762,9 @@ nvptx_open_device (int n)
   for (int i = 0; i != GOMP_DIM_MAX; i++)
     ptx_dev->default_dims[i] = 0;
 
+  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
+  ptx_dev->driver_version = pi;
+
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
@@ -1152,11 +1158,42 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
       {
 	bool default_dim_p[GOMP_DIM_MAX];
+	int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
+	int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
+	int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
+
+	/* The CUDA driver occupancy calculator is only available on
+	   CUDA version 6.5 (6050) and newer.  */
+	if (nvthd->ptx_dev->driver_version > 6050)
+	  {
+	    int grids, blocks;
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    /* Keep the num_gangs proportional to the block size.  The
+	       constant factor 2 is there to prevent threads from
+	       idling when there is sufficient work for them.  */
+	    if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
+	      gangs = 2 * grids * (blocks / warp_size);
+
+	    if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
+	      workers = blocks / vectors;
+	  }
+
 	for (i = 0; i != GOMP_DIM_MAX; i++)
 	  {
 	    default_dim_p[i] = !dims[i];
 	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	      switch (i)
+		{
+		case GOMP_DIM_GANG: dims[i] = gangs; break;
+		case GOMP_DIM_WORKER: dims[i] = workers; break;
+		case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		default: GOMP_PLUGIN_fatal ("invalid dim");
+		}
 	  }
 
 	if (default_dim_p[GOMP_DIM_VECTOR])
-- 
2.17.1
