On Wed, Aug 08, 2018 at 07:09:16AM -0700, Cesar Philippidis wrote:
> On 08/07/2018 06:52 AM, Cesar Philippidis wrote:
> 
> > I attached an updated version of the CUDA driver patch, although I
> > haven't rebased it against your changes yet. It still needs to be tested
> > against CUDA 5.5 using the systems/Nvidia's cuda.h. But I wanted to give
> > you an update.
> > 
> > Does this patch look OK, at least after testing competes? I removed the
> > tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't
> > supported in the older drivers.
> 
> I've finally finished testing this patch. Besides for a couple of
> regressions with CUDA 5.5 in libgomp.oacc-c-c++-common/lib-75.c,
> lib-76.c and lib-79.c, the results came back clean.
> 
> This patch has been tested the following ways using a K40 GPU:
> 
>   * Using GCC's cuda.h with CUDA 9.2 drivers.
>   * Using cuda.h from CUDA 5.5 and Nvidia drivers 331.133 (supports CUDA
>     6.0) and the driver from CUDA 8.0.
>   * Using cuda.h from CUDA 8.0.
> 
> As mentioned before, because GCC's cuda.h defines CUDA_VERSION as 8000,
> there was a conflict with using it against CUDA 5.5, because of the
> missing cuLinkAddData_v2 symbol.
> 
> Note how the usage of cuOccupancyMaxPotentialBlockSize is guarded by
> checking for the version of CUDA_VERSION. I don't really like this, but
> it's a necessary evil of maintaining backwards compatibility.
> 
> Is this patch OK for trunk?
> 
> Thanks,
> Cesar

> [nvptx] Use CUDA driver API to select default runtime launch geometry
> 
> 2018-08-YY  Cesar Philippidis  <ce...@codesourcery.com>
> 
>       libgomp/
>       plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
>       (cuDriverGetVersion): Declare.
>       (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
>       plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
>       cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
>       (ptx_device): Add driver_version member.
>       (nvptx_open_device): Initialize it.
>       (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
>       default num_gangs and num_workers when the driver supports it.
> ---
>  libgomp/plugin/cuda-lib.def   |  2 ++
>  libgomp/plugin/cuda/cuda.h    |  4 ++++
>  libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 45 insertions(+), 1 deletion(-)
> 
> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
> index be8e3b3..f2433e1 100644
> --- a/libgomp/plugin/cuda-lib.def
> +++ b/libgomp/plugin/cuda-lib.def
> @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
>  CUDA_ONE_CALL (cuCtxDestroy)
>  CUDA_ONE_CALL (cuCtxGetCurrent)
>  CUDA_ONE_CALL (cuCtxGetDevice)
> +CUDA_ONE_CALL (cuDriverGetVersion)

Don't use cuDriverGetVersion.

>  CUDA_ONE_CALL (cuCtxPopCurrent)
>  CUDA_ONE_CALL (cuCtxPushCurrent)
>  CUDA_ONE_CALL (cuCtxSynchronize)
> @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
>  CUDA_ONE_CALL (cuModuleLoad)
>  CUDA_ONE_CALL (cuModuleLoadData)
>  CUDA_ONE_CALL (cuModuleUnload)
> +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)

Use CUDA_ONE_CALL_MAYBE_NULL.

>  CUDA_ONE_CALL (cuStreamCreate)
>  CUDA_ONE_CALL (cuStreamDestroy)
>  CUDA_ONE_CALL (cuStreamQuery)
> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
> index 4799825..3a790e6 100644
> --- a/libgomp/plugin/cuda/cuda.h
> +++ b/libgomp/plugin/cuda/cuda.h
> @@ -44,6 +44,7 @@ typedef void *CUevent;
>  typedef void *CUfunction;
>  typedef void *CUlinkState;
>  typedef void *CUmodule;
> +typedef size_t (*CUoccupancyB2DSize)(int);
>  typedef void *CUstream;
>  
>  typedef enum {
> @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
>  CUresult cuDeviceGet (CUdevice *, int);
>  CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
>  CUresult cuDeviceGetCount (int *);
> +CUresult cuDriverGetVersion(int *);
>  CUresult cuEventCreate (CUevent *, unsigned);
>  #define cuEventDestroy cuEventDestroy_v2
>  CUresult cuEventDestroy (CUevent);
> @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, 
> CUmodule, const char *);
>  CUresult cuModuleLoad (CUmodule *, const char *);
>  CUresult cuModuleLoadData (CUmodule *, const void *);
>  CUresult cuModuleUnload (CUmodule);
> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
> +                                       CUoccupancyB2DSize, size_t, int);
>  CUresult cuStreamCreate (CUstream *, unsigned);
>  #define cuStreamDestroy cuStreamDestroy_v2
>  CUresult cuStreamDestroy (CUstream);
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 825470a..b0ccf0b 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -376,6 +376,7 @@ struct ptx_device
>    int max_threads_per_block;
>    int max_threads_per_multiprocessor;
>    int default_dims[GOMP_DIM_MAX];
> +  int driver_version;
>  
>    struct ptx_image_data *images;  /* Images loaded on device.  */
>    pthread_mutex_t image_lock;     /* Lock for above list.  */
> @@ -687,6 +688,7 @@ nvptx_open_device (int n)
>    ptx_dev->ord = n;
>    ptx_dev->dev = dev;
>    ptx_dev->ctx_shared = false;
> +  ptx_dev->driver_version = 0;
>  
>    r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
>    if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
> @@ -780,6 +782,9 @@ nvptx_open_device (int n)
>    for (int i = 0; i != GOMP_DIM_MAX; i++)
>      ptx_dev->default_dims[i] = 0;
>  
> +  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
> +  ptx_dev->driver_version = pi;
> +
>    ptx_dev->images = NULL;
>    pthread_mutex_init (&ptx_dev->image_lock, NULL);
>  
> @@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void 
> **hostaddrs, void **devaddrs,
>  
>        {
>       bool default_dim_p[GOMP_DIM_MAX];
> +     int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
> +     int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
> +     int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
> +
> +     /* The CUDA driver occupancy calculator is only available on
> +        CUDA version 6.5 (6050) and newer.  */
> +#if (CUDA_VERSION >= 6050)
> +     if (nvthd->ptx_dev->driver_version > 6050)

Use CUDA_CALL_EXISTS.

> +       {
> +         int grids, blocks;
> +         CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
> +                           &blocks, function, NULL, 0,
> +                           dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
> +         GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
> +                            "grid = %d, block = %d\n", grids, blocks);
> +
> +         /* Keep the num_gangs proportional to the block size.  The
> +            constant factor 2 is there to prevent threads from
> +            idling when there is sufficient work for them.  */

typo: sufficient -> insufficient

Also, reformulate the first part of rationale comment to say: "Keep the
num_gangs proportional to the block size, in order to ..." or some such, along
the lines of what you mentioned here (
https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00289.html ).

> +         if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
> +           gangs = 2 * grids * (blocks / warp_size);
> +
> +         if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
> +           workers = blocks / vectors;
> +       }
> +#endif
> +
>       for (i = 0; i != GOMP_DIM_MAX; i++)
>         {
>           default_dim_p[i] = !dims[i];
>           if (default_dim_p[i])
> -           dims[i] = nvthd->ptx_dev->default_dims[i];
> +           switch (i)
> +             {
> +             case GOMP_DIM_GANG: dims[i] = gangs; break;
> +             case GOMP_DIM_WORKER: dims[i] = workers; break;
> +             case GOMP_DIM_VECTOR: dims[i] = vectors; break;
> +             default: GOMP_PLUGIN_fatal ("invalid dim");
> +             }
>         }
>  

The new default calculation is not cleanly separated from the fallback default
calculation. I've already shown you how that should be done: (
https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00027.html ).

Thanks,
- Tom

Reply via email to