Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

Tom de Vries Mon, 13 Aug 2018 05:05:10 -0700

On 08/10/2018 08:39 PM, Cesar Philippidis wrote:
> is that I modified the default value for vectors as follows
> 
> +         int vectors = default_dim_p[GOMP_DIM_VECTOR]
> +           ? 0 : dims[GOMP_DIM_VECTOR];
> 
> Technically, trunk only supports warp-sized vectors, but the fallback
> code is already checking for the presence of vectors as so
> 
> +         if (default_dim_p[GOMP_DIM_VECTOR])
> +           dims[GOMP_DIM_VECTOR]
> +             = MIN (dims[GOMP_DIM_VECTOR],
> +                    (targ_fn->max_threads_per_block / warp_size
> +                     * warp_size));
>


That code handles the case that the default vector size is bigger than
the function being launched allows, independent from whether that
default is calculated by the runtime, or set by GOMP_OPENACC_DIM.

The GOMP_OPENACC_DIM part is forward compatible, given that currently
the compiler doesn't allow the runtime to choose the vector length, and
AFAICT that will remain the same after application of the submitted set
of vector_length patches.

> therefore, I had the cuOccupancyMaxPotentialBlockSize code path behave
> the same.

They don't behave the same. What you add here is ignoring
GOMP_OPENACC_DIM[GOMP_DIM_VECTOR], not handling it. That requires a comment.

Furthermore, by assigning dims[GOMP_DIM_VECTOR] at the start you break
the pattern of the code, which:
- first applies GOMP_OPENACC_DIM
- then further fills in defaults as required
- then applies defaults
I've rewritten this bit to fit the pattern. This result is not pretty,
but it'll do for now.  Changing the pattern may make things better
structured, but this is something we can do in a follow up patch, and
want to do for all dimensions at once, not just for vector, otherwise
the code will become too convoluted.

Btw, I've also noticed that we don't handle a too high
GOMP_OPENACC_DIM[GOMP_DIM_WORKER], I've added a TODO comment for this.

> If you want, I can resubmit a patch without that change though> 
> 0001-nvptx-Use-CUDA-driver-API-to-select-default-runtime-.patch
> 
> 
> [nvptx] Use CUDA driver API to select default runtime launch geometry
> 
>       PR target/85590
> 
>       libgomp/
>       plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.

You forgot to add '* ' in front of all files.

>       (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.

You've added cuOccupancyMaxPotentialBlockSize, not
cuOccupancyMaxPotentialBlockSizeWithFlags.

>       plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
>       CUDA_ONE_CALL_MAYBE_NULL.
>       plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
>       CUoccupancyB2DSize and declare
>       cuOccupancyMaxPotentialBlockSizeWithFlags.

Same here.

>       (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
>       default num_gangs and num_workers when the driver supports it.
> 
> ---
>  libgomp/plugin/cuda-lib.def   |  1 +
>  libgomp/plugin/cuda/cuda.h    |  3 ++
>  libgomp/plugin/plugin-nvptx.c | 77 +++++++++++++++++++++++++++++------
>  3 files changed, 69 insertions(+), 12 deletions(-)
> 
> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
> index 29028b504a0..b2a4c2154eb 100644
> --- a/libgomp/plugin/cuda-lib.def
> +++ b/libgomp/plugin/cuda-lib.def
> @@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
>  CUDA_ONE_CALL (cuModuleLoad)
>  CUDA_ONE_CALL (cuModuleLoadData)
>  CUDA_ONE_CALL (cuModuleUnload)
> +CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
>  CUDA_ONE_CALL (cuStreamCreate)
>  CUDA_ONE_CALL (cuStreamDestroy)
>  CUDA_ONE_CALL (cuStreamQuery)
> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
> index 4799825bda2..b4c1b29c5d8 100644
> --- a/libgomp/plugin/cuda/cuda.h
> +++ b/libgomp/plugin/cuda/cuda.h
> @@ -44,6 +44,7 @@ typedef void *CUevent;
>  typedef void *CUfunction;
>  typedef void *CUlinkState;
>  typedef void *CUmodule;
> +typedef size_t (*CUoccupancyB2DSize)(int);
>  typedef void *CUstream;
>  
>  typedef enum {
> @@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, 
> CUmodule, const char *);
>  CUresult cuModuleLoad (CUmodule *, const char *);
>  CUresult cuModuleLoadData (CUmodule *, const void *);
>  CUresult cuModuleUnload (CUmodule);
> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
> +                                       CUoccupancyB2DSize, size_t, int);
>  CUresult cuStreamCreate (CUstream *, unsigned);
>  #define cuStreamDestroy cuStreamDestroy_v2
>  CUresult cuStreamDestroy (CUstream);
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 6799a264976..9a4bc11e5fe 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void 
> *, size_t,
>                       const char *, unsigned, CUjit_option *, void **);
>  CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
>  #else
> +typedef size_t (*CUoccupancyB2DSize)(int);
>  CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
>                          const char *, unsigned, CUjit_option *, void **);
>  CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
> +                                       CUoccupancyB2DSize, size_t, int);
>  #endif
>  
>  #define DO_PRAGMA(x) _Pragma (#x)
> @@ -1200,21 +1203,71 @@ nvptx_exec (void (*fn), size_t mapnum, void 
> **hostaddrs, void **devaddrs,
>        {
>       bool default_dim_p[GOMP_DIM_MAX];
>       for (i = 0; i != GOMP_DIM_MAX; i++)
> +       default_dim_p[i] = !dims[i];
> +
> +     if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
>         {
> -         default_dim_p[i] = !dims[i];
> -         if (default_dim_p[i])
> -           dims[i] = nvthd->ptx_dev->default_dims[i];
> -       }
> +         for (i = 0; i != GOMP_DIM_MAX; i++)
> +           {
> +             if (default_dim_p[i])
> +               dims[i] = nvthd->ptx_dev->default_dims[i];
> +           }
>  
> -     if (default_dim_p[GOMP_DIM_VECTOR])
> -       dims[GOMP_DIM_VECTOR]
> -         = MIN (dims[GOMP_DIM_VECTOR],
> -                (targ_fn->max_threads_per_block / warp_size * warp_size));
> +         if (default_dim_p[GOMP_DIM_VECTOR])
> +           dims[GOMP_DIM_VECTOR]
> +             = MIN (dims[GOMP_DIM_VECTOR],
> +                    (targ_fn->max_threads_per_block / warp_size
> +                     * warp_size));
>  
> -     if (default_dim_p[GOMP_DIM_WORKER])
> -       dims[GOMP_DIM_WORKER]
> -         = MIN (dims[GOMP_DIM_WORKER],
> -                targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
> +         if (default_dim_p[GOMP_DIM_WORKER])
> +           dims[GOMP_DIM_WORKER]
> +             = MIN (dims[GOMP_DIM_WORKER],
> +                    targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
> +       }
> +     else
> +       {
> +         int vectors = default_dim_p[GOMP_DIM_VECTOR]
> +           ? 0 : dims[GOMP_DIM_VECTOR];

This is badly formatted. Please read
https://www.gnu.org/prep/standards/html_node/Formatting.html .

This is typically formatted like:
...
int vectors = (default_dim_p[GOMP_DIM_VECTOR]
               ? 0 : dims[GOMP_DIM_VECTOR]);
...
or:
...
int vectors = (default_dim_p[GOMP_DIM_VECTOR]
               ? 0
               : dims[GOMP_DIM_VECTOR]);
...

> +         int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
> +         int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
> +         int grids, blocks;
> +
> +         CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
> +                           &blocks, function, NULL, 0,
> +                           dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
> +         GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
> +                            "grid = %d, block = %d\n", grids, blocks);
> +
> +         /* Keep the num_gangs proportional to the block size.  In
> +            the case were a block size is limited by shared-memory
> +            or the register file capacity, the runtime will not
> +            excessively over assign gangs to the multiprocessor
> +            units if their state is going to be swapped out even
> +            more than necessary. The constant factor 2 is there to
> +            prevent threads from idling when there is insufficient
> +            work for them.  */
> +         if (gangs == 0)
> +           gangs = 2 * grids * (blocks / warp_size);
> +
> +         if (vectors == 0)
> +           vectors = warp_size;
> +
> +         if (workers == 0)
> +           workers = blocks / vectors;
> +
> +         for (i = 0; i != GOMP_DIM_MAX; i++)
> +           {

> +             default_dim_p[i] = !dims[i];

This is duplicate code.


> +             if (default_dim_p[i])
> +               switch (i)
> +                 {
> +                 case GOMP_DIM_GANG: dims[i] = gangs; break;
> +                 case GOMP_DIM_WORKER: dims[i] = workers; break;
> +                 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
> +                 default: GOMP_PLUGIN_fatal ("invalid dim");
> +                 }
> +           }
> +       }
>        }
>      }
>  
> -- 2.17.1
> 

Committed as attached.

Thanks,
- Tom

[nvptx] Use CUDA driver API to select default runtime launch geometry

The CUDA driver API starting version 6.5 offers a set of runtime functions to
calculate several occupancy-related measures, as a replacement for the occupancy
calculator spreadsheet.

This patch adds a heuristic for default runtime launch geometry, based on the
new runtime function cuOccupancyMaxPotentialBlockSize.

Build on x86_64 with nvptx accelerator and ran libgomp testsuite.

2018-08-13  Cesar Philippidis  <ce...@codesourcery.com>
	    Tom de Vries  <tdevr...@suse.de>

	PR target/85590
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	* plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
	CUDA_ONE_CALL_MAYBE_NULL.
	* plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
	CUoccupancyB2DSize and declare
	cuOccupancyMaxPotentialBlockSize.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.

---
 libgomp/plugin/cuda-lib.def   |  1 +
 libgomp/plugin/cuda/cuda.h    |  3 ++
 libgomp/plugin/plugin-nvptx.c | 83 ++++++++++++++++++++++++++++++++++++-------
 3 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 29028b504a0..b2a4c2154eb 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..b4c1b29c5d8 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6799a264976..bae1b05ccaa 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
 #else
+typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 #endif
 
 #define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,77 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       {
 	bool default_dim_p[GOMP_DIM_MAX];
 	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  default_dim_p[i] = !dims[i];
+
+	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
 	  {
-	    default_dim_p[i] = !dims[i];
-	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		dims[i] = nvthd->ptx_dev->default_dims[i];
+
+	    if (default_dim_p[GOMP_DIM_VECTOR])
+	      dims[GOMP_DIM_VECTOR]
+		= MIN (dims[GOMP_DIM_VECTOR],
+		       (targ_fn->max_threads_per_block / warp_size
+			* warp_size));
+
+	    if (default_dim_p[GOMP_DIM_WORKER])
+	      dims[GOMP_DIM_WORKER]
+		= MIN (dims[GOMP_DIM_WORKER],
+		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
 	  }
+	else
+	  {
+	    /* Handle the case that the compiler allows the runtime to choose
+	       the vector-length conservatively, by ignoring
+	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
+	       it.  */
+	    int vectors = 0;
+	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
+	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
+	       exceed targ_fn->max_threads_per_block. */
+	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+	    int grids, blocks;
+
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    /* Keep the num_gangs proportional to the block size.  In
+	       the case were a block size is limited by shared-memory
+	       or the register file capacity, the runtime will not
+	       excessively over assign gangs to the multiprocessor
+	       units if their state is going to be swapped out even
+	       more than necessary. The constant factor 2 is there to
+	       prevent threads from idling when there is insufficient
+	       work for them.  */
+	    if (gangs == 0)
+	      gangs = 2 * grids * (blocks / warp_size);
+
+	    if (vectors == 0)
+	      vectors = warp_size;
+
+	    if (workers == 0)
+	      {
+		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
+				      ? vectors
+				      : dims[GOMP_DIM_VECTOR]);
+		workers = blocks / actual_vectors;
+	      }
 
-	if (default_dim_p[GOMP_DIM_VECTOR])
-	  dims[GOMP_DIM_VECTOR]
-	    = MIN (dims[GOMP_DIM_VECTOR],
-		   (targ_fn->max_threads_per_block / warp_size * warp_size));
-
-	if (default_dim_p[GOMP_DIM_WORKER])
-	  dims[GOMP_DIM_WORKER]
-	    = MIN (dims[GOMP_DIM_WORKER],
-		   targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		switch (i)
+		  {
+		  case GOMP_DIM_GANG: dims[i] = gangs; break;
+		  case GOMP_DIM_WORKER: dims[i] = workers; break;
+		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		  default: GOMP_PLUGIN_fatal ("invalid dim");
+		  }
+	  }
       }
     }

Re: [PATCH,nvptx] Use CUDA driver API to select default runtime launch, geometry

Reply via email to