This complements multiple teams support on the libgomp plugin side.

	* plugin/plugin-nvptx.c (struct targ_fn_descriptor): Add new fields.
	(struct ptx_device): Ditto.  Set them...
	(nvptx_open_device): ...here.
	(GOMP_OFFLOAD_load_image): Set new targ_fn_descriptor fields.
	(nvptx_adjust_launch_bounds): New.  Use it...
	(GOMP_OFFLOAD_run): ...here.
---
 libgomp/ChangeLog.gomp-nvptx  |   9 ++++
 libgomp/plugin/plugin-nvptx.c | 106 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 109 insertions(+), 6 deletions(-)
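As a reading aid, here is a minimal standalone sketch of the launch-bounds
heuristic this patch introduces (see nvptx_adjust_launch_bounds in the diff
below).  The kernel and device numbers in main () are hypothetical, chosen only
to make the arithmetic concrete; in the plugin the real values come from
cuFuncGetAttribute and cuDeviceGetAttribute.

/* Standalone sketch of the launch-bounds heuristic; compile with a plain
   C compiler, no CUDA needed.  */
#include <stdio.h>

static void
adjust_launch_bounds (int fn_regs_per_thread, int fn_max_threads_per_block,
		      int dev_regs_per_sm, int dev_num_sms,
		      long *teams_p, long *threads_p)
{
  /* threads_p counts warps, not threads: one warp is 32 threads.  Cap at
     32 warps per block, mirroring the NVPTX backend/libgcc limit the patch
     comments describe.  */
  int max_warps_block = fn_max_threads_per_block / 32;
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;		/* Default: 8 warps per block.  */
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  /* Estimate resident blocks from register pressure alone.  */
  int regs_per_block = fn_regs_per_thread * 32 * *threads_p;
  int max_blocks = dev_regs_per_sm / regs_per_block * dev_num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

int
main (void)
{
  /* Hypothetical kernel: 32 registers per thread, 1024 max threads per
     block, on a hypothetical device with 65536 registers per SM and 16 SMs.
     0 means "let the plugin choose".  */
  long teams = 0, threads = 0;
  adjust_launch_bounds (32, 1024, 65536, 16, &teams, &threads);
  /* regs_per_block = 32 * 32 * 8 = 8192; 65536 / 8192 * 16 = 128 blocks.  */
  printf ("teams=%ld threads(warps)=%ld\n", teams, threads);	/* 128, 8 */
  return 0;
}

The estimate deliberately considers register pressure only; as the in-code
comment notes, the actual limit may be lower and can be queried through the
occupancy interface available since CUDA 6.0.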
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 87e0494..b7bf59b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -254,6 +254,8 @@ struct targ_fn_descriptor
 {
   CUfunction fn;
   const struct targ_fn_launch *launch;
+  int regs_per_thread;
+  int max_threads_per_block;
 };
 
 /* A loaded PTX image.  */
@@ -290,6 +292,9 @@ struct ptx_device
   bool mkern;
   int mode;
   int clock_khz;
+  int num_sms;
+  int regs_per_block;
+  int regs_per_sm;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -648,6 +653,36 @@ nvptx_open_device (int n)
 
   ptx_dev->clock_khz = pi;
 
+  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+  ptx_dev->num_sms = pi;
+
+  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
+			    dev);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+  ptx_dev->regs_per_block = pi;
+
+  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
+     in CUDA 6.0 and newer.  */
+  r = cuDeviceGetAttribute (&pi, 82, dev);
+  /* Fallback: use limit of registers per block, which is usually equal.  */
+  if (r == CUDA_ERROR_INVALID_VALUE)
+    pi = ptx_dev->regs_per_block;
+  else if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+  ptx_dev->regs_per_sm = pi;
+
+  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+  if (pi != 32)
+    GOMP_PLUGIN_fatal ("Only warp size 32 is supported");
+
   r = cuDeviceGetAttribute (&async_engines,
			     CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
@@ -1589,13 +1624,23 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
     {
       CUfunction function;
+      int nregs, mthrs;
 
       r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
       if (r != CUDA_SUCCESS)
	 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
+      r = cuFuncGetAttribute (&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, function);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuFuncGetAttribute error: %s", cuda_error (r));
+      r = cuFuncGetAttribute (&mthrs, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+			      function);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuFuncGetAttribute error: %s", cuda_error (r));
 
       targ_fns->fn = function;
       targ_fns->launch = &fn_descs[i];
+      targ_fns->regs_per_thread = nregs;
+      targ_fns->max_threads_per_block = mthrs;
 
       targ_tbl->start = (uintptr_t) targ_fns;
       targ_tbl->end = targ_tbl->start + 1;
@@ -1822,19 +1867,67 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
   return nvptx_set_cuda_stream (async, stream);
 }
 
+/* Adjust launch dimensions: pick good values for number of blocks and warps
+   and ensure that number of warps does not exceed CUDA limits as well as GCC's
+   own limits.  */
+
+static void
+nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
+			    struct ptx_device *ptx_dev,
+			    long *teams_p, long *threads_p)
+{
+  int max_warps_block = fn->max_threads_per_block / 32;
+  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
+     and libgcc, which matches documented limit of all GPUs as of 2015.  */
+  if (max_warps_block > 32)
+    max_warps_block = 32;
+  if (*threads_p <= 0)
+    *threads_p = 8;
+  if (*threads_p > max_warps_block)
+    *threads_p = max_warps_block;
+
+  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
+  /* This is an estimate of how many blocks the device can host simultaneously.
+     Actual limit, which may be lower, can be queried with "occupancy control"
+     driver interface (since CUDA 6.0).  */
+  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
+  if (*teams_p <= 0 || *teams_p > max_blocks)
+    *teams_p = max_blocks;
+}
+
 void
-GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars)
+GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
   CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
   CUresult r;
   struct ptx_device *ptx_dev = ptx_devices[ord];
   const char *maybe_abort_msg = "(perhaps abort was called)";
-  void *args = &tgt_vars;
+  void *fn_args = &tgt_vars;
+  long teams = 0, threads = 0;
+
+  if (!args)
+    GOMP_PLUGIN_fatal ("No target arguments provided");
+  while (*args)
+    {
+      long id = (long) *args++, val;
+      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+	val = (long) *args++;
+      else
+	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
+	continue;
+      id &= GOMP_TARGET_ARG_ID_MASK;
+      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
+	teams = val;
+      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
+	threads = val;
+    }
+  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   r = cuLaunchKernel (function,
-		      1, 1, 1,
-		      32, 8, 1,
-		      0, ptx_dev->null_stream->stream, &args, 0);
+		      teams, 1, 1,
+		      32, threads, 1,
+		      0, ptx_dev->null_stream->stream, &fn_args, 0);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
@@ -1847,7 +1940,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars)
 }
 
 void
-GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void *async_data)
+GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
+			void *async_data)
 {
   GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
 }
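
For completeness, the args vector that the new GOMP_OFFLOAD_run decodes is a
NULL-terminated array of pointer-sized slots: each slot packs a device
selector in the low bits, an argument identifier above it, and usually the
value itself above GOMP_TARGET_ARG_VALUE_SHIFT, with
GOMP_TARGET_ARG_SUBSEQUENT_PARAM diverting large values to the following
slot.  The sketch below decodes such a vector the same way the patch does;
the concrete constant values are my reading of gomp-constants.h and should be
treated as assumptions, not authoritative.

/* Sketch of the target-args encoding consumed by GOMP_OFFLOAD_run.  The
   ARG_* values below are assumed, mirroring gomp-constants.h.  */
#include <stddef.h>
#include <stdio.h>

#define ARG_DEVICE_ALL        0
#define ARG_DEVICE_MASK       ((1 << 7) - 1)
#define ARG_SUBSEQUENT_PARAM  (1 << 7)
#define ARG_NUM_TEAMS         (1 << 8)
#define ARG_THREAD_LIMIT      (2 << 8)
#define ARG_ID_MASK           (((1 << 8) - 1) << 8)
#define ARG_VALUE_SHIFT       16

int
main (void)
{
  /* Request 4 teams and a thread limit of 2, both encoded inline.  */
  void *args[] = {
    (void *) (long) (ARG_DEVICE_ALL | ARG_NUM_TEAMS
		     | (4L << ARG_VALUE_SHIFT)),
    (void *) (long) (ARG_DEVICE_ALL | ARG_THREAD_LIMIT
		     | (2L << ARG_VALUE_SHIFT)),
    NULL
  };
  long teams = 0, threads = 0;
  for (void **p = args; *p; )
    {
      long id = (long) *p++, val;
      if (id & ARG_SUBSEQUENT_PARAM)
	val = (long) *p++;	/* Large values travel in the next slot.  */
      else
	val = id >> ARG_VALUE_SHIFT;
      if ((id & ARG_DEVICE_MASK) != ARG_DEVICE_ALL)
	continue;		/* Skip device-specific arguments.  */
      id &= ARG_ID_MASK;
      if (id == ARG_NUM_TEAMS)
	teams = val;
      else if (id == ARG_THREAD_LIMIT)
	threads = val;
    }
  printf ("teams=%ld thread_limit=%ld\n", teams, threads);	/* 4, 2 */
  return 0;
}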