These are the nvptx plugin-specific parts.

Chung-Lin
	* plugin/plugin-nvptx.c (CUDA_CALL_ERET): New convenience macro.
	(CUDA_CALL): Likewise.
	(CUDA_CALL_ASSERT): Likewise.
	(map_init): Change return type to bool, use CUDA_CALL* macros.
	(map_fini): Likewise.
	(init_streams_for_device): Change return type to bool, adjust call
	to map_init.
	(fini_streams_for_device): Change return type to bool, adjust call
	to map_fini.
	(select_stream_for_async): Release stream_lock before calls to
	GOMP_PLUGIN_fatal, adjust call to map_init.
	(nvptx_init): Use CUDA_CALL* macros.
	(nvptx_attach_host_thread_to_device): Change return type to bool,
	use CUDA_CALL* macros.
	(nvptx_open_device): Use CUDA_CALL* macros.
	(nvptx_close_device): Change return type to bool, use CUDA_CALL*
	macros.
	(nvptx_get_num_devices): Use CUDA_CALL* macros.
	(link_ptx): Change return type to bool, use CUDA_CALL* macros.
	(nvptx_exec): Use CUDA_CALL* macros.
	(nvptx_alloc): Change return type to bool, use CUDA_CALL* macros,
	change to use out parameter to return allocated pointer.
	(nvptx_free): Change return type to bool, use CUDA_CALL* macros.
	(nvptx_host2dev): Likewise.
	(nvptx_dev2host): Likewise.
	(nvptx_wait): Use CUDA_CALL* macros.
	(nvptx_wait_async): Likewise.
	(nvptx_wait_all): Likewise.
	(nvptx_wait_all_async): Likewise.
	(nvptx_set_cuda_stream): Adjust order of stream_lock acquire, use
	CUDA_CALL* macros, adjust call to map_fini.
	(GOMP_OFFLOAD_init_device): Change return type to bool, adjust
	code accordingly.
	(GOMP_OFFLOAD_fini_device): Likewise.
	(GOMP_OFFLOAD_load_image): Adjust calls to
	nvptx_attach_host_thread_to_device/link_ptx to handle errors, use
	CUDA_CALL* macros.
	(GOMP_OFFLOAD_alloc): Change return type to bool, adjust code to
	handle error return.
	(GOMP_OFFLOAD_free): Likewise.
	(GOMP_OFFLOAD_dev2host): Likewise.
	(GOMP_OFFLOAD_host2dev): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Use CUDA_CALL*
	macros.
	(GOMP_OFFLOAD_openacc_create_thread_data): Likewise.
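For reference, here is the error-handling pattern the new CUDA_CALL* macros
implement, pulled out as a minimal standalone sketch.  The *_stub helpers and
cuFakeAlloc are invented stand-ins for GOMP_PLUGIN_error, cuda_error and a
CUDA driver call, so this compiles and runs without CUDA; the real macro
bodies are in the patch below.

    #include <stdbool.h>
    #include <stdio.h>

    typedef int CUresult;
    #define CUDA_SUCCESS 0

    /* Stand-in for cuda_error: map a CUresult to a message.  */
    static const char *
    cuda_error_stub (CUresult r)
    {
      return r == 1 ? "invalid value" : "unknown error";
    }

    /* Stand-in for GOMP_PLUGIN_error: report but do not terminate.  */
    static void
    plugin_error_stub (const char *fmt, const char *msg)
    {
      fprintf (stderr, fmt, msg);
      fputc ('\n', stderr);
    }

    /* Call FN; on failure, report the failing call by name (#FN) and
       return ERET from the *enclosing* function.  */
    #define CUDA_CALL_ERET(ERET, FN, ...)		\
      do {						\
        CUresult __r = FN (__VA_ARGS__);		\
        if (__r != CUDA_SUCCESS)			\
          {						\
            plugin_error_stub (#FN " error: %s",	\
                               cuda_error_stub (__r));	\
            return ERET;				\
          }						\
      } while (0)

    /* Most plugin functions now return bool, so ERET defaults to false.  */
    #define CUDA_CALL(FN, ...) CUDA_CALL_ERET (false, FN, __VA_ARGS__)

    /* A fake driver call that always fails, to exercise the macro.  */
    static CUresult
    cuFakeAlloc (void **p, size_t s)
    {
      (void) p; (void) s;
      return 1;
    }

    static bool
    alloc_wrapper (void **p, size_t s)
    {
      CUDA_CALL (cuFakeAlloc, p, s);	/* returns false on failure */
      return true;
    }

    int
    main (void)
    {
      void *p;
      printf ("alloc_wrapper: %s\n",
              alloc_wrapper (&p, 16) ? "ok" : "failed");
      return 0;
    }

The point of the do { ... } while (0) wrapper is that the early return
happens in the calling function, which is what lets the patch collapse each
call/check/report triple into a single line.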
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c	(revision 227257)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -127,6 +127,34 @@ cuda_error (CUresult r)
   return errmsg;
 }
 
+/* Convenience macros for the frequently used CUDA library call and
+   error handling sequence.  This does not capture all the cases we
+   use in this file, but is common enough.  */
+
+#define CUDA_CALL_ERET(ERET, FN, ...)			\
+  do {							\
+    unsigned __r = FN (__VA_ARGS__);			\
+    if (__r != CUDA_SUCCESS)				\
+      {							\
+	GOMP_PLUGIN_error (#FN " error: %s",		\
+			   cuda_error (__r));		\
+	return ERET;					\
+      }							\
+  } while (0)
+
+#define CUDA_CALL(FN, ...)				\
+  CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
+
+#define CUDA_CALL_ASSERT(FN, ...)			\
+  do {							\
+    unsigned __r = FN (__VA_ARGS__);			\
+    if (__r != CUDA_SUCCESS)				\
+      {							\
+	GOMP_PLUGIN_fatal (#FN " error: %s",		\
+			   cuda_error (__r));		\
+      }							\
+  } while (0)
+
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -162,25 +190,18 @@ struct map
   char mappings[0];
 };
 
-static void
+static bool
 map_init (struct ptx_stream *s)
 {
-  CUresult r;
-
   int size = getpagesize ();
 
   assert (s);
   assert (!s->d);
   assert (!s->h);
 
-  r = cuMemAllocHost (&s->h, size);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
+  CUDA_CALL (cuMemAllocHost, &s->h, size);
+  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
-
   assert (s->h);
 
   s->h_begin = s->h;
@@ -189,16 +210,14 @@ map_init (struct ptx_stream *s)
 
   assert (s->h_next);
   assert (s->h_end);
+  return true;
 }
 
-static void
+static bool
 map_fini (struct ptx_stream *s)
 {
-  CUresult r;
-
-  r = cuMemFreeHost (s->h);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
+  CUDA_CALL (cuMemFreeHost, s->h);
+  return true;
 }
 
 static void
@@ -359,7 +378,7 @@ nvptx_thread (void)
   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 }
 
-static void
+static bool
 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 {
   int i;
@@ -371,9 +390,10 @@ init_streams_for_device (struct ptx_de
   null_stream->multithreaded = true;
   null_stream->d = (CUdeviceptr) NULL;
   null_stream->h = NULL;
-  map_init (null_stream);
-  ptx_dev->null_stream = null_stream;
+  if (!map_init (null_stream))
+    return false;
 
+  ptx_dev->null_stream = null_stream;
   ptx_dev->active_streams = NULL;
   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
 
@@ -389,25 +409,35 @@ init_streams_for_device (struct ptx_de
 
   for (i = 0; i < concurrency; i++)
     ptx_dev->async_streams.arr[i] = NULL;
+
+  return true;
 }
 
-static void
+static bool
 fini_streams_for_device (struct ptx_device *ptx_dev)
 {
   free (ptx_dev->async_streams.arr);
 
+  bool ret = true;
   while (ptx_dev->active_streams != NULL)
     {
       struct ptx_stream *s = ptx_dev->active_streams;
       ptx_dev->active_streams = ptx_dev->active_streams->next;
 
-      map_fini (s);
-      cuStreamDestroy (s->stream);
+      ret &= map_fini (s);
+
+      CUresult r = cuStreamDestroy (s->stream);
+      if (r != CUDA_SUCCESS)
+	{
+	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
+	  ret = false;
+	}
       free (s);
     }
 
-  map_fini (ptx_dev->null_stream);
+  ret &= map_fini (ptx_dev->null_stream);
   free (ptx_dev->null_stream);
+  return ret;
 }
 
 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
@@ -481,7 +511,11 @@ select_stream_for_async (int async, pthread_t thre
 	{
 	  r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
 	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
+	    {
+	      pthread_mutex_unlock (&ptx_dev->stream_lock);
+	      GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
+				 cuda_error (r));
+	    }
 	}
 
       /* If CREATE is true, we're going to be queueing some work on this
@@ -491,7 +525,11 @@ select_stream_for_async (int async, pthread_t thre
 	  s->d = (CUdeviceptr) NULL;
 	  s->h = NULL;
-	  map_init (s);
+	  if (!map_init (s))
+	    {
+	      pthread_mutex_unlock (&ptx_dev->stream_lock);
+	      GOMP_PLUGIN_fatal ("map_init fail");
+	    }
 
 	  s->next = ptx_dev->active_streams;
 	  ptx_dev->active_streams = s;
@@ -501,7 +539,11 @@ select_stream_for_async (int async, pthread_t thre
       stream = ptx_dev->async_streams.arr[async];
     }
   else if (async < 0)
-    GOMP_PLUGIN_fatal ("bad async %d", async);
+    {
+      if (create)
+	pthread_mutex_unlock (&ptx_dev->stream_lock);
+      GOMP_PLUGIN_fatal ("bad async %d", async);
+    }
 
   if (create)
     {
@@ -532,34 +574,25 @@ select_stream_for_async (int async, pthread_t thre
 static bool
 nvptx_init (void)
 {
-  CUresult r;
   int ndevs;
 
   if (instantiated_devices != 0)
     return true;
 
-  r = cuInit (0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
-
+  CUDA_CALL (cuInit, 0);
   ptx_events = NULL;
-
   pthread_mutex_init (&ptx_event_lock, NULL);
-
-  r = cuDeviceGetCount (&ndevs);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
-
+  CUDA_CALL (cuDeviceGetCount, &ndevs);
   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 					    * ndevs);
-
   return true;
 }
 
 /* Select the N'th PTX device for the current host thread.  The device must
    have been previously opened before calling this function.  */
 
-static void
+static bool
 nvptx_attach_host_thread_to_device (int n)
 {
   CUdevice dev;
@@ -569,34 +602,34 @@ nvptx_attach_host_thread_to_device (int n)
 
   r = cuCtxGetDevice (&dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
-    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+    {
+      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
+      return false;
+    }
 
   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
-    return;
+    return true;
   else
     {
       CUcontext old_ctx;
 
       ptx_dev = ptx_devices[n];
-      assert (ptx_dev);
+      if (!ptx_dev)
+	{
+	  GOMP_PLUGIN_error ("device %d not found", n);
+	  return false;
+	}
 
-      r = cuCtxGetCurrent (&thd_ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 
       /* We don't necessarily have a current context (e.g. if it has been
          destroyed.  Pop it if we do though.  */
       if (thd_ctx != NULL)
-	{
-	  r = cuCtxPopCurrent (&old_ctx);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
-	}
+	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 
-      r = cuCtxPushCurrent (ptx_dev->ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
     }
+
+  return true;
 }
 
 static struct ptx_device *
@@ -607,9 +640,7 @@ nvptx_open_device (int n)
   CUresult r;
   int async_engines, pi;
 
-  r = cuDeviceGet (&dev, n);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
+  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 
   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 
@@ -619,60 +650,44 @@ nvptx_open_device (int n)
 
   r = cuCtxGetDevice (&ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
-    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+    {
+      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
+      return NULL;
+    }
 
   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
     {
       /* The current host thread has an active context for a different device.
          Detach it.  */
       CUcontext old_ctx;
-
-      r = cuCtxPopCurrent (&old_ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
     }
 
-  r = cuCtxGetCurrent (&ptx_dev->ctx);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 
   if (!ptx_dev->ctx)
-    {
-      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
-    }
+    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
   else
     ptx_dev->ctx_shared = true;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
   ptx_dev->overlap = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
   ptx_dev->map = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
   ptx_dev->concur = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
   ptx_dev->mode = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
   ptx_dev->mkern = pi;
 
   r = cuDeviceGetAttribute (&async_engines,
@@ -683,38 +698,34 @@ nvptx_open_device (int n)
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
-  init_streams_for_device (ptx_dev, async_engines);
+  if (!init_streams_for_device (ptx_dev, async_engines))
+    return NULL;
 
   return ptx_dev;
 }
 
-static void
+static bool
 nvptx_close_device (struct ptx_device *ptx_dev)
 {
-  CUresult r;
-
   if (!ptx_dev)
-    return;
+    return true;
 
-  fini_streams_for_device (ptx_dev);
+  if (!fini_streams_for_device (ptx_dev))
+    return false;
 
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
   if (!ptx_dev->ctx_shared)
-    {
-      r = cuCtxDestroy (ptx_dev->ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
-    }
+    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
   free (ptx_dev);
+  return true;
 }
 
 static int
 nvptx_get_num_devices (void)
 {
   int n;
-  CUresult r;
 
   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
      configurations.  */
@@ -727,22 +738,19 @@ nvptx_get_num_devices (void)
      further initialization).  */
   if (instantiated_devices == 0)
     {
-      r = cuInit (0);
+      CUresult r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
     }
 
-  r = cuDeviceGetCount (&n);
-  if (r!= CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
   return n;
 }
 
-static void
+static bool
 link_ptx (CUmodule *module, const char *ptx_code)
 {
   CUjit_option opts[7];
@@ -780,9 +788,7 @@ link_ptx (CUmodule *module, const char *ptx_code)
   opts[6] = CU_JIT_TARGET;
   optvals[6] = (void *) CU_TARGET_COMPUTE_30;
 
-  r = cuLinkCreate (7, opts, optvals, &linkstate);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
+  CUDA_CALL (cuLinkCreate, 7, opts, optvals, &linkstate);
 
   char *abort_ptx = ABORT_PTX;
   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
@@ -790,7 +796,8 @@ link_ptx (CUmodule *module, const char *ptx_code)
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
+      GOMP_PLUGIN_error ("cuLinkAddData (abort) error: %s", cuda_error (r));
+      return false;
     }
 
   char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
@@ -799,8 +806,9 @@ link_ptx (CUmodule *module, const char *ptx_code)
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+      GOMP_PLUGIN_error ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
+      return false;
     }
 
   char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
@@ -809,29 +817,28 @@ link_ptx (CUmodule *module, const char *ptx_code)
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+      GOMP_PLUGIN_error ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
+      return false;
     }
 
   /* cuLinkAddData's 'data' argument erroneously omits the const
      qualifier.  */
   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
-                     strlen (ptx_code) + 1, 0, 0, 0, 0);
+		     strlen (ptx_code) + 1, 0, 0, 0, 0);
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+      GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+      return false;
     }
 
-  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
+  CUDA_CALL (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 
   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 
-  r = cuModuleLoadData (module, linkout);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
+  CUDA_CALL (cuModuleLoadData, module, linkout);
+
+  return true;
 }
 
 static void
@@ -961,11 +968,11 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
-  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+		    mapnum * sizeof (void *));
 
-  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
+		     targ_fn->name);
 
   // OpenACC		CUDA
   //
@@ -991,12 +998,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
       nthreads_in_block = vector_length;
 
   kargs[0] = &dp;
-  r = cuLaunchKernel (function,
-		      num_gangs, 1, 1,
-		      nthreads_in_block, 1, 1,
-		      0, dev_str->stream, kargs, 0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuLaunchKernel, function,
+		    num_gangs, 1, 1,
+		    nthreads_in_block, 1, 1,
+		    0, dev_str->stream, kargs, 0);
 
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
@@ -1023,9 +1028,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
 
	  event_gc (true);
 
-	  r = cuEventRecord (*e, dev_str->stream);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
 
	  event_add (PTX_EVT_KNL, e, (void *)dev_str);
	}
@@ -1049,167 +1052,157 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
 void *
 openacc_get_current_cuda_context (void);
 
-static void *
-nvptx_alloc (size_t s)
+static bool
+nvptx_alloc (size_t s, void **ptr)
 {
   CUdeviceptr d;
   CUresult r;
 
   r = cuMemAlloc (&d, s);
   if (r == CUDA_ERROR_OUT_OF_MEMORY)
-    return 0;
+    {
+      *ptr = NULL;
+      return true;
+    }
+
   if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
-  return (void *)d;
+    {
+      GOMP_PLUGIN_error ("cuMemAlloc error: %s", cuda_error (r));
+      return false;
+    }
+
+  *ptr = (void *)d;
+  return true;
 }
 
-static void
+static bool
 nvptx_free (void *p)
 {
-  CUresult r;
   CUdeviceptr pb;
   size_t ps;
 
-  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
+  if ((CUdeviceptr) p != pb)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
 
-  if ((CUdeviceptr)p != pb)
-    GOMP_PLUGIN_fatal ("invalid device address");
-
-  r = cuMemFree ((CUdeviceptr)p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
+  return true;
 }
 
-static void *
+
+static bool
 nvptx_host2dev (void *d, const void *h, size_t s)
 {
-  CUresult r;
   CUdeviceptr pb;
   size_t ps;
   struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
-    return 0;
-
+    return true;
   if (!d)
-    GOMP_PLUGIN_fatal ("invalid device address");
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
 
-  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
 
   if (!pb)
-    GOMP_PLUGIN_fatal ("invalid device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
   if (!h)
-    GOMP_PLUGIN_fatal ("invalid host address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host address");
+      return false;
+    }
   if (d == h)
-    GOMP_PLUGIN_fatal ("invalid host or device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host or device address");
+      return false;
+    }
   if ((void *)(d + s) > (void *)(pb + ps))
-    GOMP_PLUGIN_fatal ("invalid size");
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
 
 #ifndef DISABLE_ASYNC
   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
-      CUevent *e;
-
-      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
+      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
       event_gc (false);
-
-      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
-			     nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
-
-      r = cuEventRecord (*e, nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+      CUDA_CALL (cuMemcpyHtoDAsync,
		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
+      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
       event_add (PTX_EVT_MEM, e, (void *)h);
     }
   else
 #endif
-    {
-      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
-    }
+    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
 
-  return 0;
+  return true;
 }
 
-static void *
+static bool
 nvptx_dev2host (void *h, const void *d, size_t s)
 {
-  CUresult r;
   CUdeviceptr pb;
   size_t ps;
   struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
-    return 0;
-
+    return true;
   if (!d)
-    GOMP_PLUGIN_fatal ("invalid device address");
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
 
-  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
 
   if (!pb)
-    GOMP_PLUGIN_fatal ("invalid device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
   if (!h)
-    GOMP_PLUGIN_fatal ("invalid host address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host address");
+      return false;
+    }
   if (d == h)
-    GOMP_PLUGIN_fatal ("invalid host or device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host or device address");
+      return false;
+    }
   if ((void *)(d + s) > (void *)(pb + ps))
-    GOMP_PLUGIN_fatal ("invalid size");
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
 
 #ifndef DISABLE_ASYNC
   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
-      CUevent *e;
-
-      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
-
+      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
       event_gc (false);
-
-      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
-			     nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
-
-      r = cuEventRecord (*e, nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+      CUDA_CALL (cuMemcpyDtoHAsync,
		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
+      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
       event_add (PTX_EVT_MEM, e, (void *)h);
     }
   else
 #endif
-    {
-      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
-    }
+    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
 
-  return 0;
+  return true;
 }
 
 static void
@@ -1279,17 +1272,13 @@ nvptx_async_test_all (void)
 static void
 nvptx_wait (int async)
 {
-  CUresult r;
   struct ptx_stream *s;
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
-
   if (!s)
     GOMP_PLUGIN_fatal ("unknown async %d", async);
 
-  r = cuStreamSynchronize (s->stream);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
 
   event_gc (true);
 }
@@ -1297,7 +1286,6 @@ nvptx_wait (int async)
 static void
 nvptx_wait_async (int async1, int async2)
 {
-  CUresult r;
   CUevent *e;
   struct ptx_stream *s1, *s2;
   pthread_t self = pthread_self ();
@@ -1313,23 +1301,17 @@ nvptx_wait_async (int async1, int async2)
   if (s1 == s2)
     GOMP_PLUGIN_fatal ("identical parameters");
 
-  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
 
   event_gc (true);
 
-  r = cuEventRecord (*e, s1->stream);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
 
   event_add (PTX_EVT_SYNC, e, NULL);
 
-  r = cuStreamWaitEvent (s2->stream, *e, 0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
 }
 
 static void
@@ -1354,9 +1336,7 @@ nvptx_wait_all (void)
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
 
-	  r = cuStreamSynchronize (s->stream);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
	}
     }
 
@@ -1368,7 +1348,6 @@ nvptx_wait_all (void)
 static void
 nvptx_wait_all_async (int async)
 {
-  CUresult r;
   struct ptx_stream *waiting_stream, *other_stream;
   CUevent *e;
   struct nvptx_thread *nvthd = nvptx_thread ();
@@ -1398,20 +1377,14 @@ nvptx_wait_all_async (int async)
 
	  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-	  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
 
	  /* Record an event on the waited-for stream.  */
-	  r = cuEventRecord (*e, other_stream->stream);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
 
	  event_add (PTX_EVT_SYNC, e, NULL);
 
-	  r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
	}
 
   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
@@ -1460,11 +1433,11 @@ nvptx_set_cuda_stream (int async, void *stream)
   pthread_t self = pthread_self ();
   struct nvptx_thread *nvthd = nvptx_thread ();
 
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
   if (async < 0)
     GOMP_PLUGIN_fatal ("bad async %d", async);
 
+  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
   /* We have a list of active streams and an array mapping async values to
      entries of that list.  We need to take "ownership" of the passed-in stream,
      and add it to our list, removing the previous entry also (if there was one)
@@ -1487,8 +1460,11 @@ nvptx_set_cuda_stream (int async, void *stream)
	      s->next = s->next->next;
	}
 
-      cuStreamDestroy (oldstream->stream);
-      map_fini (oldstream);
+      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
+
+      if (!map_fini (oldstream))
+	GOMP_PLUGIN_fatal ("error when freeing host memory");
+
       free (oldstream);
     }
 
@@ -1525,37 +1501,50 @@ GOMP_OFFLOAD_get_num_devices (void)
   return nvptx_get_num_devices ();
 }
 
-void
+bool
 GOMP_OFFLOAD_init_device (int n)
 {
+  struct ptx_device *dev;
+
   pthread_mutex_lock (&ptx_dev_lock);
 
   if (!nvptx_init () || ptx_devices[n] != NULL)
     {
       pthread_mutex_unlock (&ptx_dev_lock);
-      return;
+      return false;
     }
 
-  ptx_devices[n] = nvptx_open_device (n);
-  instantiated_devices++;
+  dev = nvptx_open_device (n);
+  if (dev)
+    {
+      ptx_devices[n] = dev;
+      instantiated_devices++;
    }
 
   pthread_mutex_unlock (&ptx_dev_lock);
+
+  return dev != NULL;
 }
 
-void
+bool
 GOMP_OFFLOAD_fini_device (int n)
 {
   pthread_mutex_lock (&ptx_dev_lock);
 
   if (ptx_devices[n] != NULL)
     {
-      nvptx_attach_host_thread_to_device (n);
-      nvptx_close_device (ptx_devices[n]);
+      if (!nvptx_attach_host_thread_to_device (n)
+	  || !nvptx_close_device (ptx_devices[n]))
+	{
+	  pthread_mutex_unlock (&ptx_dev_lock);
+	  return false;
+	}
       ptx_devices[n] = NULL;
       instantiated_devices--;
     }
 
   pthread_mutex_unlock (&ptx_dev_lock);
+  return true;
 }
 
 /* Data emitted by mkoffload.  */
@@ -1590,7 +1579,6 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
   CUmodule module;
   const char *const *fn_names, *const *var_names;
   unsigned int fn_entries, var_entries, i, j;
-  CUresult r;
   struct targ_fn_descriptor *targ_fns;
   struct addr_pair *targ_tbl;
   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
@@ -1598,18 +1586,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
   struct ptx_device *dev;
 
   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
-    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
-		       " (expected %u, received %u)",
-		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
-
-  GOMP_OFFLOAD_init_device (ord);
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
+      return -1;
+    }
 
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !link_ptx (&module, img_header->ptx_src))
+    return -1;
+
   dev = ptx_devices[ord];
-
-  nvptx_attach_host_thread_to_device (ord);
-  link_ptx (&module, img_header->ptx_src);
-
   /* The mkoffload utility emits a struct of pointers/integers at the
      start of each offload image.  The array of kernel names and the
      functions addresses form a one-to-one correspondence.  */
@@ -1639,11 +1628,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
     {
       CUfunction function;
+      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, fn_names[i]);
 
-      r = cuModuleGetFunction (&function, module, fn_names[i]);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
-
       targ_fns->fn = function;
       targ_fns->name = (const char *) fn_names[i];
 
@@ -1656,9 +1642,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
       CUdeviceptr var;
       size_t bytes;
 
-      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
+		      &var, &bytes, module, var_names[j]);
 
       targ_tbl->start = (uintptr_t) var;
       targ_tbl->end = targ_tbl->start + bytes;
@@ -1692,32 +1677,32 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned versi
   pthread_mutex_unlock (&dev->image_lock);
 }
 
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+bool
+GOMP_OFFLOAD_alloc (int ord, size_t size, void **ptr)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  return nvptx_alloc (size);
+  return (nvptx_attach_host_thread_to_device (ord)
+	  && nvptx_alloc (size, ptr));
 }
 
-void
+bool
 GOMP_OFFLOAD_free (int ord, void *ptr)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  nvptx_free (ptr);
+  return (nvptx_attach_host_thread_to_device (ord)
+	  && nvptx_free (ptr));
 }
 
-void *
+bool
 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  return nvptx_dev2host (dst, src, n);
+  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
 }
 
-void *
+bool
 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  return nvptx_host2dev (dst, src, n);
+  return (nvptx_attach_host_thread_to_device (ord)
+	  && nvptx_host2dev (dst, src, n));
 }
 
 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
@@ -1736,20 +1721,11 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *)
 void
 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
 {
-  CUevent *e;
-  CUresult r;
   struct nvptx_thread *nvthd = nvptx_thread ();
+  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
-  r = cuEventRecord (*e, nvthd->current_stream->stream);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
+  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
 }
 
@@ -1801,25 +1777,18 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord)
   struct ptx_device *ptx_dev;
   struct nvptx_thread *nvthd
     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
-  CUresult r;
   CUcontext thd_ctx;
 
   ptx_dev = ptx_devices[ord];
 
   assert (ptx_dev);
 
-  r = cuCtxGetCurrent (&thd_ctx);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
 
   assert (ptx_dev->ctx);
 
   if (!thd_ctx)
-    {
-      r = cuCtxPushCurrent (ptx_dev->ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
-    }
+    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
 
   nvthd->current_stream = ptx_dev->null_stream;
   nvthd->ptx_dev = ptx_dev;
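A note on the one place where a plain bool cannot carry the whole result:
nvptx_alloc now reports CUDA_ERROR_OUT_OF_MEMORY as *success* with the out
parameter set to NULL, and reserves a false return for real driver errors.
Here is a minimal sketch of that convention; alloc_like_nvptx is a
hypothetical stand-in for the patched function, with malloc standing in for
cuMemAlloc, so it runs without CUDA.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    static bool
    alloc_like_nvptx (size_t s, void **ptr, bool simulate_oom)
    {
      if (simulate_oom)
        {
          *ptr = NULL;	/* CUDA_ERROR_OUT_OF_MEMORY: success, NULL result */
          return true;
        }
      *ptr = malloc (s);	/* stands in for cuMemAlloc */
      return *ptr != NULL;	/* any other failure: false */
    }

    int
    main (void)
    {
      void *p;
      if (!alloc_like_nvptx (32, &p, false))
        return 1;			/* hard error: propagate failure */
      if (p == NULL)
        puts ("out of device memory");	/* soft failure: caller decides */
      else
        puts ("allocated");
      free (p);
      return 0;
    }

This split lets the caller treat exhaustion as a recoverable condition while
still turning genuine driver failures into an error return, matching how the
patched GOMP_OFFLOAD_alloc chains nvptx_attach_host_thread_to_device and
nvptx_alloc with &&.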