These are the nvptx plugin-specific parts.

Chung-Lin
	* plugin/plugin-nvptx.c (CUDA_CALL_ERET): New convenience macro.
	(CUDA_CALL): Likewise.
	(CUDA_CALL_ASSERT): Likewise.
	(map_init): Change return type to bool, use CUDA_CALL* macros.
	(map_fini): Likewise.
	(init_streams_for_device): Change return type to bool, adjust call
	to map_init.
	(fini_streams_for_device): Change return type to bool, adjust call
	to map_fini.
	(select_stream_for_async): Release stream_lock before calls to
	GOMP_PLUGIN_fatal, adjust call to map_init.
	(nvptx_init): Use CUDA_CALL* macros.
	(nvptx_attach_host_thread_to_device): Change return type to bool,
	use CUDA_CALL* macros.
	(nvptx_open_device): Use CUDA_CALL* macros.
	(nvptx_close_device): Change return type to bool, use CUDA_CALL*
	macros.
	(nvptx_get_num_devices): Use CUDA_CALL* macros.
	(link_ptx): Change return type to bool, use CUDA_CALL* macros.
	(nvptx_exec): Use CUDA_CALL* macros.
	(nvptx_alloc): Change return type to bool, use CUDA_CALL* macros,
	change to use out parameter to return allocated pointer.
	(nvptx_free): Change return type to bool, use CUDA_CALL* macros.
	(nvptx_host2dev): Likewise.
	(nvptx_dev2host): Likewise.
	(nvptx_wait): Use CUDA_CALL* macros.
	(nvptx_wait_async): Likewise.
	(nvptx_wait_all): Likewise.
	(nvptx_wait_all_async): Likewise.
	(nvptx_set_cuda_stream): Adjust order of stream_lock acquire, use
	CUDA_CALL* macros, adjust call to map_fini.
	(GOMP_OFFLOAD_init_device): Change return type to bool, adjust
	code accordingly.
	(GOMP_OFFLOAD_fini_device): Likewise.
	(GOMP_OFFLOAD_load_image): Adjust calls to
	nvptx_attach_host_thread_to_device/link_ptx to handle errors, use
	CUDA_CALL* macros.
	(GOMP_OFFLOAD_alloc): Change return type to bool, adjust code to
	handle error return.
	(GOMP_OFFLOAD_free): Likewise.
	(GOMP_OFFLOAD_dev2host): Likewise.
	(GOMP_OFFLOAD_host2dev): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Use CUDA_CALL*
	macros.
	(GOMP_OFFLOAD_openacc_create_thread_data): Likewise.
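For reference, here is the error-handling pattern the new CUDA_CALL* macros
implement, pulled out as a minimal standalone sketch.  The *_stub helpers and
cuFakeAlloc are invented stand-ins for GOMP_PLUGIN_error, cuda_error and a
CUDA driver call, so this compiles and runs without CUDA; the real macro
bodies are in the patch below.

    #include <stdbool.h>
    #include <stdio.h>

    typedef int CUresult;
    #define CUDA_SUCCESS 0

    /* Stand-in for cuda_error: map a CUresult to a message.  */
    static const char *
    cuda_error_stub (CUresult r)
    {
      return r == 1 ? "invalid value" : "unknown error";
    }

    /* Stand-in for GOMP_PLUGIN_error: report but do not terminate.  */
    static void
    plugin_error_stub (const char *fmt, const char *msg)
    {
      fprintf (stderr, fmt, msg);
      fputc ('\n', stderr);
    }

    /* Call FN; on failure, report the failing call by name (#FN) and
       return ERET from the *enclosing* function.  */
    #define CUDA_CALL_ERET(ERET, FN, ...)		\
      do {						\
        CUresult __r = FN (__VA_ARGS__);		\
        if (__r != CUDA_SUCCESS)			\
          {						\
            plugin_error_stub (#FN " error: %s",	\
                               cuda_error_stub (__r));	\
            return ERET;				\
          }						\
      } while (0)

    /* Most plugin functions now return bool, so ERET defaults to false.  */
    #define CUDA_CALL(FN, ...) CUDA_CALL_ERET (false, FN, __VA_ARGS__)

    /* A fake driver call that always fails, to exercise the macro.  */
    static CUresult
    cuFakeAlloc (void **p, size_t s)
    {
      (void) p; (void) s;
      return 1;
    }

    static bool
    alloc_wrapper (void **p, size_t s)
    {
      CUDA_CALL (cuFakeAlloc, p, s);	/* returns false on failure */
      return true;
    }

    int
    main (void)
    {
      void *p;
      printf ("alloc_wrapper: %s\n",
              alloc_wrapper (&p, 16) ? "ok" : "failed");
      return 0;
    }

The point of the do { ... } while (0) wrapper is that the early return
happens in the calling function, which is what lets the patch collapse each
call/check/report triple into a single line.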
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c	(revision 227257)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -127,6 +127,34 @@ cuda_error (CUresult r)
   return errmsg;
 }
 
+/* Convenience macros for the frequently used CUDA library call and
+   error handling sequence.  This does not capture all the cases we
+   use in this file, but is common enough.  */
+
+#define CUDA_CALL_ERET(ERET, FN, ...)			\
+  do {							\
+    unsigned __r = FN (__VA_ARGS__);			\
+    if (__r != CUDA_SUCCESS)				\
+      {							\
+	GOMP_PLUGIN_error (#FN " error: %s",		\
+			   cuda_error (__r));		\
+	return ERET;					\
+      }							\
+  } while (0)
+
+#define CUDA_CALL(FN, ...)				\
+  CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
+
+#define CUDA_CALL_ASSERT(FN, ...)			\
+  do {							\
+    unsigned __r = FN (__VA_ARGS__);			\
+    if (__r != CUDA_SUCCESS)				\
+      {							\
+	GOMP_PLUGIN_fatal (#FN " error: %s",		\
+			   cuda_error (__r));		\
+      }							\
+  } while (0)
+
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -162,25 +190,18 @@ struct map
   char mappings[0];
 };
 
-static void
+static bool
 map_init (struct ptx_stream *s)
 {
-  CUresult r;
-
   int size = getpagesize ();
 
   assert (s);
   assert (!s->d);
   assert (!s->h);
 
-  r = cuMemAllocHost (&s->h, size);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
+  CUDA_CALL (cuMemAllocHost, &s->h, size);
+  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
-
   assert (s->h);
 
   s->h_begin = s->h;
@@ -189,16 +210,14 @@ map_init (struct ptx_stream *s)
 
   assert (s->h_next);
   assert (s->h_end);
+  return true;
 }
 
-static void
+static bool
 map_fini (struct ptx_stream *s)
 {
-  CUresult r;
-
-  r = cuMemFreeHost (s->h);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
+  CUDA_CALL (cuMemFreeHost, s->h);
+  return true;
 }
 
 static void
@@ -359,7 +378,7 @@ nvptx_thread (void)
   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 }
 
-static void
+static bool
 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 {
   int i;
@@ -371,9 +390,10 @@ init_streams_for_device (struct ptx_de
   null_stream->multithreaded = true;
   null_stream->d = (CUdeviceptr) NULL;
   null_stream->h = NULL;
-  map_init (null_stream);
-  ptx_dev->null_stream = null_stream;
+  if (!map_init (null_stream))
+    return false;
 
+  ptx_dev->null_stream = null_stream;
   ptx_dev->active_streams = NULL;
   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
 
@@ -389,25 +409,35 @@ init_streams_for_device (struct ptx_de
 
   for (i = 0; i < concurrency; i++)
     ptx_dev->async_streams.arr[i] = NULL;
+
+  return true;
 }
 
-static void
+static bool
 fini_streams_for_device (struct ptx_device *ptx_dev)
 {
   free (ptx_dev->async_streams.arr);
 
+  bool ret = true;
   while (ptx_dev->active_streams != NULL)
     {
       struct ptx_stream *s = ptx_dev->active_streams;
       ptx_dev->active_streams = ptx_dev->active_streams->next;
 
-      map_fini (s);
-      cuStreamDestroy (s->stream);
+      ret &= map_fini (s);
+
+      CUresult r = cuStreamDestroy (s->stream);
+      if (r != CUDA_SUCCESS)
+	{
+	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
+	  ret = false;
+	}
       free (s);
     }
 
-  map_fini (ptx_dev->null_stream);
+  ret &= map_fini (ptx_dev->null_stream);
   free (ptx_dev->null_stream);
+  return ret;
 }
 
 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
@@ -481,7 +511,11 @@ select_stream_for_async (int async, pthread_t thre
 	{
 	  r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
 	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
+	    {
+	      pthread_mutex_unlock (&ptx_dev->stream_lock);
+	      GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
+				 cuda_error (r));
+	    }
 	}
 
       /* If CREATE is true, we're going to be queueing some work on this
@@ -491,7 +525,11 @@ select_stream_for_async (int async, pthread_t thre
 	  s->d = (CUdeviceptr) NULL;
 	  s->h = NULL;
-	  map_init (s);
+	  if (!map_init (s))
+	    {
+	      pthread_mutex_unlock (&ptx_dev->stream_lock);
+	      GOMP_PLUGIN_fatal ("map_init fail");
+	    }
 
 	  s->next = ptx_dev->active_streams;
 	  ptx_dev->active_streams = s;
@@ -501,7 +539,11 @@ select_stream_for_async (int async, pthread_t thre
       stream = ptx_dev->async_streams.arr[async];
     }
   else if (async < 0)
-    GOMP_PLUGIN_fatal ("bad async %d", async);
+    {
+      if (create)
+	pthread_mutex_unlock (&ptx_dev->stream_lock);
+      GOMP_PLUGIN_fatal ("bad async %d", async);
+    }
 
   if (create)
     {
@@ -532,34 +574,25 @@ select_stream_for_async (int async, pthread_t thre
 static bool
 nvptx_init (void)
 {
-  CUresult r;
   int ndevs;
 
   if (instantiated_devices != 0)
     return true;
 
-  r = cuInit (0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
-
+  CUDA_CALL (cuInit, 0);
   ptx_events = NULL;
-
   pthread_mutex_init (&ptx_event_lock, NULL);
-
-  r = cuDeviceGetCount (&ndevs);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
-
+  CUDA_CALL (cuDeviceGetCount, &ndevs);
   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 					    * ndevs);
-
   return true;
 }
 
 /* Select the N'th PTX device for the current host thread.  The device must
    have been previously opened before calling this function.  */
 
-static void
+static bool
 nvptx_attach_host_thread_to_device (int n)
 {
   CUdevice dev;
@@ -569,34 +602,34 @@ nvptx_attach_host_thread_to_device (int n)
 
   r = cuCtxGetDevice (&dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
-    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+    {
+      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
+      return false;
+    }
 
   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
-    return;
+    return true;
   else
     {
       CUcontext old_ctx;
 
       ptx_dev = ptx_devices[n];
-      assert (ptx_dev);
+      if (!ptx_dev)
+	{
+	  GOMP_PLUGIN_error ("device %d not found", n);
+	  return false;
+	}
 
-      r = cuCtxGetCurrent (&thd_ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 
       /* We don't necessarily have a current context (e.g. if it has been
          destroyed.  Pop it if we do though.  */
       if (thd_ctx != NULL)
-	{
-	  r = cuCtxPopCurrent (&old_ctx);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
-	}
+	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 
-      r = cuCtxPushCurrent (ptx_dev->ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
     }
+
+  return true;
 }
 
 static struct ptx_device *
@@ -607,9 +640,7 @@ nvptx_open_device (int n)
   CUresult r;
   int async_engines, pi;
 
-  r = cuDeviceGet (&dev, n);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
+  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 
   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 
@@ -619,60 +650,44 @@ nvptx_open_device (int n)
 
   r = cuCtxGetDevice (&ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
-    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+    {
+      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
+      return NULL;
+    }
 
   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
     {
       /* The current host thread has an active context for a different device.
          Detach it.  */
       CUcontext old_ctx;
-
-      r = cuCtxPopCurrent (&old_ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
     }
 
-  r = cuCtxGetCurrent (&ptx_dev->ctx);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 
   if (!ptx_dev->ctx)
-    {
-      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
-    }
+    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
   else
     ptx_dev->ctx_shared = true;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
   ptx_dev->overlap = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
   ptx_dev->map = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
   ptx_dev->concur = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
   ptx_dev->mode = pi;
 
-  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
   ptx_dev->mkern = pi;
 
   r = cuDeviceGetAttribute (&async_engines,
@@ -683,38 +698,34 @@ nvptx_open_device (int n)
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
-  init_streams_for_device (ptx_dev, async_engines);
+  if (!init_streams_for_device (ptx_dev, async_engines))
+    return NULL;
 
   return ptx_dev;
 }
 
-static void
+static bool
 nvptx_close_device (struct ptx_device *ptx_dev)
 {
-  CUresult r;
-
   if (!ptx_dev)
-    return;
+    return true;
 
-  fini_streams_for_device (ptx_dev);
+  if (!fini_streams_for_device (ptx_dev))
+    return false;
 
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
   if (!ptx_dev->ctx_shared)
-    {
-      r = cuCtxDestroy (ptx_dev->ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
-    }
+    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
   free (ptx_dev);
+  return true;
 }
 
 static int
 nvptx_get_num_devices (void)
 {
   int n;
-  CUresult r;
 
   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
      configurations.  */
@@ -727,22 +738,19 @@ nvptx_get_num_devices (void)
      further initialization).  */
   if (instantiated_devices == 0)
     {
-      r = cuInit (0);
+      CUresult r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
     }
 
-  r = cuDeviceGetCount (&n);
-  if (r!= CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
-
+  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
   return n;
 }
 
-static void
+static bool
 link_ptx (CUmodule *module, const char *ptx_code)
 {
   CUjit_option opts[7];
@@ -780,9 +788,7 @@ link_ptx (CUmodule *module, const char *ptx_code)
   opts[6] = CU_JIT_TARGET;
   optvals[6] = (void *) CU_TARGET_COMPUTE_30;
 
-  r = cuLinkCreate (7, opts, optvals, &linkstate);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
+  CUDA_CALL (cuLinkCreate, 7, opts, optvals, &linkstate);
 
   char *abort_ptx = ABORT_PTX;
   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
@@ -790,7 +796,8 @@ link_ptx (CUmodule *module, const char *ptx_code)
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
+      GOMP_PLUGIN_error ("cuLinkAddData (abort) error: %s", cuda_error (r));
+      return false;
     }
 
   char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
@@ -799,8 +806,9 @@ link_ptx (CUmodule *module, const char *ptx_code)
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+      GOMP_PLUGIN_error ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
+      return false;
     }
 
   char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
@@ -809,29 +817,28 @@ link_ptx (CUmodule *module, const char *ptx_code)
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+      GOMP_PLUGIN_error ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
+      return false;
     }
 
   /* cuLinkAddData's 'data' argument erroneously omits the const
      qualifier.  */
   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
-                     strlen (ptx_code) + 1, 0, 0, 0, 0);
+		     strlen (ptx_code) + 1, 0, 0, 0, 0);
   if (r != CUDA_SUCCESS)
     {
       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
-      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+      GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+      return false;
     }
 
-  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
+  CUDA_CALL (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 
   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 
-  r = cuModuleLoadData (module, linkout);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
+  CUDA_CALL (cuModuleLoadData, module, linkout);
+
+  return true;
 }
 
 static void
@@ -961,11 +968,11 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
-  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+		    mapnum * sizeof (void *));
 
-  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
+		     targ_fn->name);
 
   // OpenACC		CUDA
   //
@@ -991,12 +998,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
       nthreads_in_block = vector_length;
 
   kargs[0] = &dp;
-  r = cuLaunchKernel (function,
-		      num_gangs, 1, 1,
-		      nthreads_in_block, 1, 1,
-		      0, dev_str->stream, kargs, 0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuLaunchKernel, function,
+		    num_gangs, 1, 1,
+		    nthreads_in_block, 1, 1,
+		    0, dev_str->stream, kargs, 0);
 
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
@@ -1023,9 +1028,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
 
	  event_gc (true);
 
-	  r = cuEventRecord (*e, dev_str->stream);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
 
	  event_add (PTX_EVT_KNL, e, (void *)dev_str);
	}
@@ -1049,167 +1052,157 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
 void *
 openacc_get_current_cuda_context (void);
 
-static void *
-nvptx_alloc (size_t s)
+static bool
+nvptx_alloc (size_t s, void **ptr)
 {
   CUdeviceptr d;
   CUresult r;
 
   r = cuMemAlloc (&d, s);
   if (r == CUDA_ERROR_OUT_OF_MEMORY)
-    return 0;
+    {
+      *ptr = NULL;
+      return true;
+    }
+
   if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
-  return (void *)d;
+    {
+      GOMP_PLUGIN_error ("cuMemAlloc error: %s", cuda_error (r));
+      return false;
+    }
+
+  *ptr = (void *)d;
+  return true;
 }
 
-static void
+static bool
 nvptx_free (void *p)
 {
-  CUresult r;
   CUdeviceptr pb;
   size_t ps;
 
-  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
+  if ((CUdeviceptr) p != pb)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
 
-  if ((CUdeviceptr)p != pb)
-    GOMP_PLUGIN_fatal ("invalid device address");
-
-  r = cuMemFree ((CUdeviceptr)p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
+  return true;
 }
 
-static void *
+
+static bool
 nvptx_host2dev (void *d, const void *h, size_t s)
 {
-  CUresult r;
   CUdeviceptr pb;
   size_t ps;
   struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
-    return 0;
-
+    return true;
   if (!d)
-    GOMP_PLUGIN_fatal ("invalid device address");
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
 
-  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
 
   if (!pb)
-    GOMP_PLUGIN_fatal ("invalid device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
   if (!h)
-    GOMP_PLUGIN_fatal ("invalid host address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host address");
+      return false;
+    }
   if (d == h)
-    GOMP_PLUGIN_fatal ("invalid host or device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host or device address");
+      return false;
+    }
   if ((void *)(d + s) > (void *)(pb + ps))
-    GOMP_PLUGIN_fatal ("invalid size");
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
 
 #ifndef DISABLE_ASYNC
   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
-      CUevent *e;
-
-      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
+      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
       event_gc (false);
-
-      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
-			     nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
-
-      r = cuEventRecord (*e, nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+      CUDA_CALL (cuMemcpyHtoDAsync,
		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
+      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
       event_add (PTX_EVT_MEM, e, (void *)h);
     }
   else
 #endif
-    {
-      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
-    }
+    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
 
-  return 0;
+  return true;
 }
 
-static void *
+static bool
 nvptx_dev2host (void *h, const void *d, size_t s)
 {
-  CUresult r;
   CUdeviceptr pb;
   size_t ps;
   struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
-    return 0;
-
+    return true;
   if (!d)
-    GOMP_PLUGIN_fatal ("invalid device address");
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
 
-  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
 
   if (!pb)
-    GOMP_PLUGIN_fatal ("invalid device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
   if (!h)
-    GOMP_PLUGIN_fatal ("invalid host address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host address");
+      return false;
+    }
   if (d == h)
-    GOMP_PLUGIN_fatal ("invalid host or device address");
-
+    {
+      GOMP_PLUGIN_error ("invalid host or device address");
+      return false;
+    }
   if ((void *)(d + s) > (void *)(pb + ps))
-    GOMP_PLUGIN_fatal ("invalid size");
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
 
 #ifndef DISABLE_ASYNC
   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
-      CUevent *e;
-
-      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
-
+      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
       event_gc (false);
-
-      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
-			     nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
-
-      r = cuEventRecord (*e, nvthd->current_stream->stream);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+      CUDA_CALL (cuMemcpyDtoHAsync,
		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
+      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
       event_add (PTX_EVT_MEM, e, (void *)h);
     }
   else
 #endif
-    {
-      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
-    }
+    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
 
-  return 0;
+  return true;
 }
 
 static void
@@ -1279,17 +1272,13 @@ nvptx_async_test_all (void)
 static void
 nvptx_wait (int async)
 {
-  CUresult r;
   struct ptx_stream *s;
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
-
   if (!s)
     GOMP_PLUGIN_fatal ("unknown async %d", async);
 
-  r = cuStreamSynchronize (s->stream);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
 
   event_gc (true);
 }
@@ -1297,7 +1286,6 @@ nvptx_wait (int async)
 static void
 nvptx_wait_async (int async1, int async2)
 {
-  CUresult r;
   CUevent *e;
   struct ptx_stream *s1, *s2;
   pthread_t self = pthread_self ();
@@ -1313,23 +1301,17 @@ nvptx_wait_async (int async1, int async2)
   if (s1 == s2)
     GOMP_PLUGIN_fatal ("identical parameters");
 
-  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
 
   event_gc (true);
 
-  r = cuEventRecord (*e, s1->stream);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
 
   event_add (PTX_EVT_SYNC, e, NULL);
 
-  r = cuStreamWaitEvent (s2->stream, *e, 0);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
 }
 
 static void
@@ -1354,9 +1336,7 @@ nvptx_wait_all (void)
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
 
-	  r = cuStreamSynchronize (s->stream);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
	}
     }
 
@@ -1368,7 +1348,6 @@ nvptx_wait_all (void)
 static void
 nvptx_wait_all_async (int async)
 {
-  CUresult r;
   struct ptx_stream *waiting_stream, *other_stream;
   CUevent *e;
   struct nvptx_thread *nvthd = nvptx_thread ();
@@ -1398,20 +1377,14 @@ nvptx_wait_all_async (int async)
 
	  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-	  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
 
	  /* Record an event on the waited-for stream.  */
-	  r = cuEventRecord (*e, other_stream->stream);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
 
	  event_add (PTX_EVT_SYNC, e, NULL);
 
-	  r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
-	  if (r != CUDA_SUCCESS)
-	    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+	  CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
	}
 
   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
@@ -1460,11 +1433,11 @@ nvptx_set_cuda_stream (int async, void *stream)
   pthread_t self = pthread_self ();
   struct nvptx_thread *nvthd = nvptx_thread ();
 
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
   if (async < 0)
     GOMP_PLUGIN_fatal ("bad async %d", async);
 
+  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
   /* We have a list of active streams and an array mapping async values to
      entries of that list.  We need to take "ownership" of the passed-in stream,
      and add it to our list, removing the previous entry also (if there was one)
@@ -1487,8 +1460,11 @@ nvptx_set_cuda_stream (int async, void *stream)
	      s->next = s->next->next;
	}
 
-      cuStreamDestroy (oldstream->stream);
-      map_fini (oldstream);
+      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
+
+      if (!map_fini (oldstream))
+	GOMP_PLUGIN_fatal ("error when freeing host memory");
+
       free (oldstream);
     }
 
@@ -1525,37 +1501,50 @@ GOMP_OFFLOAD_get_num_devices (void)
   return nvptx_get_num_devices ();
 }
 
-void
+bool
 GOMP_OFFLOAD_init_device (int n)
 {
+  struct ptx_device *dev;
+
   pthread_mutex_lock (&ptx_dev_lock);
 
   if (!nvptx_init () || ptx_devices[n] != NULL)
     {
       pthread_mutex_unlock (&ptx_dev_lock);
-      return;
+      return false;
     }
 
-  ptx_devices[n] = nvptx_open_device (n);
-  instantiated_devices++;
+  dev = nvptx_open_device (n);
+  if (dev)
+    {
+      ptx_devices[n] = dev;
+      instantiated_devices++;
    }
 
   pthread_mutex_unlock (&ptx_dev_lock);
+
+  return dev != NULL;
 }
 
-void
+bool
 GOMP_OFFLOAD_fini_device (int n)
 {
   pthread_mutex_lock (&ptx_dev_lock);
 
   if (ptx_devices[n] != NULL)
     {
-      nvptx_attach_host_thread_to_device (n);
-      nvptx_close_device (ptx_devices[n]);
+      if (!nvptx_attach_host_thread_to_device (n)
+	  || !nvptx_close_device (ptx_devices[n]))
+	{
+	  pthread_mutex_unlock (&ptx_dev_lock);
+	  return false;
+	}
       ptx_devices[n] = NULL;
       instantiated_devices--;
     }
 
   pthread_mutex_unlock (&ptx_dev_lock);
+  return true;
 }
 
 /* Data emitted by mkoffload.  */
@@ -1590,7 +1579,6 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
   CUmodule module;
   const char *const *fn_names, *const *var_names;
   unsigned int fn_entries, var_entries, i, j;
-  CUresult r;
   struct targ_fn_descriptor *targ_fns;
   struct addr_pair *targ_tbl;
   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
@@ -1598,18 +1586,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
   struct ptx_device *dev;
 
   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
-    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
-		       " (expected %u, received %u)",
-		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
-
-  GOMP_OFFLOAD_init_device (ord);
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
+      return -1;
+    }
 
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !link_ptx (&module, img_header->ptx_src))
+    return -1;
+
   dev = ptx_devices[ord];
-
-  nvptx_attach_host_thread_to_device (ord);
-  link_ptx (&module, img_header->ptx_src);
-
   /* The mkoffload utility emits a struct of pointers/integers at the
      start of each offload image.  The array of kernel names and the
      functions addresses form a one-to-one correspondence.  */
@@ -1639,11 +1628,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
     {
       CUfunction function;
+      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, fn_names[i]);
 
-      r = cuModuleGetFunction (&function, module, fn_names[i]);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
-
       targ_fns->fn = function;
       targ_fns->name = (const char *) fn_names[i];
 
@@ -1656,9 +1642,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
       CUdeviceptr var;
       size_t bytes;
 
-      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
+		      &var, &bytes, module, var_names[j]);
 
       targ_tbl->start = (uintptr_t) var;
       targ_tbl->end = targ_tbl->start + bytes;
@@ -1692,32 +1677,32 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned versi
   pthread_mutex_unlock (&dev->image_lock);
 }
 
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+bool
+GOMP_OFFLOAD_alloc (int ord, size_t size, void **ptr)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  return nvptx_alloc (size);
+  return (nvptx_attach_host_thread_to_device (ord)
+	  && nvptx_alloc (size, ptr));
 }
 
-void
+bool
 GOMP_OFFLOAD_free (int ord, void *ptr)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  nvptx_free (ptr);
+  return (nvptx_attach_host_thread_to_device (ord)
+	  && nvptx_free (ptr));
 }
 
-void *
+bool
 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  return nvptx_dev2host (dst, src, n);
+  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
 }
 
-void *
+bool
 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
 {
-  nvptx_attach_host_thread_to_device (ord);
-  return nvptx_host2dev (dst, src, n);
+  return (nvptx_attach_host_thread_to_device (ord)
+	  && nvptx_host2dev (dst, src, n));
 }
 
 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
@@ -1736,20 +1721,11 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *)
 void
 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
 {
-  CUevent *e;
-  CUresult r;
   struct nvptx_thread *nvthd = nvptx_thread ();
+  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
-  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
-  r = cuEventRecord (*e, nvthd->current_stream->stream);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
+  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
 }
 
@@ -1801,25 +1777,18 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord)
   struct ptx_device *ptx_dev;
   struct nvptx_thread *nvthd
     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
-  CUresult r;
   CUcontext thd_ctx;
 
   ptx_dev = ptx_devices[ord];
 
   assert (ptx_dev);
 
-  r = cuCtxGetCurrent (&thd_ctx);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
 
   assert (ptx_dev->ctx);
 
   if (!thd_ctx)
-    {
-      r = cuCtxPushCurrent (ptx_dev->ctx);
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
-    }
+    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
 
   nvthd->current_stream = ptx_dev->null_stream;
   nvthd->ptx_dev = ptx_dev;
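A note on the one place where a plain bool cannot carry the whole result:
nvptx_alloc now reports CUDA_ERROR_OUT_OF_MEMORY as *success* with the out
parameter set to NULL, and reserves a false return for real driver errors.
Here is a minimal sketch of that convention; alloc_like_nvptx is a
hypothetical stand-in for the patched function, with malloc standing in for
cuMemAlloc, so it runs without CUDA.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    static bool
    alloc_like_nvptx (size_t s, void **ptr, bool simulate_oom)
    {
      if (simulate_oom)
        {
          *ptr = NULL;	/* CUDA_ERROR_OUT_OF_MEMORY: success, NULL result */
          return true;
        }
      *ptr = malloc (s);	/* stands in for cuMemAlloc */
      return *ptr != NULL;	/* any other failure: false */
    }

    int
    main (void)
    {
      void *p;
      if (!alloc_like_nvptx (32, &p, false))
        return 1;			/* hard error: propagate failure */
      if (p == NULL)
        puts ("out of device memory");	/* soft failure: caller decides */
      else
        puts ("allocated");
      free (p);
      return 0;
    }

This split lets the caller treat exhaustion as a recoverable condition while
still turning genuine driver failures into an error return, matching how the
patched GOMP_OFFLOAD_alloc chains nvptx_attach_host_thread_to_device and
nvptx_alloc with &&.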