Hi! Attached is a minimal patch to get at least a trivial OpenMP 4.0 testcase offloading to NVPTX (the first patch). The second patch is WIP, just first few needed changes to make libgomp to build for NVPTX (several weeks of work at least).
The following seems to work and the output suggests that it was offloaded to a non-SHM arch: int main () { int v = 0; int *w = 0; int x = 0; #pragma omp target { v = 6; w = &v; x = 1; // omp_is_initial_device (); } __builtin_printf ("%d %p %p %d\n", v, &v, w, x); return 0; } but already tiny bit more complicated testcase: extern void *malloc (__SIZE_TYPE__); extern void free (void *); int main () { int v = 0; int *w = 0; int x = 0; #pragma omp target { v = 6; w = &v; char *p = malloc (64); x = 1; // omp_is_initial_device (); free (p); } __builtin_printf ("%d %p %p %d\n", v, &v, w, x); return 0; } suggests that while it is nice that when building nvptx accel compiler we build libgcc.a, libc.a, libm.a, libgfortran.a (and in the future hopefully libgomp.a), nothing attempts to link those in :(. Is the plan to link those in at mkoffload time (haven't seen any attempt of mkoffload to invoke the nvptx-none-ld linker though), or link those in somehow at link_ptx time in the plugin? In either case, it isn't clear to me how things will work (if at all) in the case where multiple shared libraries (or executable and at least one shared library) have their own offloading bits, and if you try to e.g. call an offloaded function defined in the shared library from an offloaded kernel in the executable, because if any library needs some global singleton case, if it is linked multiple times, no idea what the PTX JIT will do. Once that is resolved, another thing will be to figure out how to efficiently implement the TLS libgomp needs for its ICVs and other state - right now it uses either __thread, or pthread_getspecific, neither of these is usable of course. I've been thinking about an array of those structures in .shared memory indexed by %tid.x, but I guess that runs into the issue that the array would need to be declared fixed size and there is a very small size limitation on .shared memory size. So perhaps a file scope .shared pointer to global memory, where whomever launches an OpenMP 4.0 kernel (either the libgomp-plugin-nvptx.so.1 doing GOMP_run, or later on dynamic parallelism from GOMP_target in the nvptx libgomp.a) allocates the memory and some wrapper sets the .shared variable to that allocated memory, then calls the kernel? Jakub
--- libgomp/plugin/plugin-nvptx.c.jj 2015-04-21 08:38:00.000000000 +0200 +++ libgomp/plugin/plugin-nvptx.c 2015-04-21 16:55:25.247470080 +0200 @@ -978,8 +978,8 @@ event_add (enum ptx_event_type type, CUe void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, - size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers, - int vector_length, int async, void *targ_mem_desc) + size_t *sizes, unsigned short *kinds, int num_gangs, + int num_workers, int vector_length, int async, void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; @@ -1137,7 +1137,6 @@ nvptx_host2dev (void *d, const void *h, CUresult r; CUdeviceptr pb; size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); if (!s) return 0; @@ -1162,7 +1161,8 @@ nvptx_host2dev (void *d, const void *h, GOMP_PLUGIN_fatal ("invalid size"); #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + struct nvptx_thread *nvthd = nvptx_thread (); + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e; @@ -1202,7 +1202,6 @@ nvptx_dev2host (void *h, const void *d, CUresult r; CUdeviceptr pb; size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); if (!s) return 0; @@ -1227,7 +1226,8 @@ nvptx_dev2host (void *h, const void *d, GOMP_PLUGIN_fatal ("invalid size"); #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + struct nvptx_thread *nvthd = nvptx_thread (); + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e; @@ -1559,7 +1559,8 @@ GOMP_OFFLOAD_get_name (void) unsigned int GOMP_OFFLOAD_get_caps (void) { - return GOMP_OFFLOAD_CAP_OPENACC_200; + return GOMP_OFFLOAD_CAP_OPENACC_200 + | GOMP_OFFLOAD_CAP_OPENMP_400; } int @@ -1759,7 +1760,7 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn void *targ_mem_desc) { nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs, - num_workers, vector_length, async, targ_mem_desc); + num_workers, vector_length, async, targ_mem_desc); } void @@ -1889,3 +1890,27 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (in { return nvptx_set_cuda_stream (async, stream); } + +void +GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars) +{ + CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn; + CUresult r; + struct ptx_device *ptx_dev = ptx_devices[ord]; + const char *maybe_abort_msg = "(perhaps abort was called)"; + void *args = &tgt_vars; + + r = cuLaunchKernel (function, + 1, 1, 1, + 1, 1, 1, + 0, ptx_dev->null_stream->stream, &args, 0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); + + r = cuCtxSynchronize (); + if (r == CUDA_ERROR_LAUNCH_FAILED) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), + maybe_abort_msg); + else if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); +}
--- configure.jj 2015-04-21 08:38:24.000000000 +0200 +++ configure 2015-04-21 09:16:42.994959648 +0200 @@ -3171,6 +3171,9 @@ if test x$enable_libgomp = x ; then ;; *-*-darwin* | *-*-aix*) ;; + # And on NVPTX as an offloading target. + nvptx*-*-*) + ;; *) noconfigdirs="$noconfigdirs target-libgomp" ;; --- libgomp/configure.jj 2015-04-21 11:08:08.347628799 +0200 +++ libgomp/configure 2015-04-21 11:07:39.000000000 +0200 @@ -15038,6 +15038,9 @@ case "$host" in *-*-rtems*) # RTEMS supports Pthreads, but the library is not available at GCC build time. ;; + nvptx*-*-*) + # NVPTX does not support Pthreads, has its own code replacement. + ;; *) # Check to see if -pthread or -lpthread is needed. Prefer the former. # In case the pthread.h system header is not found, this test will fail. --- libgomp/configure.tgt.jj 2015-04-21 08:38:00.000000000 +0200 +++ libgomp/configure.tgt 2015-04-21 10:59:30.857197475 +0200 @@ -151,6 +151,10 @@ case "${target}" in XLDFLAGS="${XLDFLAGS} -lpthread" ;; + nvptx*-*-*) + config_path="nvptx" + ;; + *) ;; --- libgomp/config.h.in.jj 2015-04-21 08:38:01.000000000 +0200 +++ libgomp/config.h.in 2015-04-21 08:38:01.000000000 +0200 @@ -39,6 +39,9 @@ /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */ #undef HAVE_PTHREAD_AFFINITY_NP +/* Define to 1 if you have the <pthread.h> header file. */ +#undef HAVE_PTHREAD_H + /* Define to 1 if you have the <semaphore.h> header file. */ #undef HAVE_SEMAPHORE_H --- libgomp/libgomp.h.jj 2015-04-21 08:38:00.000000000 +0200 +++ libgomp/libgomp.h 2015-04-21 11:15:35.952217394 +0200 @@ -40,7 +40,9 @@ #include "gstdint.h" #include "libgomp-plugin.h" +#ifdef HAVE_PTHREAD_H #include <pthread.h> +#endif #include <stdbool.h> #include <stdlib.h> #include <stdarg.h> --- libgomp/configure.ac.jj 2015-04-21 08:38:00.000000000 +0200 +++ libgomp/configure.ac 2015-04-21 11:06:38.418117846 +0200 @@ -179,6 +179,9 @@ case "$host" in *-*-rtems*) # RTEMS supports Pthreads, but the library is not available at GCC build time. ;; + nvptx*-*-*) + # NVPTX does not support Pthreads, has its own code replacement. + ;; *) # Check to see if -pthread or -lpthread is needed. Prefer the former. # In case the pthread.h system header is not found, this test will fail. --- configure.ac.jj 2015-04-21 08:38:09.000000000 +0200 +++ configure.ac 2015-04-21 09:14:50.107827544 +0200 @@ -539,6 +539,9 @@ if test x$enable_libgomp = x ; then ;; *-*-darwin* | *-*-aix*) ;; + # And on NVPTX as an offloading target. + nvptx*-*-*) + ;; *) noconfigdirs="$noconfigdirs target-libgomp" ;;