This patch ports team.c to nvptx by introducing an initialization/cleanup routine, gomp_nvptx_main, that all (pre-started) threads run. It initializes a thread pool and then runs gomp_thread_start in all threads except thread zero, which runs the original target region function.
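For context, gomp_nvptx_main reads %laneid, %tid.y and %ntid.y, so it assumes a particular launch geometry: one warp per logical OpenMP thread, with the y-dimension of the thread block carrying the OpenMP thread number. The plugin-side launch is not part of this patch; a minimal sketch of what it might look like with the CUDA driver API follows (launch_gomp_nvptx_main, FUNC, TARGET_FN and TARGET_ARGS are hypothetical names, not code from the series):

#include <cuda.h>

/* Hypothetical sketch, not part of this patch: launch gomp_nvptx_main with
   the geometry its %laneid/%tid.y/%ntid.y reads expect.  FUNC is assumed to
   be a CUfunction handle resolved to gomp_nvptx_main; TARGET_FN and
   TARGET_ARGS are device addresses of the target region function and its
   argument block.  */
static CUresult
launch_gomp_nvptx_main (CUfunction func, CUdeviceptr target_fn,
                        CUdeviceptr target_args, unsigned nthreads)
{
  void *kernel_args[] = { &target_fn, &target_args };

  /* A single block: 32 lanes in x (every lane except lane 0 returns
     immediately), and one y-slot per logical OpenMP thread, read back
     in the kernel as %tid.y / %ntid.y.  */
  return cuLaunchKernel (func,
                         1, 1, 1,              /* grid dimensions */
                         32, nthreads, 1,      /* block: warp x nthreads */
                         0, NULL,              /* shared memory, stream */
                         kernel_args, NULL);
}

Only the block shape matters to the code below; everything else about the launch is left to the plugin.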
Thread-private data is arranged via a linear array, nvptx_thrs, that is allocated in gomp_nvptx_main.  As in the previous patch, are the naked asm() statements OK?  A note on where LIBGOMP_USE_PTHREADS is expected to come from follows the patch.

	* libgomp.h [__nvptx__] (gomp_thread): New implementation.
	* config/nvptx/team.c: Delete.
	* team.c: Guard uses of PThreads-specific interfaces by
	LIBGOMP_USE_PTHREADS.
	(gomp_nvptx_main): New.
	(gomp_thread_start) [__nvptx__]: Handle calls from gomp_nvptx_main.
---
 libgomp/config/nvptx/team.c |  0
 libgomp/libgomp.h           | 10 ++++-
 libgomp/team.c              | 92 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 96 insertions(+), 6 deletions(-)
 delete mode 100644 libgomp/config/nvptx/team.c

diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
deleted file mode 100644
index e69de29..0000000
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 1454adf..f25b265 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -483,7 +483,15 @@ enum gomp_cancel_kind
 
 /* ... and here is that TLS data.  */
 
-#if defined HAVE_TLS || defined USE_EMUTLS
+#if defined __nvptx__
+extern struct gomp_thread *nvptx_thrs;
+static inline struct gomp_thread *gomp_thread (void)
+{
+  int tid;
+  asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+  return nvptx_thrs + tid;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
 extern __thread struct gomp_thread gomp_tls_data;
 static inline struct gomp_thread *gomp_thread (void)
 {
diff --git a/libgomp/team.c b/libgomp/team.c
index 7671b05..5b74532 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -30,6 +30,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef LIBGOMP_USE_PTHREADS
 /* This attribute contains PTHREAD_CREATE_DETACHED.  */
 pthread_attr_t gomp_thread_attr;
 
@@ -43,6 +44,7 @@ __thread struct gomp_thread gomp_tls_data;
 #else
 pthread_key_t gomp_tls_key;
 #endif
+#endif
 
 
 /* This structure is used to communicate across pthread_create.  */
@@ -58,6 +60,52 @@ struct gomp_thread_start_data
   bool nested;
 };
 
+#ifdef __nvptx__
+struct gomp_thread *nvptx_thrs;
+
+static struct gomp_thread_pool *gomp_new_thread_pool (void);
+static void *gomp_thread_start (void *);
+
+void __attribute__((kernel))
+gomp_nvptx_main (void (*fn) (void *), void *fn_data)
+{
+  int ntids, tid, laneid;
+  asm ("mov.u32 %0, %%laneid;" : "=r" (laneid));
+  if (laneid)
+    return;
+  static struct gomp_thread_pool *pool;
+  asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+  asm ("mov.u32 %0, %%ntid.y;" : "=r"(ntids));
+  if (tid == 0)
+    {
+      gomp_global_icv.nthreads_var = ntids;
+
+      nvptx_thrs = gomp_malloc_cleared (ntids * sizeof (*nvptx_thrs));
+
+      pool = gomp_new_thread_pool ();
+      pool->threads = gomp_malloc (ntids * sizeof (*pool->threads));
+      pool->threads[0] = nvptx_thrs;
+      pool->threads_size = ntids;
+      pool->threads_used = ntids;
+      gomp_barrier_init (&pool->threads_dock, ntids);
+
+      nvptx_thrs[0].thread_pool = pool;
+      asm ("bar.sync 0;");
+      fn (fn_data);
+
+      gomp_free_thread (nvptx_thrs);
+      free (nvptx_thrs);
+    }
+  else
+    {
+      struct gomp_thread_start_data tsdata = {0};
+      tsdata.ts.team_id = tid;
+      asm ("bar.sync 0;");
+      tsdata.thread_pool = pool;
+      gomp_thread_start (&tsdata);
+    }
+}
+#endif
 
 /* This function is a pthread_create entry point.  This contains the idle
    loop in which a thread waits to be called up to become part of a team.  */
@@ -71,7 +119,9 @@ gomp_thread_start (void *xdata)
   void (*local_fn) (void *);
   void *local_data;
 
-#if defined HAVE_TLS || defined USE_EMUTLS
+#ifdef __nvptx__
+  thr = gomp_thread ();
+#elif defined HAVE_TLS || defined USE_EMUTLS
   thr = &gomp_tls_data;
 #else
   struct gomp_thread local_thr;
@@ -88,7 +138,8 @@ gomp_thread_start (void *xdata)
   thr->task = data->task;
   thr->place = data->place;
 
-  thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
+  if (thr->ts.team)
+    thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
 
   /* Make thread pool local.  */
   pool = thr->thread_pool;
@@ -110,6 +161,10 @@ gomp_thread_start (void *xdata)
       pool->threads[thr->ts.team_id] = thr;
 
       gomp_barrier_wait (&pool->threads_dock);
+#ifdef __nvptx__
+      local_fn = thr->fn;
+      local_data = thr->data;
+#endif
       do
 	{
 	  struct gomp_team *team = thr->ts.team;
@@ -242,7 +297,13 @@ gomp_free_pool_helper (void *thread_pool)
   gomp_sem_destroy (&thr->release);
   thr->thread_pool = NULL;
   thr->task = NULL;
+#ifdef LIBGOMP_USE_PTHREADS
   pthread_exit (NULL);
+#elif defined(__nvptx__)
+  asm ("exit;");
+#else
+#error gomp_free_pool_helper must terminate the thread
+#endif
 }
 
 /* Free a thread pool and release its threads.  */
@@ -300,33 +361,40 @@ void
 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
 		 unsigned flags, struct gomp_team *team)
 {
-  struct gomp_thread_start_data *start_data;
   struct gomp_thread *thr, *nthr;
   struct gomp_task *task;
   struct gomp_task_icv *icv;
   bool nested;
   struct gomp_thread_pool *pool;
   unsigned i, n, old_threads_used = 0;
-  pthread_attr_t thread_attr, *attr;
   unsigned long nthreads_var;
-  char bind, bind_var;
+  char bind_var;
+#ifdef LIBGOMP_USE_PTHREADS
+  char bind;
+  struct gomp_thread_start_data *start_data;
+  pthread_attr_t thread_attr, *attr;
   unsigned int s = 0, rest = 0, p = 0, k = 0;
+#endif
   unsigned int affinity_count = 0;
   struct gomp_thread **affinity_thr = NULL;
 
   thr = gomp_thread ();
   nested = thr->ts.team != NULL;
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (thr->thread_pool == NULL, 0))
     {
      thr->thread_pool = gomp_new_thread_pool ();
      thr->thread_pool->threads_busy = nthreads;
      pthread_setspecific (gomp_thread_destructor, thr);
    }
+#endif
   pool = thr->thread_pool;
   task = thr->task;
   icv = task ? &task->icv : &gomp_global_icv;
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
     gomp_init_affinity ();
+#endif
 
   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
@@ -352,10 +420,12 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
   bind_var = icv->bind_var;
   if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
     bind_var = flags & 7;
+#ifdef LIBGOMP_USE_PTHREADS
   bind = bind_var;
   if (__builtin_expect (gomp_bind_var_list != NULL, 0)
       && thr->ts.level < gomp_bind_var_list_len)
     bind_var = gomp_bind_var_list[thr->ts.level];
+#endif
   gomp_init_task (thr->task, task, icv);
   team->implicit_task[0].icv.nthreads_var = nthreads_var;
   team->implicit_task[0].icv.bind_var = bind_var;
@@ -365,6 +435,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
 
   i = 1;
 
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (gomp_places_list != NULL, 0))
     {
       /* Depending on chosen proc_bind model, set subpartition
@@ -432,6 +503,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
     }
   else
     bind = omp_proc_bind_false;
+#endif
 
   /* We only allow the reuse of idle threads for non-nested PARALLEL
      regions.  This appears to be implied by the semantics of
@@ -481,6 +553,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
 	  unsigned int place_partition_off = thr->ts.place_partition_off;
 	  unsigned int place_partition_len = thr->ts.place_partition_len;
 	  unsigned int place = 0;
+#ifdef LIBGOMP_USE_PTHREADS
 	  if (__builtin_expect (gomp_places_list != NULL, 0))
 	    {
 	      switch (bind)
@@ -612,6 +685,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
 	      place = p + 1;
 	    }
 	  else
+#endif
 	    nthr = pool->threads[i];
 	  nthr->ts.team = team;
 	  nthr->ts.work_share = &team->work_shares[0];
@@ -635,6 +709,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
 	  team->ordered_release[i] = &nthr->release;
 	}
 
+#ifdef LIBGOMP_USE_PTHREADS
       if (__builtin_expect (affinity_thr != NULL, 0))
 	{
 	  /* If AFFINITY_THR is non-NULL just because we had to
@@ -695,9 +770,11 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
 
       if (i == nthreads)
 	goto do_release;
+#endif
 
     }
 
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
     {
       long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
@@ -829,6 +906,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
     pthread_attr_destroy (&thread_attr);
 
  do_release:
+#endif
   gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
 
   /* Decrease the barrier threshold to match the number of threads
@@ -935,6 +1013,7 @@ gomp_team_end (void)
     }
 }
 
+#ifdef LIBGOMP_USE_PTHREADS
 
 /* Constructors for this file.  */
 
@@ -959,6 +1038,7 @@ team_destructor (void)
      crashes.  */
   pthread_key_delete (gomp_thread_destructor);
 }
+#endif
 
 struct gomp_task_icv *
 gomp_new_icv (void)
@@ -967,6 +1047,8 @@ gomp_new_icv (void)
   struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
   gomp_init_task (task, NULL, &gomp_global_icv);
   thr->task = task;
+#ifdef LIBGOMP_USE_PTHREADS
   pthread_setspecific (gomp_thread_destructor, thr);
+#endif
   return &task->icv;
 }
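Note on LIBGOMP_USE_PTHREADS: this patch only guards code with the macro; its definition is not shown here and is presumably provided elsewhere in the series. Purely as an illustration of the intent, a definition could look like the sketch below (the placement in config.h vs. libgomp.h is an assumption):

/* Illustrative sketch only -- not part of this patch.  LIBGOMP_USE_PTHREADS
   is assumed to be defined elsewhere in the series; something along these
   lines would keep the pthread-based paths on hosts and compile them out on
   nvptx, where threads are pre-started by the kernel launch and enter
   gomp_nvptx_main directly.  */
#ifndef __nvptx__
#define LIBGOMP_USE_PTHREADS 1
#endif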