This patch ports team.c to nvptx by arranging an initialization/cleanup
routine, gomp_nvptx_main, that all (pre-started) threads can run.  It
initializes a thread pool and proceeds to run gomp_thread_start in all threads
except thread zero, which runs the original target region function.

Thread-private data is arranged via a linear array, nvptx_thrs, that is
allocated in gomp_nvptx_main.

As in the previous patch: are naked asm() statements OK here?

        * libgomp.h [__nvptx__] (gomp_thread): New implementation.
        * config/nvptx/team.c: Delete.
        * team.c: Guard uses of PThreads-specific interfaces by
        LIBGOMP_USE_PTHREADS.
        (gomp_nvptx_main): New.
        (gomp_thread_start) [__nvptx__]: Handle calls from gomp_nvptx_main.
---
 libgomp/config/nvptx/team.c |  0
 libgomp/libgomp.h           | 10 ++++-
 libgomp/team.c              | 92 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 96 insertions(+), 6 deletions(-)
 delete mode 100644 libgomp/config/nvptx/team.c

diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
deleted file mode 100644
index e69de29..0000000
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 1454adf..f25b265 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -483,7 +483,15 @@ enum gomp_cancel_kind
 
 /* ... and here is that TLS data.  */
 
-#if defined HAVE_TLS || defined USE_EMUTLS
+#if defined __nvptx__
+extern struct gomp_thread *nvptx_thrs;
+static inline struct gomp_thread *gomp_thread (void)
+{
+  int tid;
+  asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+  return nvptx_thrs + tid;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
 extern __thread struct gomp_thread gomp_tls_data;
 static inline struct gomp_thread *gomp_thread (void)
 {
diff --git a/libgomp/team.c b/libgomp/team.c
index 7671b05..5b74532 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -30,6 +30,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef LIBGOMP_USE_PTHREADS
 /* This attribute contains PTHREAD_CREATE_DETACHED.  */
 pthread_attr_t gomp_thread_attr;
 
@@ -43,6 +44,7 @@ __thread struct gomp_thread gomp_tls_data;
 #else
 pthread_key_t gomp_tls_key;
 #endif
+#endif
 
 
 /* This structure is used to communicate across pthread_create.  */
@@ -58,6 +60,52 @@ struct gomp_thread_start_data
   bool nested;
 };
 
+#ifdef __nvptx__
+struct gomp_thread *nvptx_thrs;
+
+static struct gomp_thread_pool *gomp_new_thread_pool (void);
+static void *gomp_thread_start (void *);
+
+void __attribute__((kernel))
+gomp_nvptx_main (void (*fn) (void *), void *fn_data)
+{
+  int ntids, tid, laneid;
+  asm ("mov.u32 %0, %%laneid;" : "=r" (laneid));
+  if (laneid)
+    return;
+  static struct gomp_thread_pool *pool;
+  asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+  asm ("mov.u32 %0, %%ntid.y;" : "=r"(ntids));
+  if (tid == 0)
+    {
+      gomp_global_icv.nthreads_var = ntids;
+
+      nvptx_thrs = gomp_malloc_cleared (ntids * sizeof (*nvptx_thrs));
+
+      pool = gomp_new_thread_pool ();
+      pool->threads = gomp_malloc (ntids * sizeof (*pool->threads));
+      pool->threads[0] = nvptx_thrs;
+      pool->threads_size = ntids;
+      pool->threads_used = ntids;
+      gomp_barrier_init (&pool->threads_dock, ntids);
+
+      nvptx_thrs[0].thread_pool = pool;
+      asm ("bar.sync 0;");
+      fn (fn_data);
+
+      gomp_free_thread (nvptx_thrs);
+      free (nvptx_thrs);
+    }
+  else
+    {
+      struct gomp_thread_start_data tsdata = {0};
+      tsdata.ts.team_id = tid;
+      asm ("bar.sync 0;");
+      tsdata.thread_pool = pool;
+      gomp_thread_start (&tsdata);
+    }
+}
+#endif
 
 /* This function is a pthread_create entry point.  This contains the idle
    loop in which a thread waits to be called up to become part of a team.  */
@@ -71,7 +119,9 @@ gomp_thread_start (void *xdata)
   void (*local_fn) (void *);
   void *local_data;
 
-#if defined HAVE_TLS || defined USE_EMUTLS
+#ifdef __nvptx__
+  thr = gomp_thread ();
+#elif defined HAVE_TLS || defined USE_EMUTLS
   thr = &gomp_tls_data;
 #else
   struct gomp_thread local_thr;
@@ -88,7 +138,8 @@ gomp_thread_start (void *xdata)
   thr->task = data->task;
   thr->place = data->place;
 
-  thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
+  if (thr->ts.team)
+    thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
 
   /* Make thread pool local. */
   pool = thr->thread_pool;
@@ -110,6 +161,10 @@ gomp_thread_start (void *xdata)
       pool->threads[thr->ts.team_id] = thr;
 
       gomp_barrier_wait (&pool->threads_dock);
+#ifdef __nvptx__
+      local_fn = thr->fn;
+      local_data = thr->data;
+#endif
       do
        {
          struct gomp_team *team = thr->ts.team;
@@ -242,7 +297,13 @@ gomp_free_pool_helper (void *thread_pool)
   gomp_sem_destroy (&thr->release);
   thr->thread_pool = NULL;
   thr->task = NULL;
+#ifdef LIBGOMP_USE_PTHREADS
   pthread_exit (NULL);
+#elif defined(__nvptx__)
+  asm ("exit;");
+#else
+#error gomp_free_pool_helper must terminate the thread
+#endif
 }
 
 /* Free a thread pool and release its threads. */
@@ -300,33 +361,40 @@ void
 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
                 unsigned flags, struct gomp_team *team)
 {
-  struct gomp_thread_start_data *start_data;
   struct gomp_thread *thr, *nthr;
   struct gomp_task *task;
   struct gomp_task_icv *icv;
   bool nested;
   struct gomp_thread_pool *pool;
   unsigned i, n, old_threads_used = 0;
-  pthread_attr_t thread_attr, *attr;
   unsigned long nthreads_var;
-  char bind, bind_var;
+  char bind_var;
+#ifdef LIBGOMP_USE_PTHREADS
+  char bind;
+  struct gomp_thread_start_data *start_data;
+  pthread_attr_t thread_attr, *attr;
   unsigned int s = 0, rest = 0, p = 0, k = 0;
+#endif
   unsigned int affinity_count = 0;
   struct gomp_thread **affinity_thr = NULL;
 
   thr = gomp_thread ();
   nested = thr->ts.team != NULL;
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (thr->thread_pool == NULL, 0))
     {
       thr->thread_pool = gomp_new_thread_pool ();
       thr->thread_pool->threads_busy = nthreads;
       pthread_setspecific (gomp_thread_destructor, thr);
     }
+#endif
   pool = thr->thread_pool;
   task = thr->task;
   icv = task ? &task->icv : &gomp_global_icv;
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
     gomp_init_affinity ();
+#endif
 
   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
@@ -352,10 +420,12 @@ gomp_team_start (void (*fn) (void *), void *data, 
unsigned nthreads,
   bind_var = icv->bind_var;
   if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
     bind_var = flags & 7;
+#ifdef LIBGOMP_USE_PTHREADS
   bind = bind_var;
   if (__builtin_expect (gomp_bind_var_list != NULL, 0)
       && thr->ts.level < gomp_bind_var_list_len)
     bind_var = gomp_bind_var_list[thr->ts.level];
+#endif
   gomp_init_task (thr->task, task, icv);
   team->implicit_task[0].icv.nthreads_var = nthreads_var;
   team->implicit_task[0].icv.bind_var = bind_var;
@@ -365,6 +435,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
 
   i = 1;
 
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (gomp_places_list != NULL, 0))
     {
       /* Depending on chosen proc_bind model, set subpartition
@@ -432,6 +503,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
     }
   else
     bind = omp_proc_bind_false;
+#endif
 
   /* We only allow the reuse of idle threads for non-nested PARALLEL
      regions.  This appears to be implied by the semantics of
@@ -481,6 +553,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
          unsigned int place_partition_off = thr->ts.place_partition_off;
          unsigned int place_partition_len = thr->ts.place_partition_len;
          unsigned int place = 0;
+#ifdef LIBGOMP_USE_PTHREADS
          if (__builtin_expect (gomp_places_list != NULL, 0))
            {
              switch (bind)
@@ -612,6 +685,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
              place = p + 1;
            }
          else
+#endif
            nthr = pool->threads[i];
          nthr->ts.team = team;
          nthr->ts.work_share = &team->work_shares[0];
@@ -635,6 +709,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
          team->ordered_release[i] = &nthr->release;
        }
 
+#ifdef LIBGOMP_USE_PTHREADS
       if (__builtin_expect (affinity_thr != NULL, 0))
        {
          /* If AFFINITY_THR is non-NULL just because we had to
@@ -695,9 +770,11 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
 
       if (i == nthreads)
        goto do_release;
+#endif
 
     }
 
+#ifdef LIBGOMP_USE_PTHREADS
   if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
     {
       long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
@@ -829,6 +906,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned 
nthreads,
     pthread_attr_destroy (&thread_attr);
 
  do_release:
+#endif
   gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
 
   /* Decrease the barrier threshold to match the number of threads
@@ -935,6 +1013,7 @@ gomp_team_end (void)
     }
 }
 
+#ifdef LIBGOMP_USE_PTHREADS
 
 /* Constructors for this file.  */
 
@@ -959,6 +1038,7 @@ team_destructor (void)
      crashes.  */
   pthread_key_delete (gomp_thread_destructor);
 }
+#endif
 
 struct gomp_task_icv *
 gomp_new_icv (void)
@@ -967,6 +1047,8 @@ gomp_new_icv (void)
   struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
   gomp_init_task (task, NULL, &gomp_global_icv);
   thr->task = task;
+#ifdef LIBGOMP_USE_PTHREADS
   pthread_setspecific (gomp_thread_destructor, thr);
+#endif
   return &task->icv;
 }

Reply via email to