On NVPTX, there are 16 hardware barriers for each thread team, and each barrier
has a configurable waiter count.  The instruction 'bar.sync N, M;' waits on
barrier number N until M threads have arrived; M must be pre-multiplied by the
warp width (32).  It's also possible to 'post' a barrier without suspending,
via 'bar.arrive'.
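For illustration, a full-team wait on barrier 0 can be expressed from C with a
naked asm, which is what the patch below does.  In this sketch the wrapper name
and the 'nthreads' parameter are made up, and the volatile/"memory" clobber is
a conservative addition rather than what the patch itself uses:

  /* Illustrative sketch only: make all NTHREADS threads of the team wait on
     hardware barrier 0.  The waiter-count operand is in threads and is
     pre-multiplied by the warp width (32).  */
  static inline void
  nvptx_team_sync (unsigned nthreads)
  {
    asm volatile ("bar.sync 0, %0;" : : "r" (32 * nthreads) : "memory");
  }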
We should be able to provide the gomp barrier via a combination of PTX barriers
and atomics.  This patch is a first step in that direction.  It's mostly a copy
of the Linux implementation, and it's very likely that the functions more
complex than gomp_barrier_wait_end are implemented incorrectly.  I will have to
review all of that (and optimize, hopefully).

I'm not sure if naked asm()s are OK; it would be possible to implement a
builtin instead, as a minor beautification.  Thoughts?
---
 libgomp/config/nvptx/bar.c | 210 +++++++++++++++++++++++++++++++++++++++++++++
 libgomp/config/nvptx/bar.h | 129 +++++++++++++++++++++++++++-
 2 files changed, 338 insertions(+), 1 deletion(-)

diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index e69de29..5631cb4 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -0,0 +1,210 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is an NVPTX specific implementation of a barrier synchronization
+   mechanism for libgomp.  This type is private to the library.  This
+   implementation uses atomic instructions and bar.sync instruction.  */
+
+#include <limits.h>
+#include "libgomp.h"
+
+
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+                        MEMMODEL_RELEASE);
+    }
+  asm ("bar.sync 0, %0;" : : "r"(32*bar->total));
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+   is not the last one to hit the barrier, it returns immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
+
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+#if 0
+  gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
+  if (state & BAR_WAS_LAST)
+    gomp_barrier_wait_end (bar, state);
+#else
+  gomp_barrier_wait (bar);
+#endif
+}
+
+void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  asm ("bar.sync 0, %0;" : : "r"(32*bar->total));
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  gomp_barrier_wait_end (bar, state);
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          state &= ~BAR_WAS_LAST;
+        }
+      else
+        {
+          state &= ~BAR_CANCELLED;
+          state += BAR_INCR - BAR_WAS_LAST;
+          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+          asm ("bar.sync 0, %0;" : : "r"(32*bar->total));
+          return;
+        }
+    }
+
+  generation = state;
+  state &= ~BAR_CANCELLED;
+  do
+    {
+      asm ("bar.sync 0, %0;" : : "r"(32*bar->total));
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+        }
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+                                   gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+         cancellation means that at least one of the threads has been
+         cancelled, thus on a cancellable barrier we should never see
+         all threads to arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          state &= ~BAR_WAS_LAST;
+        }
+      else
+        {
+          state += BAR_INCR - BAR_WAS_LAST;
+          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
+          asm ("bar.sync 0, %0;" : : "r"(32*bar->total));
+          return false;
+        }
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  generation = state;
+  do
+    {
+      asm ("bar.sync 0, %0;" : : "r"(32*bar->total));
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+        return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+        {
+          gomp_barrier_handle_tasks (state);
+          gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+        }
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  gomp_team_barrier_wake (&team->barrier, INT_MAX);
+}
diff --git a/libgomp/config/nvptx/bar.h b/libgomp/config/nvptx/bar.h
index 009d85f..bbdc466 100644
--- a/libgomp/config/nvptx/bar.h
+++ b/libgomp/config/nvptx/bar.h
@@ -24,15 +24,142 @@
 /* This is an NVPTX specific implementation of a barrier synchronization
    mechanism for libgomp.  This type is private to the library.  This
-   implementation is a stub, for now.  */
+   implementation uses atomic instructions and bar.sync instruction.  */
 
 #ifndef GOMP_BARRIER_H
 #define GOMP_BARRIER_H 1
 
+#include "mutex.h"
+
 typedef struct
 {
+  unsigned total;
+  unsigned generation;
+  unsigned awaited;
+  unsigned awaited_final;
 } gomp_barrier_t;
 typedef unsigned int gomp_barrier_state_t;
 
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING        1
+#define BAR_WAS_LAST            1
+#define BAR_WAITING_FOR_TASK    2
+#define BAR_CANCELLED           4
+#define BAR_INCR                8
+
+static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+{
+  bar->total = count;
+  bar->awaited = count;
+  bar->awaited_final = count;
+  bar->generation = 0;
+}
+
+static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+{
+  __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL);
+  bar->total = count;
+}
+
+static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
+{
+}
+
+extern void gomp_barrier_wait (gomp_barrier_t *);
+extern void gomp_barrier_wait_last (gomp_barrier_t *);
+extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+                                        gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+                                               gomp_barrier_state_t);
+extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* A memory barrier is needed before exiting from the various forms
+     of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
+     2.8.6 flush Construct, which says there is an implicit flush during
+     a barrier region.  This is a convenient place to add the barrier,
+     so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE.  */
+  if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+   bar->awaited_final rather than bar->awaited and should be used
+   for the gomp_team_end barrier only.  */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* See above gomp_barrier_wait_start comment.  */
+  if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline bool
+gomp_barrier_last_thread (gomp_barrier_state_t state)
+{
+  return state & BAR_WAS_LAST;
+}
+
+/* All the inlines below must be called with team->task_lock
+   held.  */
+
+static inline void
+gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+{
+  bar->generation |= BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+{
+  bar->generation &= ~BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+{
+  bar->generation |= BAR_WAITING_FOR_TASK;
+}
+
+static inline bool
+gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+{
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+}
+
+static inline void
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
+}
+
 #endif /* GOMP_BARRIER_H */