Hi,

Consider the following omp fragment.
...
  #pragma omp target
  #pragma omp parallel num_threads (2)
  #pragma omp task
    ;
...
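
A minimal self-contained reproducer (assuming an nvptx offload device and
a toolchain with nvptx offloading enabled; compile with -fopenmp -O0)
would look like:
...
int
main (void)
{
#pragma omp target
#pragma omp parallel num_threads (2)
#pragma omp task
  ;

  return 0;
}
...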

This hangs at -O0 for nvptx.

Investigating the behaviour gives us the following trace of events (an
annotated code excerpt follows the list):
- both threads execute GOMP_task, where they:
  - deposit a task, and
  - execute gomp_team_barrier_wake
- thread 1 executes gomp_team_barrier_wait_end and, not being the last thread,
  proceeds to wait at the team barrier
- thread 0 executes gomp_team_barrier_wait_end and, being the last thread, it
  calls gomp_barrier_handle_tasks, where it:
  - executes both tasks and marks the team barrier done
  - executes a gomp_team_barrier_wake which wakes up thread 1
- thread 1 exits the team barrier
- thread 0 returns from gomp_barrier_handle_tasks and goes to wait at
  the team barrier.
- thread 0 hangs.
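
An annotated excerpt of the pre-patch gomp_team_barrier_wait_end
(condensed from libgomp/config/nvptx/bar.c; the cancellation handling and
other details are elided) shows where thread 0 ends up:
...
  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    {
      ...
      if (__builtin_expect (team->task_count, 0))
        {
          /* Thread 0, being the last thread, gets here: it runs both
             tasks, marks the team barrier done and wakes up thread 1.  */
          gomp_barrier_handle_tasks (state);
          state &= ~BAR_WAS_LAST;
        }
      ...
    }

  state &= ~BAR_CANCELLED;
  do
    {
      /* Thread 0 falls through to here.  bar->total is still 2, so the
         bar.sync below waits for a second thread to join -- which never
         happens, because thread 1 has already left the team barrier.  */
      if (bar->total > 1)
        asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
      ...
    }
  while (...);
...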

To understand why there is a hang here, it helps to understand how things
are set up for nvptx.  The libgomp/config/nvptx/bar.c implementation is
a copy of the libgomp/config/linux/bar.c implementation, with the uses of
futex_wake and do_wait replaced by uses of the nvptx insn bar.sync:
...
  if (bar->total > 1)
    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
...

The point where thread 0 goes to wait at the team barrier corresponds,
in the linux implementation, to a do_wait call.  In the linux case, the
call to do_wait doesn't hang: it waits for bar->generation to reach a
certain value, and if bar->generation already has that value, it just
proceeds, without any need for coordination with other threads.
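
For reference, the linux do_wait boils down to futex semantics: block only
while the location still holds the expected value (a sketch from memory of
libgomp/config/linux/wait.h; the exact details may differ):
...
  static inline void
  do_wait (int *addr, int val)
  {
    if (do_spin (addr, val))
      futex_wait (addr, val);
  }
...
So if bar->generation has already moved on, do_spin sees the new value
right away, futex_wait is skipped (and would return immediately anyway),
and the waiting thread just proceeds.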

In the nvptx case, the bar.sync waits until thread 1 joins it in the same
logical barrier, which never happens: thread 1 is lingering in the
thread pool at the thread pool barrier (using a different logical barrier),
waiting to join a new team.

The easiest way to fix this is to revert to the posix implementation for
bar.{c,h}.

Another way would be to revert to the linux implementation for bar.{c,h},
and implement the primitives futex_wait and do_wait using nvptx insns.

This patch instead implements a minimal fix (which makes the implementation
deviate further from the linux one).

The hang was only observed in gomp_team_barrier_wait_end, but we propagate the
fix to its twin gomp_team_barrier_wait_cancel_end as well.

The fix is based on the assumptions that at the point of the fix, after the
call to gomp_barrier_handle_tasks:
- all tasks are done
  (an assert is added to check this), and consequently:
- the executing thread is the only thread left in the team barrier
  (so it's accurate to set nthreads to 1).

Tested libgomp on x86_64 with nvptx accelerator.

Any comments?

Thanks,
- Tom

[libgomp, nvptx] Fix hang in gomp_team_barrier_wait_end

libgomp/ChangeLog:

2021-04-20  Tom de Vries  <tdevr...@suse.de>

        PR target/99555
        * config/nvptx/bar.c (gomp_team_barrier_wait_end)
        (gomp_team_barrier_wait_cancel_end): Don't try to sync with team threads
        that have left the team barrier.
        * testsuite/libgomp.c-c++-common/task-detach-6.c: Remove nvptx-specific
        workarounds.
        * testsuite/libgomp.c/pr99555-1.c: Same.
        * testsuite/libgomp.fortran/task-detach-6.f90: Same.

---
 libgomp/config/nvptx/bar.c                         | 32 ++++++++++++++++------
 .../testsuite/libgomp.c-c++-common/task-detach-6.c |  8 ------
 libgomp/testsuite/libgomp.c/pr99555-1.c            |  8 ------
 .../testsuite/libgomp.fortran/task-detach-6.f90    | 12 --------
 4 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index c5c2fa8829b..058a8d4d5ca 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -78,6 +78,7 @@ void
 gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
   unsigned int generation, gen;
+  unsigned int nthreads = bar->total;
 
   if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
@@ -90,6 +91,15 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       if (__builtin_expect (team->task_count, 0))
        {
          gomp_barrier_handle_tasks (state);
+         /* Assert that all tasks have been handled.  */
+         if (team->task_count != 0)
+           __builtin_abort ();
+         /* In gomp_barrier_handle_tasks, the team barrier has been marked
+            as done, and all pending threads woken up.  So this is now the
+            last and only thread in the barrier.  Adjust nthreads to
+            reflect the new situation, to make sure we don't hang
+            indefinitely at the bar.sync below.  */
+         nthreads = 1;
          state &= ~BAR_WAS_LAST;
        }
       else
@@ -97,8 +107,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
          state &= ~BAR_CANCELLED;
          state += BAR_INCR - BAR_WAS_LAST;
          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-         if (bar->total > 1)
-           asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+         if (nthreads > 1)
+           asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
          return;
        }
     }
@@ -107,8 +117,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
   state &= ~BAR_CANCELLED;
   do
     {
-      if (bar->total > 1)
-       asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (nthreads > 1)
+       asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
        {
@@ -140,6 +150,7 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
                                   gomp_barrier_state_t state)
 {
   unsigned int generation, gen;
+  unsigned int nthreads = bar->total;
 
   if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
@@ -156,14 +167,19 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
       if (__builtin_expect (team->task_count, 0))
        {
          gomp_barrier_handle_tasks (state);
+         /* Assert that all tasks have been handled.  */
+         if (team->task_count != 0)
+           __builtin_abort ();
+         /* See comment in gomp_team_barrier_wait_end.  */
+         nthreads = 1;
          state &= ~BAR_WAS_LAST;
        }
       else
        {
          state += BAR_INCR - BAR_WAS_LAST;
          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-         if (bar->total > 1)
-           asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+         if (nthreads > 1)
+           asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
          return false;
        }
     }
@@ -174,8 +190,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
   generation = state;
   do
     {
-      if (bar->total > 1)
-       asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (nthreads > 1)
+       asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
        return true;
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c b/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c
index f18b57bf047..e5c2291e6ff 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c
@@ -2,9 +2,6 @@
 
 #include <omp.h>
 #include <assert.h>
-#include <unistd.h> // For 'alarm'.
-
-#include "on_device_arch.h"
 
 /* Test tasks with detach clause on an offload device.  Each device
    thread spawns off a chain of tasks, that can then be executed by
@@ -12,11 +9,6 @@
 
 int main (void)
 {
-  //TODO See '../libgomp.c/pr99555-1.c'.
-  if (on_device_arch_nvptx ())
-    alarm (4); /*TODO Until resolved, make sure that we exit quickly, with error status.
-                { dg-xfail-run-if "PR99555" { offload_device_nvptx } } */
-
   int x = 0, y = 0, z = 0;
   int thread_count;
   omp_event_handle_t detach_event1, detach_event2;
diff --git a/libgomp/testsuite/libgomp.c/pr99555-1.c b/libgomp/testsuite/libgomp.c/pr99555-1.c
index bd33b93716b..7386e016fd2 100644
--- a/libgomp/testsuite/libgomp.c/pr99555-1.c
+++ b/libgomp/testsuite/libgomp.c/pr99555-1.c
@@ -2,16 +2,8 @@
 
 // { dg-additional-options "-O0" }
 
-#include <unistd.h> // For 'alarm'.
-
-#include "../libgomp.c-c++-common/on_device_arch.h"
-
 int main (void)
 {
-  if (on_device_arch_nvptx ())
-    alarm (4); /*TODO Until resolved, make sure that we exit quickly, with error status.
-                { dg-xfail-run-if "PR99555" { offload_device_nvptx } } */
-
 #pragma omp target
 #pragma omp parallel // num_threads(1)
 #pragma omp task
diff --git a/libgomp/testsuite/libgomp.fortran/task-detach-6.f90 b/libgomp/testsuite/libgomp.fortran/task-detach-6.f90
index e4373b4c6f1..03a3b61540d 100644
--- a/libgomp/testsuite/libgomp.fortran/task-detach-6.f90
+++ b/libgomp/testsuite/libgomp.fortran/task-detach-6.f90
@@ -1,6 +1,5 @@
 ! { dg-do run }
 
-! { dg-additional-sources on_device_arch.c }
  ! { dg-prune-output "command-line option '-fintrinsic-modules-path=.*' is valid for Fortran but not for C" }
 
 ! Test tasks with detach clause on an offload device.  Each device
@@ -14,17 +13,6 @@ program task_detach_6
   integer :: x = 0, y = 0, z = 0
   integer :: thread_count
 
-  interface
-    integer function on_device_arch_nvptx() bind(C)
-    end function on_device_arch_nvptx
-  end interface
-
-  !TODO See '../libgomp.c/pr99555-1.c'.
-  if (on_device_arch_nvptx () /= 0) then
-     call alarm (4, 0); !TODO Until resolved, make sure that we exit quickly, with error status.
-     ! { dg-xfail-run-if "PR99555" { offload_device_nvptx } }
-  end if
-
   !$omp target map (tofrom: x, y, z) map (from: thread_count)
     !$omp parallel private (detach_event1, detach_event2)
       !$omp single
