Hi,

Consider the following omp fragment.
...
  #pragma omp target
  #pragma omp parallel num_threads (2)
  #pragma omp task
    ;
...
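For context, a complete stand-alone version of that fragment might look as
follows (illustration only, not part of the patch; the file name and compile
command in the comment are just examples):

/* hang.c -- minimal stand-alone illustration of the fragment above.
   Build with an nvptx-offloading-enabled compiler, e.g.
   "gcc -fopenmp -O0 hang.c" (example command, adjust as needed).  */

int
main (void)
{
#pragma omp target
#pragma omp parallel num_threads (2)
#pragma omp task
  ;

  return 0;
}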
This hangs at -O0 for nvptx.

Investigating the behaviour gives us the following trace of events:
- both threads execute GOMP_task, where they:
  - deposit a task, and
  - execute gomp_team_barrier_wake
- thread 1 executes gomp_team_barrier_wait_end and, not being the last
  thread, proceeds to wait at the team barrier
- thread 0 executes gomp_team_barrier_wait_end and, being the last thread,
  calls gomp_barrier_handle_tasks, where it:
  - executes both tasks and marks the team barrier as done
  - executes a gomp_team_barrier_wake, which wakes up thread 1
- thread 1 exits the team barrier
- thread 0 returns from gomp_barrier_handle_tasks and goes to wait at the
  team barrier
- thread 0 hangs.

To understand why there is a hang here, it's good to understand how things
are set up for nvptx.

The libgomp/config/nvptx/bar.c implementation is a copy of the
libgomp/config/linux/bar.c implementation, with the uses of both futex_wake
and do_wait replaced with uses of the nvptx insn bar.sync:
...
  if (bar->total > 1)
    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
...

The point where thread 0 goes to wait at the team barrier corresponds, in
the linux implementation, to a do_wait.  In the linux case, the call to
do_wait doesn't hang, because it's waiting for bar->generation to become a
certain value, and if bar->generation already has that value, it just
proceeds, without any need for coordination with other threads.

In the nvptx case, the bar.sync waits until thread 1 joins it in the same
logical barrier, which never happens: thread 1 is lingering in the thread
pool at the thread pool barrier (using a different logical barrier), waiting
to join a new team.  (A simplified sketch contrasting the two wait styles
follows the ChangeLog below.)

The easiest way to fix this is to revert to the posix implementation for
bar.{c,h}.

Another way would be to revert to the linux implementation for bar.{c,h},
and implement the primitives futex_wake and do_wait using nvptx insns.

This patch instead implements a minimal fix (which makes the implementation
deviate further from the linux one).

The hang was only observed in gomp_team_barrier_wait_end, but we propagate
the fix to its twin gomp_team_barrier_wait_cancel_end as well.

The fix is based on the assumptions that at the point of the fix, after the
call to gomp_barrier_handle_tasks:
- all tasks are done (an assert is added to check this), and consequently:
- the executing thread is the only thread left in the team barrier (so it's
  accurate to set nthreads to 1).

Tested libgomp on x86_64 with nvptx accelerator.

Any comments?

Thanks,
- Tom

[libgomp, nvptx] Fix hang in gomp_team_barrier_wait_end

libgomp/ChangeLog:

2021-04-20  Tom de Vries  <tdevr...@suse.de>

        PR target/99555
        * config/nvptx/bar.c (gomp_team_barrier_wait_end)
        (gomp_team_barrier_wait_cancel_end): Don't try to sync with team
        threads that have left the team barrier.
        * testsuite/libgomp.c-c++-common/task-detach-6.c: Remove
        nvptx-specific workarounds.
        * testsuite/libgomp.c/pr99555-1.c: Same.
        * testsuite/libgomp.fortran/task-detach-6.f90: Same.
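Illustration only, not part of the patch: a simplified sketch contrasting
the two wait styles discussed above.  The helper names wait_linux_style and
wait_nvptx_style and their signatures are paraphrased, not the actual
libgomp code.

/* Simplified sketch, NOT the actual libgomp sources; names and signatures
   are paraphrased to show only the semantic difference.  */

/* linux style: sleeps only while the generation counter still equals VAL.
   If the counter has already advanced, it returns immediately, so no other
   thread needs to participate.  */
static void
wait_linux_style (unsigned int *generation, unsigned int val)
{
  while (__atomic_load_n (generation, __ATOMIC_ACQUIRE) == val)
    ; /* spin, or sleep on a futex keyed to *generation == val */
}

/* nvptx style: bar.sync blocks until NTHREADS threads reach the same
   logical barrier, independent of any generation counter.  If one of the
   counted threads never arrives, the others hang.  */
static void
wait_nvptx_style (unsigned int nthreads)
{
  if (nthreads > 1)
    asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
}

The first style cannot hang on a thread that has already left the barrier,
because the generation counter has already advanced; the second relies on
every counted thread actually arriving at the same bar.sync.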
---
 libgomp/config/nvptx/bar.c                         | 32 ++++++++++++++++------
 .../testsuite/libgomp.c-c++-common/task-detach-6.c |  8 ------
 libgomp/testsuite/libgomp.c/pr99555-1.c            |  8 ------
 .../testsuite/libgomp.fortran/task-detach-6.f90    | 12 --------
 4 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index c5c2fa8829b..058a8d4d5ca 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -78,6 +78,7 @@ void
 gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 {
   unsigned int generation, gen;
+  unsigned int nthreads = bar->total;
 
   if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
@@ -90,6 +91,15 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       if (__builtin_expect (team->task_count, 0))
         {
           gomp_barrier_handle_tasks (state);
+          /* Assert that all tasks have been handled.  */
+          if (team->task_count != 0)
+            __builtin_abort ();
+          /* In gomp_barrier_handle_tasks, the team barrier has been marked
+             as done, and all pending threads woken up.  So this is now the
+             last and only thread in the barrier.  Adjust nthreads to
+             reflect the new situation, to make sure we don't hang
+             indefinitely at the bar.sync below.  */
+          nthreads = 1;
           state &= ~BAR_WAS_LAST;
         }
       else
@@ -97,8 +107,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
           state &= ~BAR_CANCELLED;
           state += BAR_INCR - BAR_WAS_LAST;
           __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-          if (bar->total > 1)
-            asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+          if (nthreads > 1)
+            asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
           return;
         }
     }
@@ -107,8 +117,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
   state &= ~BAR_CANCELLED;
   do
     {
-      if (bar->total > 1)
-        asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (nthreads > 1)
+        asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
         {
@@ -140,6 +150,7 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
                                    gomp_barrier_state_t state)
 {
   unsigned int generation, gen;
+  unsigned int nthreads = bar->total;
 
   if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
@@ -156,14 +167,19 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
       if (__builtin_expect (team->task_count, 0))
         {
           gomp_barrier_handle_tasks (state);
+          /* Assert that all tasks have been handled.  */
+          if (team->task_count != 0)
+            __builtin_abort ();
+          /* See comment in gomp_team_barrier_wait_end.  */
+          nthreads = 1;
           state &= ~BAR_WAS_LAST;
         }
       else
         {
           state += BAR_INCR - BAR_WAS_LAST;
           __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-          if (bar->total > 1)
-            asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+          if (nthreads > 1)
+            asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
           return false;
         }
     }
@@ -174,8 +190,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
   generation = state;
   do
     {
-      if (bar->total > 1)
-        asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (nthreads > 1)
+        asm ("bar.sync 1, %0;" : : "r" (32 * nthreads));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
         return true;
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c b/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c
index f18b57bf047..e5c2291e6ff 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-detach-6.c
@@ -2,9 +2,6 @@
 
 #include <omp.h>
 #include <assert.h>
-#include <unistd.h> // For 'alarm'.
-
-#include "on_device_arch.h"
 
 /* Test tasks with detach clause on an offload device.  Each device
    thread spawns off a chain of tasks, that can then be executed by
@@ -12,11 +9,6 @@
 
 int main (void)
 {
-  //TODO See '../libgomp.c/pr99555-1.c'.
-  if (on_device_arch_nvptx ())
-    alarm (4); /*TODO Until resolved, make sure that we exit quickly, with error status.
-               { dg-xfail-run-if "PR99555" { offload_device_nvptx } } */
-
   int x = 0, y = 0, z = 0;
   int thread_count;
   omp_event_handle_t detach_event1, detach_event2;
diff --git a/libgomp/testsuite/libgomp.c/pr99555-1.c b/libgomp/testsuite/libgomp.c/pr99555-1.c
index bd33b93716b..7386e016fd2 100644
--- a/libgomp/testsuite/libgomp.c/pr99555-1.c
+++ b/libgomp/testsuite/libgomp.c/pr99555-1.c
@@ -2,16 +2,8 @@
 
 // { dg-additional-options "-O0" }
 
-#include <unistd.h> // For 'alarm'.
-
-#include "../libgomp.c-c++-common/on_device_arch.h"
-
 int main (void)
 {
-  if (on_device_arch_nvptx ())
-    alarm (4); /*TODO Until resolved, make sure that we exit quickly, with error status.
-               { dg-xfail-run-if "PR99555" { offload_device_nvptx } } */
-
 #pragma omp target
 #pragma omp parallel // num_threads(1)
 #pragma omp task
diff --git a/libgomp/testsuite/libgomp.fortran/task-detach-6.f90 b/libgomp/testsuite/libgomp.fortran/task-detach-6.f90
index e4373b4c6f1..03a3b61540d 100644
--- a/libgomp/testsuite/libgomp.fortran/task-detach-6.f90
+++ b/libgomp/testsuite/libgomp.fortran/task-detach-6.f90
@@ -1,6 +1,5 @@
 ! { dg-do run }
 
-! { dg-additional-sources on_device_arch.c }
 ! { dg-prune-output "command-line option '-fintrinsic-modules-path=.*' is valid for Fortran but not for C" }
 
 ! Test tasks with detach clause on an offload device.  Each device
@@ -14,17 +13,6 @@ program task_detach_6
   integer :: x = 0, y = 0, z = 0
   integer :: thread_count
 
-  interface
-    integer function on_device_arch_nvptx() bind(C)
-    end function on_device_arch_nvptx
-  end interface
-
-  !TODO See '../libgomp.c/pr99555-1.c'.
-  if (on_device_arch_nvptx () /= 0) then
-     call alarm (4, 0); !TODO Until resolved, make sure that we exit quickly, with error status.
-     ! { dg-xfail-run-if "PR99555" { offload_device_nvptx } }
-  end if
-
   !$omp target map (tofrom: x, y, z) map (from: thread_count)
   !$omp parallel private (detach_event1, detach_event2)
   !$omp single