Hello world, as you can see from the PR, async_io_4.f90 exhibited a race condition that showed up only once in every few thousand to tens of thousands of runs.
This was difficult to track down; it took some thinking, discussion with Nicolas, and enabling the debugging feature of the async I/O library, which finally produced a failure that could then be analyzed. The solution was to remove the separate mutexes attached to the individual condition variables in the async library, relying on the lock that protects the async I/O queue instead. Tested by Bill Seurer (thanks!) with a previous version of the patch, and with valgrind --tool=helgrind and valgrind --tool=drd, both of which showed no (new) failures for all of the async_io_*.f90 tests. So, OK for trunk and gcc-9? Regards Thomas 2020-02-17 Thomas Koenig <tkoe...@gcc.gnu.org> PR fortran/93599 * io/async.c (destroy_adv_cond): Do not destroy lock. (async_io): Make sure au->lock is locked for finishing of thread. Do not lock/unlock around signalling emptysignal. Unlock au->lock before return. (init_adv_cond): Do not initialize lock. (enqueue_transfer): Unlock after signal. (enqueue_done_id): Likewise. (enqueue_done): Likewise. (enqueue_close): Likewise. (enqueue_data_transfer_init): Likewise. (async_wait_id): Do not lock/unlock around signalling au->work. (async_wait): Unlock after signal. * io/async.h (SIGNAL): Add comment about needed au->lock. Remove locking/unlocking of advcond->lock. (WAIT_SIGNAL_MUTEX): Add comment. Remove locking/unlocking of advcond->lock. Unlock mutex only at the end. Loop on __gthread_cond_wait returning zero. (REVOKE_SIGNAL): Add comment. Remove locking/unlocking of advcond->lock. (struct adv_cond): Remove mutex from struct.
diff --git a/libgfortran/io/async.c b/libgfortran/io/async.c index ab214af8a66..63b9158c0ba 100644 --- a/libgfortran/io/async.c +++ b/libgfortran/io/async.c @@ -80,7 +80,6 @@ update_pdt (st_parameter_dt **old, st_parameter_dt *new) { static void destroy_adv_cond (struct adv_cond *ac) { - T_ERROR (__gthread_mutex_destroy, &ac->lock); T_ERROR (__gthread_cond_destroy, &ac->signal); } @@ -156,6 +155,7 @@ async_io (void *arg) case AIO_CLOSE: NOTE ("Received AIO_CLOSE"); + LOCK (&au->lock); goto finish_thread; default: @@ -175,7 +175,6 @@ async_io (void *arg) else if (ctq->type == AIO_CLOSE) { NOTE ("Received AIO_CLOSE during error condition"); - UNLOCK (&au->lock); goto finish_thread; } } @@ -189,9 +188,7 @@ async_io (void *arg) au->tail = NULL; au->head = NULL; au->empty = 1; - UNLOCK (&au->lock); SIGNAL (&au->emptysignal); - LOCK (&au->lock); } finish_thread: au->tail = NULL; @@ -199,6 +196,7 @@ async_io (void *arg) au->empty = 1; SIGNAL (&au->emptysignal); free (ctq); + UNLOCK (&au->lock); return NULL; } @@ -223,7 +221,6 @@ static void init_adv_cond (struct adv_cond *ac) { ac->pending = 0; - __GTHREAD_MUTEX_INIT_FUNCTION (&ac->lock); __GTHREAD_COND_INIT_FUNCTION (&ac->signal); } @@ -279,8 +276,8 @@ enqueue_transfer (async_unit *au, transfer_args *arg, enum aio_do type) au->tail = tq; REVOKE_SIGNAL (&(au->emptysignal)); au->empty = false; - UNLOCK (&au->lock); SIGNAL (&au->work); + UNLOCK (&au->lock); } /* Enqueue an st_write_done or st_read_done which contains an ID. */ @@ -303,8 +300,8 @@ enqueue_done_id (async_unit *au, enum aio_do type) au->empty = false; ret = au->id.high++; NOTE ("Enqueue id: %d", ret); - UNLOCK (&au->lock); SIGNAL (&au->work); + UNLOCK (&au->lock); return ret; } @@ -324,8 +321,8 @@ enqueue_done (async_unit *au, enum aio_do type) au->tail = tq; REVOKE_SIGNAL (&(au->emptysignal)); au->empty = false; - UNLOCK (&au->lock); SIGNAL (&au->work); + UNLOCK (&au->lock); } /* Enqueue a CLOSE statement. 
*/ @@ -344,8 +341,8 @@ enqueue_close (async_unit *au) au->tail = tq; REVOKE_SIGNAL (&(au->emptysignal)); au->empty = false; - UNLOCK (&au->lock); SIGNAL (&au->work); + UNLOCK (&au->lock); } /* The asynchronous unit keeps the currently active PDT around. @@ -374,9 +371,9 @@ enqueue_data_transfer_init (async_unit *au, st_parameter_dt *dt, int read_flag) au->tail->next = tq; au->tail = tq; REVOKE_SIGNAL (&(au->emptysignal)); - au->empty = 0; - UNLOCK (&au->lock); + au->empty = false; SIGNAL (&au->work); + UNLOCK (&au->lock); } /* Collect the errors that may have happened asynchronously. Return true if @@ -430,9 +427,7 @@ async_wait_id (st_parameter_common *cmp, async_unit *au, int i) NOTE ("Waiting for id %d", i); if (au->id.waiting < i) au->id.waiting = i; - UNLOCK (&au->lock); SIGNAL (&(au->work)); - LOCK (&au->lock); WAIT_SIGNAL_MUTEX (&(au->id.done), (au->id.low >= au->id.waiting || au->empty), &au->lock); LOCK (&au->lock); @@ -454,8 +449,8 @@ async_wait (st_parameter_common *cmp, async_unit *au) if (cmp == NULL) cmp = au->error.cmp; - SIGNAL (&(au->work)); LOCK (&(au->lock)); + SIGNAL (&(au->work)); if (au->empty) { diff --git a/libgfortran/io/async.h b/libgfortran/io/async.h index c6b2e0f94bd..17d303c127b 100644 --- a/libgfortran/io/async.h +++ b/libgfortran/io/async.h @@ -229,44 +229,44 @@ #if ASYNC_IO +/* au->lock has to be held when calling this macro. */ + #define SIGNAL(advcond) do{ \ - INTERN_LOCK (&(advcond)->lock); \ (advcond)->pending = 1; \ DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE "SIGNAL: " DEBUG_NORM \ #advcond, __FUNCTION__, __LINE__, (void *) advcond); \ T_ERROR (__gthread_cond_broadcast, &(advcond)->signal); \ - INTERN_UNLOCK (&(advcond)->lock); \ } while (0) +/* Has to be entered with mutex locked. 
*/ + #define WAIT_SIGNAL_MUTEX(advcond, condition, mutex) do{ \ __label__ finish; \ - INTERN_LOCK (&((advcond)->lock)); \ DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_BLUE "WAITING: " DEBUG_NORM \ #advcond, __FUNCTION__, __LINE__, (void *) advcond); \ - if ((advcond)->pending || (condition)){ \ - UNLOCK (mutex); \ + if ((advcond)->pending || (condition)) \ goto finish; \ - } \ - UNLOCK (mutex); \ - while (!__gthread_cond_wait(&(advcond)->signal, &(advcond)->lock)) { \ - { int cond; \ - LOCK (mutex); cond = condition; UNLOCK (mutex); \ - if (cond){ \ - DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE "REC: " DEBUG_NORM \ + while (1) \ + { \ + int err_ret = __gthread_cond_wait(&(advcond)->signal, mutex); \ + if (err_ret) internal_error (NULL, "WAIT_SIGNAL_MUTEX failed"); \ + if (condition) \ + { \ + DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE \ + "REC: " DEBUG_NORM \ #advcond, __FUNCTION__, __LINE__, (void *)advcond); \ break; \ } \ } \ - } \ finish: \ (advcond)->pending = 0; \ - INTERN_UNLOCK (&((advcond)->lock)); \ + UNLOCK (mutex); \ } while (0) +/* au->lock has to be held when calling this macro. */ + #define REVOKE_SIGNAL(advcond) do{ \ - INTERN_LOCK (&(advcond)->lock); \ (advcond)->pending = 0; \ - INTERN_UNLOCK (&(advcond)->lock); \ } while (0) #else @@ -330,7 +330,6 @@ struct adv_cond { #if ASYNC_IO int pending; - __gthread_mutex_t lock; __gthread_cond_t signal; #endif };