Hello world,

as you can see from the PR, async_io_4.f90 exhibited a race condition
that showed up only once every few thousand to tens of thousands of runs.

This was difficult to track down; it took some thinking, discussion
with Nicolas, and enabling the debugging feature of the async I/O,
which finally produced a failure that could then be analyzed.

The solution was to remove the separate mutexes that each condition
variable in the async library carried, relying on the lock that
protects the async I/O queue (au->lock) instead.

Tested by Bill Seurer (thanks!) with a previous version of the patch,
and with valgrind --tool=helgrind and valgrind --tool=drd, both of
which showed no (new) failures for all of the async_io_*.f90 tests.

So, OK for trunk and gcc-9?

Regards

        Thomas

2020-02-17  Thomas Koenig  <tkoe...@gcc.gnu.org>

        PR fortran/93599
        * io/async.c (destroy_adv_cond): Do not destroy lock.
        (async_io): Make sure au->lock is locked for finishing of thread.
        Do not lock/unlock around signalling emptysignal. Unlock au->lock
        before return.
        (init_adv_cond): Do not initialize lock.
        (enqueue_transfer): Unlock after signal.
        (enqueue_done_id): Likewise.
        (enqueue_done): Likewise.
        (enqueue_close): Likewise.
        (enqueue_data_transfer_init): Likewise.
        (async_wait_id): Do not lock/unlock around signalling au->work.
        (async_wait): Signal au->work only after taking au->lock.
        * io/async.h (SIGNAL): Add comment about needed au->lock.
        Remove locking/unlocking of advcond->lock.
        (WAIT_SIGNAL_MUTEX): Add comment. Remove locking/unlocking of
        advcond->lock.  Unlock mutex only at the end.  Loop on
        __gthread_cond_wait returning zero.
        (REVOKE_SIGNAL): Add comment. Remove locking/unlocking of
        advcond->lock.
        (struct adv_cond): Remove mutex from struct.
diff --git a/libgfortran/io/async.c b/libgfortran/io/async.c
index ab214af8a66..63b9158c0ba 100644
--- a/libgfortran/io/async.c
+++ b/libgfortran/io/async.c
@@ -80,7 +80,6 @@ update_pdt (st_parameter_dt **old, st_parameter_dt *new) {
 static void
 destroy_adv_cond (struct adv_cond *ac)
 {
-  T_ERROR (__gthread_mutex_destroy, &ac->lock);
   T_ERROR (__gthread_cond_destroy, &ac->signal);
 }
 
@@ -156,6 +155,7 @@ async_io (void *arg)
 
 		case AIO_CLOSE:
 		  NOTE ("Received AIO_CLOSE");
+		  LOCK (&au->lock);
 		  goto finish_thread;
 
 		default:
@@ -175,7 +175,6 @@ async_io (void *arg)
 	      else if (ctq->type == AIO_CLOSE)
 		{
 		  NOTE ("Received AIO_CLOSE during error condition");
-		  UNLOCK (&au->lock);
 		  goto finish_thread;
 		}
 	    }
@@ -189,9 +188,7 @@ async_io (void *arg)
       au->tail = NULL;
       au->head = NULL;
       au->empty = 1;
-      UNLOCK (&au->lock);
       SIGNAL (&au->emptysignal);
-      LOCK (&au->lock);
     }
  finish_thread:
   au->tail = NULL;
@@ -199,6 +196,7 @@ async_io (void *arg)
   au->empty = 1;
   SIGNAL (&au->emptysignal);
   free (ctq);
+  UNLOCK (&au->lock);
   return NULL;
 }
 
@@ -223,7 +221,6 @@ static void
 init_adv_cond (struct adv_cond *ac)
 {
   ac->pending = 0;
-  __GTHREAD_MUTEX_INIT_FUNCTION (&ac->lock);
   __GTHREAD_COND_INIT_FUNCTION (&ac->signal);
 }
 
@@ -279,8 +276,8 @@ enqueue_transfer (async_unit *au, transfer_args *arg, enum aio_do type)
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
   au->empty = false;
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* Enqueue an st_write_done or st_read_done which contains an ID.  */
@@ -303,8 +300,8 @@ enqueue_done_id (async_unit *au, enum aio_do type)
   au->empty = false;
   ret = au->id.high++;
   NOTE ("Enqueue id: %d", ret);
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
   return ret;
 }
 
@@ -324,8 +321,8 @@ enqueue_done (async_unit *au, enum aio_do type)
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
   au->empty = false;
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* Enqueue a CLOSE statement.  */
@@ -344,8 +341,8 @@ enqueue_close (async_unit *au)
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
   au->empty = false;
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* The asynchronous unit keeps the currently active PDT around.
@@ -374,9 +371,9 @@ enqueue_data_transfer_init (async_unit *au, st_parameter_dt *dt, int read_flag)
     au->tail->next = tq;
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
-  au->empty = 0;
-  UNLOCK (&au->lock);
+  au->empty = false;
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* Collect the errors that may have happened asynchronously.  Return true if
@@ -430,9 +427,7 @@ async_wait_id (st_parameter_common *cmp, async_unit *au, int i)
   NOTE ("Waiting for id %d", i);
   if (au->id.waiting < i)
     au->id.waiting = i;
-  UNLOCK (&au->lock);
   SIGNAL (&(au->work));
-  LOCK (&au->lock);
   WAIT_SIGNAL_MUTEX (&(au->id.done),
 		     (au->id.low >= au->id.waiting || au->empty), &au->lock);
   LOCK (&au->lock);
@@ -454,8 +449,8 @@ async_wait (st_parameter_common *cmp, async_unit *au)
   if (cmp == NULL)
     cmp = au->error.cmp;
 
-  SIGNAL (&(au->work));
   LOCK (&(au->lock));
+  SIGNAL (&(au->work));
 
   if (au->empty)
     {
diff --git a/libgfortran/io/async.h b/libgfortran/io/async.h
index c6b2e0f94bd..17d303c127b 100644
--- a/libgfortran/io/async.h
+++ b/libgfortran/io/async.h
@@ -229,44 +229,44 @@
 
 #if ASYNC_IO
 
+/* au->lock has to be held when calling this macro.  */
+
 #define SIGNAL(advcond) do{						\
-    INTERN_LOCK (&(advcond)->lock);					\
     (advcond)->pending = 1;						\
     DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE "SIGNAL: " DEBUG_NORM \
 		 #advcond, __FUNCTION__, __LINE__, (void *) advcond);	\
     T_ERROR (__gthread_cond_broadcast, &(advcond)->signal);			\
-    INTERN_UNLOCK (&(advcond)->lock);					\
   } while (0)
 
+/* Has to be entered with mutex locked.  */
+
 #define WAIT_SIGNAL_MUTEX(advcond, condition, mutex) do{		\
     __label__ finish;		       					\
-    INTERN_LOCK (&((advcond)->lock));					\
     DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_BLUE "WAITING: " DEBUG_NORM \
 		 #advcond, __FUNCTION__, __LINE__, (void *) advcond);	\
-    if ((advcond)->pending || (condition)){				\
-      UNLOCK (mutex);							\
+    if ((advcond)->pending || (condition))				\
       goto finish;							\
-    }									\
-    UNLOCK (mutex);							\
-     while (!__gthread_cond_wait(&(advcond)->signal, &(advcond)->lock)) {	\
-       { int cond;							\
-	 LOCK (mutex); cond = condition; UNLOCK (mutex);	\
-	   if (cond){							\
-	     DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE "REC: " DEBUG_NORM \
+    while (1)								\
+      {									\
+	int err_ret = __gthread_cond_wait(&(advcond)->signal, mutex);	\
+	if (err_ret) internal_error (NULL, "WAIT_SIGNAL_MUTEX failed");	\
+	if (condition)							\
+	  {								\
+	    DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE \
+			  "REC: " DEBUG_NORM				\
 			  #advcond,  __FUNCTION__, __LINE__, (void *)advcond); \
 	    break;				      			\
 	  }								\
       }									\
-    }									\
   finish:								\
     (advcond)->pending = 0;						\
-		 INTERN_UNLOCK (&((advcond)->lock));			\
+    UNLOCK (mutex);							\
   } while (0)
 
+/* au->lock has to be held when calling this macro.  */
+
 #define REVOKE_SIGNAL(advcond) do{		\
-    INTERN_LOCK (&(advcond)->lock);		\
     (advcond)->pending = 0;			\
-    INTERN_UNLOCK (&(advcond)->lock);		\
   } while (0)
 
 #else
@@ -330,7 +330,6 @@ struct adv_cond
 {
 #if ASYNC_IO
   int pending;
-  __gthread_mutex_t lock;
   __gthread_cond_t signal;
 #endif
 };

Reply via email to