On 2024-05-28 I did:
>       * lib/pthread-once.c (pthread_once): Add an implementation for Cygwin.

Unfortunately this implementation had a race condition. All of the three
programs
  test-call_once2
  test-lock
  test-pthread-once2
fail with a probability between ca. 10% and 20%. Among 50 consecutive runs,
I see at least one fail.

The race is that when two threads get here:

  if (once_control->done == 0)
    {
      // ----------- HERE --------------
      once_control->num_threads += 1;

then thread1 proceeds until the end (executing the initfunction, setting
done = 1, and destroying the lock),
then thread2 will proceed and attempt to lock a lock that has already been
destroyed — which is undefined behaviour.

This patch series fixes the race condition. With it, 100 consecutive runs
of test-pthread-once2 succeed.


2024-05-31  Bruno Haible  <br...@clisp.org>

        pthread-once: Fix race in Cygwin workaround implementation.
        * lib/pthread-once.c (pthread_once): Test the 'done' word after
        incrementing num_threads. Make sure to invoke pthread_mutex_destroy
        only once.

        pthread-once: Simplify Cygwin workaround implementation.
        * lib/pthread-once.c (pthread_once): Use separate 16-bit words to store
        the parts of the state.

        pthread-once: Simplify Cygwin workaround implementation.
        * lib/pthread-once.c (pthread_once): Use _Atomic instead of __sync_*
        gcc primitives.

>From ae63a0c4b2586a839b786cbe8688e93eb669af72 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Fri, 31 May 2024 01:56:49 +0200
Subject: [PATCH 1/3] pthread-once: Simplify Cygwin workaround implementation.

* lib/pthread-once.c (pthread_once): Use _Atomic instead of __sync_*
gcc primitives.
---
 ChangeLog          |  6 ++++++
 lib/pthread-once.c | 43 +++++++++++--------------------------------
 2 files changed, 17 insertions(+), 32 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a30f610ac1..583735a33e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2024-05-31  Bruno Haible  <br...@clisp.org>
+
+	pthread-once: Simplify Cygwin workaround implementation.
+	* lib/pthread-once.c (pthread_once): Use _Atomic instead of __sync_*
+	gcc primitives.
+
 2024-05-30  Bruno Haible  <br...@clisp.org>
 
 	assert-h, verify: Fix compilation error with g++ (4.8.5) -std=gnu++11.
diff --git a/lib/pthread-once.c b/lib/pthread-once.c
index 069e77e3f3..0f009c0ec5 100644
--- a/lib/pthread-once.c
+++ b/lib/pthread-once.c
@@ -77,52 +77,31 @@ pthread_once (pthread_once_t *once_control, void (*initfunction) (void))
      In other words:
        state = { unsigned int num_threads : 31; unsigned int done : 1; }
    */
+  _Atomic unsigned int *state_p = (_Atomic unsigned int *) &once_control->state;
   /* Test the 'done' bit.  */
-  if ((* (unsigned int volatile *) &once_control->state & 1) == 0)
+  if ((*state_p & 1) == 0)
     {
       /* The 'done' bit is still zero.  Increment num_threads (atomically).  */
-      for (;;)
-        {
-          unsigned int prev = * (unsigned int volatile *) &once_control->state;
-          if (__sync_bool_compare_and_swap ((unsigned int *) &once_control->state,
-                                            prev, prev + 2))
-            break;
-        }
+      *state_p += 2;
       /* We have incremented num_threads.  Now take the lock.  */
       pthread_mutex_lock (&once_control->mutex);
       /* Test the 'done' bit again.  */
-      if ((* (unsigned int volatile *) &once_control->state & 1) == 0)
+      if ((*state_p & 1) == 0)
         {
           /* Execute the initfunction.  */
           (*initfunction) ();
           /* Set the 'done' bit to 1 (atomically).  */
-          for (;;)
-            {
-              unsigned int prev = * (unsigned int volatile *) &once_control->state;
-              if (__sync_bool_compare_and_swap ((unsigned int *) &once_control->state,
-                                                prev, prev | 1))
-                break;
-            }
+          *state_p |= 1;
         }
       /* Now the 'done' bit is 1.  Release the lock.  */
       pthread_mutex_unlock (&once_control->mutex);
       /* Decrement num_threads (atomically).  */
-      for (;;)
-        {
-          unsigned int prev = * (unsigned int volatile *) &once_control->state;
-          if (prev < 2)
-            abort ();
-          if (__sync_bool_compare_and_swap ((unsigned int *) &once_control->state,
-                                            prev, prev - 2))
-            {
-              if (prev - 2 == 1)
-                /* num_threads is now zero, and done is 1.
-                   No other thread will need to use the lock.
-                   We can therefore destroy the lock, to free resources.  */
-                pthread_mutex_destroy (&once_control->mutex);
-              break;
-            }
-        }
+      unsigned int new_state = (*state_p -= 2);
+      if (new_state == 1)
+        /* num_threads is now zero, and done is 1.
+           No other thread will need to use the lock.
+           We can therefore destroy the lock, to free resources.  */
+        pthread_mutex_destroy (&once_control->mutex);
     }
 #  endif
   return 0;
-- 
2.34.1

>From d37eb8d07a9bc3d33b3d1a4c0acc7a12e3dbf683 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Fri, 31 May 2024 11:19:28 +0200
Subject: [PATCH 2/3] pthread-once: Simplify Cygwin workaround implementation.

* lib/pthread-once.c (pthread_once): Use separate 16-bit words to store
the parts of the state.
---
 ChangeLog          |  4 ++++
 lib/pthread-once.c | 22 +++++++++++++---------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 583735a33e..e822cb74bd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2024-05-31  Bruno Haible  <br...@clisp.org>
 
+	pthread-once: Simplify Cygwin workaround implementation.
+	* lib/pthread-once.c (pthread_once): Use separate 16-bit words to store
+	the parts of the state.
+
 	pthread-once: Simplify Cygwin workaround implementation.
 	* lib/pthread-once.c (pthread_once): Use _Atomic instead of __sync_*
 	gcc primitives.
diff --git a/lib/pthread-once.c b/lib/pthread-once.c
index 0f009c0ec5..ef583daed4 100644
--- a/lib/pthread-once.c
+++ b/lib/pthread-once.c
@@ -73,31 +73,35 @@ pthread_once (pthread_once_t *once_control, void (*initfunction) (void))
        typedef struct { pthread_mutex_t mutex; int state; } pthread_once_t;
        #define PTHREAD_ONCE_INIT { PTHREAD_MUTEX_INITIALIZER, 0 }
      while assigning the following meaning to the state:
-       state = 2 * <number of waiting threads> + <1 if done>
+       state = (<number of waiting threads> << 16) + <1 if done>
      In other words:
-       state = { unsigned int num_threads : 31; unsigned int done : 1; }
+       state = { unsigned int num_threads : 16; unsigned int done : 16; }
    */
-  _Atomic unsigned int *state_p = (_Atomic unsigned int *) &once_control->state;
+  struct actual_state
+    {
+      _Atomic unsigned short num_threads;
+      _Atomic unsigned short done;
+    };
+  struct actual_state *state_p = (struct actual_state *) &once_control->state;
   /* Test the 'done' bit.  */
-  if ((*state_p & 1) == 0)
+  if (state_p->done == 0)
     {
       /* The 'done' bit is still zero.  Increment num_threads (atomically).  */
-      *state_p += 2;
+      state_p->num_threads += 1;
       /* We have incremented num_threads.  Now take the lock.  */
       pthread_mutex_lock (&once_control->mutex);
       /* Test the 'done' bit again.  */
-      if ((*state_p & 1) == 0)
+      if (state_p->done == 0)
         {
           /* Execute the initfunction.  */
           (*initfunction) ();
           /* Set the 'done' bit to 1 (atomically).  */
-          *state_p |= 1;
+          state_p->done = 1;
         }
       /* Now the 'done' bit is 1.  Release the lock.  */
       pthread_mutex_unlock (&once_control->mutex);
       /* Decrement num_threads (atomically).  */
-      unsigned int new_state = (*state_p -= 2);
-      if (new_state == 1)
+      if ((state_p->num_threads -= 1) == 0)
         /* num_threads is now zero, and done is 1.
            No other thread will need to use the lock.
            We can therefore destroy the lock, to free resources.  */
-- 
2.34.1

>From 659f0def364d9d08759fe30173fc4a113badc248 Mon Sep 17 00:00:00 2001
From: Bruno Haible <br...@clisp.org>
Date: Fri, 31 May 2024 15:25:30 +0200
Subject: [PATCH 3/3] pthread-once: Fix race in Cygwin workaround
 implementation.

* lib/pthread-once.c (pthread_once): Test the 'done' word after
incrementing num_threads. Make sure to invoke pthread_mutex_destroy
only once.
---
 ChangeLog          |  5 +++
 lib/pthread-once.c | 98 +++++++++++++++++++++++++++-------------------
 2 files changed, 63 insertions(+), 40 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e822cb74bd..2ebc8a792b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2024-05-31  Bruno Haible  <br...@clisp.org>
 
+	pthread-once: Fix race in Cygwin workaround implementation.
+	* lib/pthread-once.c (pthread_once): Test the 'done' word after
+	incrementing num_threads. Make sure to invoke pthread_mutex_destroy
+	only once.
+
 	pthread-once: Simplify Cygwin workaround implementation.
 	* lib/pthread-once.c (pthread_once): Use separate 16-bit words to store
 	the parts of the state.
diff --git a/lib/pthread-once.c b/lib/pthread-once.c
index ef583daed4..4b4a18d2af 100644
--- a/lib/pthread-once.c
+++ b/lib/pthread-once.c
@@ -45,30 +45,6 @@ pthread_once (pthread_once_t *once_control, void (*initfunction) (void))
 int
 pthread_once (pthread_once_t *once_control, void (*initfunction) (void))
 {
-#  if 0
-  /* This would be the code, for
-       typedef struct
-         {
-           pthread_mutex_t mutex;
-           _Atomic unsigned int num_threads;
-           _Atomic unsigned int done;
-         }
-       pthread_once_t;
-   */
-  if (once_control->done == 0)
-    {
-      once_control->num_threads += 1;
-      pthread_mutex_lock (&once_control->mutex);
-      if (once_control->done == 0)
-        {
-          (*initfunction) ();
-          once_control->done = 1;
-        }
-      pthread_mutex_unlock (&once_control->mutex);
-      if ((once_control->num_threads -= 1) == 0)
-        pthread_mutex_destroy (&once_control->mutex);
-    }
-#  else
   /* In this implementation, we reuse the type
        typedef struct { pthread_mutex_t mutex; int state; } pthread_once_t;
        #define PTHREAD_ONCE_INIT { PTHREAD_MUTEX_INITIALIZER, 0 }
@@ -80,34 +56,76 @@ pthread_once (pthread_once_t *once_control, void (*initfunction) (void))
   struct actual_state
     {
       _Atomic unsigned short num_threads;
+      /* done == 0: initial state
+         done == 1: initfunction executed, lock still active
+         done == 2: initfunction executed, lock no longer usable */
       _Atomic unsigned short done;
     };
   struct actual_state *state_p = (struct actual_state *) &once_control->state;
-  /* Test the 'done' bit.  */
+  /* This test is not necessary.  It's only an optimization, to establish
+     a fast path for the common case that the 'done' word is already > 0.  */
   if (state_p->done == 0)
     {
-      /* The 'done' bit is still zero.  Increment num_threads (atomically).  */
+      /* Increment num_threads (atomically), to indicate that this thread will
+         possibly take the lock.  */
       state_p->num_threads += 1;
-      /* We have incremented num_threads.  Now take the lock.  */
-      pthread_mutex_lock (&once_control->mutex);
-      /* Test the 'done' bit again.  */
+      /* Test the 'done' word.  */
       if (state_p->done == 0)
         {
-          /* Execute the initfunction.  */
-          (*initfunction) ();
-          /* Set the 'done' bit to 1 (atomically).  */
-          state_p->done = 1;
+          /* The 'done' word is still zero.  Now take the lock.  */
+          pthread_mutex_lock (&once_control->mutex);
+          /* Test the 'done' word again.  */
+          if (state_p->done == 0)
+            {
+              /* Execute the initfunction.  */
+              (*initfunction) ();
+              /* Set the 'done' word to 1 (atomically).  */
+              state_p->done = 1;
+            }
+          /* Now the 'done' word is 1.  Release the lock.  */
+          pthread_mutex_unlock (&once_control->mutex);
         }
-      /* Now the 'done' bit is 1.  Release the lock.  */
-      pthread_mutex_unlock (&once_control->mutex);
+      /* Here, done is > 0.  */
       /* Decrement num_threads (atomically).  */
       if ((state_p->num_threads -= 1) == 0)
-        /* num_threads is now zero, and done is 1.
-           No other thread will need to use the lock.
-           We can therefore destroy the lock, to free resources.  */
-        pthread_mutex_destroy (&once_control->mutex);
+        {
+          /* num_threads is now zero, and done is > 0.
+             No other thread will need to use the lock.
+             We can therefore destroy the lock, to free resources.  */
+          if (__sync_bool_compare_and_swap (&state_p->done, 1, 2))
+            pthread_mutex_destroy (&once_control->mutex);
+        }
     }
-#  endif
+  /* Proof of correctness:
+     * num_threads is incremented and then decremented by some threads.
+       Therefore, num_threads always stays >= 0, and is == 0 at the end.
+     * The 'done' word, once > 0, stays > 0 (since it is never assigned 0).
+     * The 'done' word is changed from == 0 to > 0 only while the lock
+       is taken. Therefore, only the first thread that succeeds in taking
+       the lock executes the initfunction and sets the 'done' word to a
+       value > 0; the other threads that take the lock do no side effects
+       between taking and releasing the lock.
+     * The 'done' word does not change any more once it is 2.
+       Therefore, it can be changed from 1 to 2 only once.
+     * pthread_mutex_destroy gets invoked right after 'done' has been changed
+       from 1 to 2.  Therefore, pthread_mutex_destroy gets invoked only once.
+     * After a moment where num_threads was 0 and done was > 0, no thread can
+       reach the pthread_mutex_lock invocation. Proof:
+       - At such a moment, no thread is in the code range between
+           state_p->num_threads += 1
+         and
+           state_p->num_threads -= 1
+       - After such a moment, some thread can increment num_threads, but from
+         there they cannot reach the pthread_mutex_lock invocation, because the
+           if (state_p->done == 0)
+         test prevents that.
+     * From this it follows that:
+       - pthread_mutex_destroy cannot be executed while the lock is taken
+         (because pthread_mutex_destroy is only executed after a moment where
+         num_threads was 0 and done was > 0).
+       - Once pthread_mutex_destroy has been executed, the lock is not used any
+         more.
+   */
   return 0;
 }
 
-- 
2.34.1

Reply via email to