I've produced a small (140-line) test case. Note that it fails only 25%-50% of the time, whereas my application failed 90%+ of the time, probably because the test case runs much more quickly.
-- Adam Olsen, aka Rhamphoryncus
#define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <pthread.h> #include <errno.h> #include <atomic_ops.h> #define THREAD_COUNT 10 #define REPEAT 100000 #define RATIO 50 typedef struct { int num; pthread_mutex_t lock; pthread_t id; } Thread; static Thread threads[THREAD_COUNT]; static pthread_cond_t wakeup = PTHREAD_COND_INITIALIZER; static pthread_mutex_t world_lock = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP; static AO_t world_sleep; #define CHECK(expr) do { \ int __status; \ if ((__status = expr)) { \ fprintf(stderr, "%s:%d syscall failed with %d, %d\n", __FILE__, __LINE__, __status, errno); \ abort(); \ } \ } while (0) static void logmsg(Thread *thread, const char *msg) { printf("%d(0x%llx) %s\n", thread->num, (unsigned long long)thread->id, msg); } static void do_stoptheworld(Thread *self) { int i; CHECK(pthread_mutex_unlock(&self->lock)); CHECK(pthread_mutex_lock(&world_lock)); CHECK(pthread_mutex_lock(&self->lock)); AO_store_full(&world_sleep, 1); for (i = 0; i < THREAD_COUNT; i++) { Thread *other = &threads[i]; if (other == self) continue; CHECK(pthread_mutex_lock(&other->lock)); } CHECK(pthread_cond_broadcast(&wakeup)); for (i = 0; i < THREAD_COUNT; i++) { Thread *other = &threads[i]; if (other == self) continue; CHECK(pthread_mutex_unlock(&other->lock)); } AO_store_full(&world_sleep, 0); CHECK(pthread_mutex_unlock(&world_lock)); } static void do_tick(Thread *thread) { if (AO_load_acquire(&world_sleep)) { logmsg(thread, "Sleeping"); /* pthread_cond_wait could return EINTR, but for this test we * treat that as fatal */ CHECK(pthread_cond_wait(&wakeup, &thread->lock)); logmsg(thread, "Woken up"); } } static void * worker(void *arg) { Thread *thread = (Thread *)arg; int i, j; logmsg(thread, "Started"); CHECK(pthread_mutex_lock(&thread->lock)); for (i = 0; i < REPEAT; i++) { do_stoptheworld(thread); for (j = 0; j < RATIO; j++) do_tick(thread); if ((i % (REPEAT / 5)) == 0) printf("%d() ticked %d\n", thread->num, i); } 
CHECK(pthread_mutex_unlock(&thread->lock)); logmsg(thread, "Finished"); return NULL; } int main(int argc, char **argv) { int i; pthread_mutexattr_t attr; printf("%d %d\n", EINVAL, EPERM); CHECK(pthread_mutexattr_init(&attr)); CHECK(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP)); for (i = 0; i < THREAD_COUNT; i++) { Thread *thread = &threads[i]; thread->num = i; CHECK(pthread_mutex_init(&thread->lock, &attr)); } for (i = 0; i < THREAD_COUNT; i++) { Thread *thread = &threads[i]; CHECK(pthread_create(&thread->id, NULL, worker, (void *)thread)); } for (i = 0; i < THREAD_COUNT; i++) { Thread *thread = &threads[i]; CHECK(pthread_join(thread->id, NULL)); } return 0; }