Hi, Mathieu,
I tried to write a simple program to reproduce this issue, but failed.
The environment we used to produce this issue is a storage cluster.
It has 3 nodes, each with an NVME and several SATA disks, urcu is part of the
storage cluster software.
The storage software is pretty big, 3+ GB in size. I guess it's hard for you to
build a environment like ours.

Besides, I made some change into workqueue.c to debug this issue today, here's
the patch i used for debuging:

>From 18f3fea1436e0c999b346b998b7153150b92e62f Mon Sep 17 00:00:00 2001
From: wangminlan <wangmin...@szsandstone.com>
Date: Wed, 15 Jun 2022 14:28:53 +0800
Subject: [PATCH] <urcu> debug workqueue->futex

---
 .../userspace-rcu-0.12.1/src/workqueue.c           | 49 ++++++++++++++++++----
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git 
a/ceph/thirdpart/ascache/thirdparty/userspace-rcu-0.12.1/src/workqueue.c 
b/ceph/thirdpart/ascache/thirdparty/userspace-rcu-0.12.1/src/workqueue.c
index 59eb21d..eed1e21 100644
--- a/ceph/thirdpart/ascache/thirdparty/userspace-rcu-0.12.1/src/workqueue.c
+++ b/ceph/thirdpart/ascache/thirdparty/userspace-rcu-0.12.1/src/workqueue.c
@@ -132,12 +132,21 @@ static int set_thread_cpu_affinity(struct urcu_workqueue 
*workqueue)
 
 static void futex_wait(int32_t *futex)
 {
+       int ret;
+       int err;
        /* Read condition before read futex */
        cmm_smp_mb();
-       if (uatomic_read(futex) != -1)
+       if (uatomic_read(futex) != -1) {
+               fprintf(stderr, "%lu: wq %p: %s futex != -1, don't wait\n",
+                       pthread_self(), caa_container_of(futex, struct 
urcu_workqueue, futex), __func__);
+
                return;
-       while (futex_async(futex, FUTEX_WAIT, -1, NULL, NULL, 0)) {
-               switch (errno) {
+       }
+       while ((ret = futex_async(futex, FUTEX_WAIT, -1, NULL, NULL, 0))) {
+               err = errno;
+               fprintf(stderr, "%lu: wq %p: %s failed, errno %d\n",
+                       pthread_self(), caa_container_of(futex, struct 
urcu_workqueue, futex), __func__, err);
+               switch (err) {
                case EWOULDBLOCK:
                        /* Value already changed. */
                        return;
@@ -146,17 +155,35 @@ static void futex_wait(int32_t *futex)
                        break;  /* Get out of switch. */
                default:
                        /* Unexpected error. */
-                       urcu_die(errno);
+                       urcu_die(err);
                }
        }
+
+       fprintf(stderr, "%lu: wq %p: %s wait return %d\n",
+               pthread_self(), caa_container_of(futex, struct urcu_workqueue, 
futex), __func__, ret);
+
 }
 
 static void futex_wake_up(int32_t *futex)
 {
        /* Write to condition before reading/writing futex */
+       int32_t old;
+
        cmm_smp_mb();
-       if (caa_unlikely(uatomic_read(futex) == -1)) {
-               uatomic_set(futex, 0);
+       old = uatomic_read(futex);
+       if (caa_unlikely(old == -1)) {
+               old = uatomic_xchg(futex, 0);
+               if (old == -1) {
+                       fprintf(stderr, "%lu: wq %p, wakeup succeed: old %d\n",
+                               pthread_self(),
+                               caa_container_of(futex, struct urcu_workqueue, 
futex),
+                               old);
+               } else {
+                       fprintf(stderr, "%lu: wq %p, wakeup failed: old %d\n",
+                               pthread_self(),
+                               caa_container_of(futex, struct urcu_workqueue, 
futex),
+                               old);
+               }
                if (futex_async(futex, FUTEX_WAKE, 1,
                                NULL, NULL, 0) < 0)
                        urcu_die(errno);
@@ -237,8 +264,16 @@ static void *workqueue_thread(void *arg)
                if (!rt) {
                        if (cds_wfcq_empty(&workqueue->cbs_head,
                                        &workqueue->cbs_tail)) {
+                               int32_t new;
                                futex_wait(&workqueue->futex);
-                               uatomic_dec(&workqueue->futex);
+                               new = uatomic_add_return(&workqueue->futex, -1);
+                               if (new == -1) {
+                                       fprintf(stderr, "%lu: wq %p dec 
succeed: old %d, new %d\n",
+                                               pthread_self(), workqueue, new 
+ 1, new);
+                               } else {
+                                       fprintf(stderr, "%lu: wq %p dec failed: 
old %d\n",
+                                               pthread_self(), workqueue, new 
+ 1);
+                               }
                                /*
                                 * Decrement futex before reading
                                 * urcu_work list.
-- 
1.8.3.1

And the instresting log began from here:
---
...
   2097 140196498532096: wq 0x55f69a99ea00, wakeup succeed: old -1      /* 
enqueue op set futex: -1 -> 0 */
   2098 140196272453376: wq 0x55f69a99ea00: futex_wait wait return 0    /* 
workqueue_thread wait success */
   2099 140196272453376: wq 0x55f69a99ea00 dec failed: old -1           /* 
workqueue_thread got futex == -1 ? */
   2100 140196272453376: wq 0x55f69a99ea00: futex_wait futex != -1, don't wait
   2101 140196272453376: wq 0x55f69a99ea00 dec failed: old -2
...
---
In line 2099, how did workqueue_thread got futex as -1? This is wierd.

I put the patch i used to debug, and the complete log of the reproduce into the
attachment, may this helps.

Thanks,
Minlan

Attachment: debug.tar.gz
Description: application/gzip

_______________________________________________
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev

Reply via email to