Hi all! I have a coredump of our qemu branch, based on rhev-2.12.0-44.el7_8.2, which in turn is based on v2.12.0. And I don't have any reproducer.
The backtrace:

#0  aio_co_schedule (ctx=0x0, co=0x55dd539fa340) at util/async.c:455
#1  0x000055dd51149716 in aio_co_enter (ctx=<optimized out>, co=<optimized out>) at util/async.c:476
#2  0x000055dd511497bc in aio_co_wake (co=<optimized out>) at util/async.c:470
#3  0x000055dd5115ea43 in qemu_co_mutex_wake (mutex=0x55dd539c36b0, co=<optimized out>) at util/qemu-coroutine-lock.c:197
#4  qemu_co_mutex_unlock (mutex=mutex@entry=0x55dd539c36b0) at util/qemu-coroutine-lock.c:300
#5  0x000055dd5109f4e0 in qcow2_co_pwritev_cluster_compressed (qiov=0x7fcbbc972a70, bytes=65536, offset=17582325760, bs=0x55dd539fe000) at block/qcow2.c:4360
#6  qcow2_co_pwritev_compressed (bs=0x55dd539fe000, offset=17582325760, bytes=65536, qiov=0x7fcbbc972de0) at block/qcow2.c:4425
#7  0x000055dd510d5cd2 in bdrv_driver_pwritev_compressed (qiov=0x7fcbbc972de0, bytes=65536, offset=17582325760, bs=0x55dd539fe000) at block/io.c:1227
#8  bdrv_aligned_pwritev (req=req@entry=0x7fcbbc972c60, offset=offset@entry=17582325760, bytes=bytes@entry=65536, align=align@entry=1, qiov=qiov@entry=0x7fcbbc972de0, flags=flags@entry=32, child=0x55dd539cea80, child=0x55dd539cea80) at block/io.c:1850
#9  0x000055dd510d6369 in bdrv_co_pwritev (child=0x55dd539cea80, offset=offset@entry=17582325760, bytes=bytes@entry=65536, qiov=qiov@entry=0x7fcbbc972de0, flags=BDRV_REQ_WRITE_COMPRESSED) at block/io.c:2144
#10 0x000055dd510c3644 in blk_co_pwritev (blk=0x55dd539fc300, offset=17582325760, bytes=65536, qiov=0x7fcbbc972de0, flags=<optimized out>) at block/block-backend.c:1237
#11 0x000055dd510c372c in blk_write_entry (opaque=0x7fcbbc972e00) at block/block-backend.c:1264
#12 0x000055dd510c1e18 in blk_prw (blk=0x55dd539fc300, offset=17582325760, buf=buf@entry=0x55dd54a38000 "", bytes=bytes@entry=65536, co_entry=co_entry@entry=0x55dd510c3710 <blk_write_entry>, flags=BDRV_REQ_WRITE_COMPRESSED) at block/block-backend.c:1292
#13 0x000055dd510c2f45 in blk_pwrite (blk=<optimized out>, offset=<optimized out>, buf=buf@entry=0x55dd54a38000, count=count@entry=65536, flags=<optimized out>) at block/block-backend.c:1486
#14 0x000055dd510ef949 in nbd_handle_request (errp=0x7fcbbc972ef8, data=0x55dd54a38000 "", request=<synthetic pointer>, client=0x55dd539ee420) at nbd/server.c:2264
#15 nbd_trip (opaque=0x55dd539ee420) at nbd/server.c:2393
#16 0x000055dd5115f72a in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:116
#17 0x00007fcbc5422190 in ?? () from /work/crash-bugs/PSBM-123528/ccpp-2020-12-08-00_59_06-418945/root/lib64/libc.so.6
#18 0x00007fcbbca736d0 in ?? ()
#19 0x0000000000000000 in ?? ()
Backtrace stopped: Cannot access memory at address 0x7fcbbc973000

(gdb) p *co
$1 = {entry = 0x0, entry_arg = 0x0, caller = 0x0, pool_next = {sle_next = 0x0},
  locks_held = 0, ctx = 0x0,
  scheduled = 0x55dd51195660 <__func__.23793> "aio_co_schedule",
  co_queue_next = {sqe_next = 0x0},
  co_queue_wakeup = {sqh_first = 0x0, sqh_last = 0x55dd539f5680},
  co_scheduled_next = {sle_next = 0x0}}

So, it looks like we want to wake up a coroutine on co-mutex unlock, but the coroutine has already exited.
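How I read this (my interpretation of the frames and of $1 above, not verbatim source):

    qemu_co_mutex_unlock(mutex)
      -> qemu_co_mutex_wake(mutex, co)      /* co taken from the mutex's waiter list */
         -> aio_co_wake(co)                 /* reads co->ctx */
            -> aio_co_enter(ctx, co)
               -> aio_co_schedule(ctx, co)  /* frame #0: ctx == 0x0 */

$1 shows entry, entry_arg, caller and ctx all zeroed, while co->scheduled still points at "aio_co_schedule", i.e. the Coroutine object looks like one that has already terminated while the mutex still held a waiter record referencing it.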
I had no idea how to debug it, so I decided to add some assertions to make sure that a coroutine waiting on a mutex is entered only through qemu_co_mutex_wake(); see the patch below. Still, when I run "make check" with this patch applied, I hit a crash in ./tests/test-aio-multithread; my new assertion fails:

test-aio-multithread: ../util/qemu-coroutine-lock.c:197: qemu_co_mutex_wake: Assertion `mutex == co->wait_on_mutex' failed.

Thread 18 "test-aio-multit" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fffe5ffb700 (LWP 24549)]
0x00007ffff7063625 in raise () from /lib64/libc.so.6
(gdb) bt
#0  0x00007ffff7063625 in raise () from /lib64/libc.so.6
#1  0x00007ffff704c8d9 in abort () from /lib64/libc.so.6
#2  0x00007ffff704c7a9 in __assert_fail_base.cold () from /lib64/libc.so.6
#3  0x00007ffff705ba66 in __assert_fail () from /lib64/libc.so.6
#4  0x000055555568c153 in qemu_co_mutex_wake (mutex=0x555555771360 <comutex>, co=0x555555803ec0) at ../util/qemu-coroutine-lock.c:197
#5  0x000055555568c5a0 in qemu_co_mutex_unlock (mutex=0x555555771360 <comutex>) at ../util/qemu-coroutine-lock.c:307
#6  0x000055555557acfd in test_multi_co_mutex_entry (opaque=0x0) at ../tests/test-aio-multithread.c:208
#7  0x00005555556bb5d7 in coroutine_trampoline (i0=1434467712, i1=21845) at ../util/coroutine-ucontext.c:173
#8  0x00007ffff7078d30 in ?? () from /lib64/libc.so.6
#9  0x00007fffffffd850 in ?? ()
#10 0x0000000000000000 in ?? ()
(gdb) fr 4
#4  0x000055555568c153 in qemu_co_mutex_wake (mutex=0x555555771360 <comutex>, co=0x555555803ec0) at ../util/qemu-coroutine-lock.c:197
197         assert(mutex == co->wait_on_mutex);
(gdb) p mutex
$1 = (CoMutex *) 0x555555771360 <comutex>
(gdb) p co->wait_on_mutex
$2 = (CoMutex *) 0x555555771360 <comutex>
(gdb) p mutex == co->wait_on_mutex
$3 = 1

So the assertion failed, but in gdb the condition is true. How can that be? Does it mean that the assertion fired before "self->wait_on_mutex = mutex;" was executed? But then we are waking up the coroutine before it has even reached qemu_coroutine_yield(); is that correct? Or do I need some kind of barrier/atomic access to make co->wait_on_mutex work?

Hmm, if I move "self->wait_on_mutex = mutex;" before push_waiter():

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index 8af3e70343..a38d5e4bf7 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -208,6 +208,7 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
     trace_qemu_co_mutex_lock_entry(mutex, self);
     w.co = self;
+    self->wait_on_mutex = mutex;
     push_waiter(mutex, &w);
 
     /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
      * a concurrent unlock() the responsibility of waking somebody up.
@@ -227,13 +228,13 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
             assert(to_wake == &w);
             mutex->ctx = ctx;
             mutex->holder = self;
+            self->wait_on_mutex = NULL;
             return;
         }
 
         qemu_co_mutex_wake(mutex, co);
     }
 
-    self->wait_on_mutex = mutex;
     qemu_coroutine_yield();
     assert(mutex->holder == self); /* set by qemu_co_mutex_wake() */
     trace_qemu_co_mutex_lock_return(mutex, self);

then it doesn't crash. But still, it seems bad that a parallel thread's qemu_co_mutex_wake() can happen during qemu_co_mutex_lock_slowpath(), before qemu_coroutine_yield().
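To convince myself that this really is a plain write racing with the assertion, I modeled the interleaving outside QEMU. Here is a standalone sketch (a toy model, not QEMU code; "waiter", "published" and both thread bodies are invented names) of the slowpath publishing its wait record before assigning wait_on_mutex, with a second thread playing the part of qemu_co_mutex_unlock():

/* Build with: gcc -pthread race-demo.c
 * The assert models the one at qemu-coroutine-lock.c:197; the plain
 * store models "self->wait_on_mutex = mutex;" placed after push_waiter().
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct waiter {
    void *wait_on_mutex;                /* plain field, like Coroutine's */
};

static void *fake_mutex = (void *)0x1;
static struct waiter w;
static _Atomic(struct waiter *) published;

static void *locker(void *arg)
{
    atomic_store(&published, &w);       /* models push_waiter(): w is now
                                         * visible to the unlocker... */
    w.wait_on_mutex = fake_mutex;       /* ...but the field is set later */
    return NULL;
}

static void *unlocker(void *arg)
{
    struct waiter *to_wake;
    while (!(to_wake = atomic_load(&published))) {
        /* spin until the waiter is published */
    }
    /* This can observe NULL and abort; the locker thread then completes
     * its store before the core is written, so post-mortem gdb sees the
     * condition as true. */
    assert(to_wake->wait_on_mutex == fake_mutex);
    return NULL;
}

int main(void)
{
    for (int i = 0; i < 100000; i++) {
        pthread_t a, b;
        w.wait_on_mutex = NULL;
        atomic_store(&published, NULL);
        pthread_create(&b, NULL, unlocker, NULL);
        pthread_create(&a, NULL, locker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
    }
    printf("no assertion hit this run; the race is timing-dependent\n");
    return 0;
}

That would also explain why moving the store before push_waiter() helps: if I read push_waiter()/pop_waiter() right, they publish and fetch the record with atomic xchg, so once the store precedes the publication, the waker is guaranteed to observe it.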
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com>
---
 include/qemu/coroutine_int.h |  1 +
 util/qemu-coroutine-lock.c   | 10 +++++++++-
 util/qemu-coroutine.c        |  6 ++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index 1da148552f..1fd21b4437 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -49,6 +49,7 @@ struct Coroutine {
     /* Only used when the coroutine has terminated. */
     QSLIST_ENTRY(Coroutine) pool_next;
 
+    CoMutex *wait_on_mutex;
     size_t locks_held;
 
     /* Only used when the coroutine has yielded. */
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index 5816bf8900..8af3e70343 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -193,6 +193,9 @@ static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
      */
     smp_read_barrier_depends();
     mutex->ctx = co->ctx;
+    mutex->holder = co;
+    assert(mutex == co->wait_on_mutex);
+    co->wait_on_mutex = NULL; /* asserted in qemu_coroutine_enter */
     aio_co_wake(co);
 }
 
@@ -223,13 +226,16 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
             /* We got the lock ourselves! */
             assert(to_wake == &w);
             mutex->ctx = ctx;
+            mutex->holder = self;
             return;
         }
 
         qemu_co_mutex_wake(mutex, co);
     }
 
+    self->wait_on_mutex = mutex;
     qemu_coroutine_yield();
+    assert(mutex->holder == self); /* set by qemu_co_mutex_wake() */
     trace_qemu_co_mutex_lock_return(mutex, self);
 }
@@ -266,10 +272,12 @@ retry_fast_path:
         /* Uncontended. */
         trace_qemu_co_mutex_lock_uncontended(mutex, self);
         mutex->ctx = ctx;
+        mutex->holder = self;
     } else {
         qemu_co_mutex_lock_slowpath(ctx, mutex);
+        assert(mutex->ctx == ctx);
+        assert(mutex->holder == self);
     }
-    mutex->holder = self;
     self->locks_held++;
 }
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index 38fb6d3084..6c75a8fe41 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -136,6 +136,12 @@ void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
             abort();
         }
 
+        if (to->wait_on_mutex) {
+            fprintf(stderr, "Co-routine waiting on mutex is entered not by "
+                    "qemu_co_mutex_wake()\n");
+            abort();
+        }
+
         to->caller = from;
         to->ctx = ctx;
-- 
2.25.4