I'm back to trying to figure this out. I can't use live migration with storage copy (virsh migrate --copy-storage-all) until this bug is fixed, so I'm pretty motivated. Today I configured libvirt/qemu to dump core, and I compiled qemu with debugging symbols. Here is the backtrace. I'm not sure it tells us anything we don't already know.
I may try to hack in some more debugging later today, but my C is terrible. Any other ideas on ways I can help? (gdb) bt full #0 0x00007f1a6a3313f8 in raise () at /lib64/libc.so.6 #1 0x00007f1a6a332ffa in abort () at /lib64/libc.so.6 #2 0x00007f1a6a329c17 in __assert_fail_base () at /lib64/libc.so.6 #3 0x00007f1a6a329cc2 in () at /lib64/libc.so.6 #4 0x000055a6cba705a6 in nbd_reply_chunk_iter_receive (s=s@entry=0x55a6ce458200, iter=iter@entry=0x7f1945fe8890, handle=handle@entry=94174913593865, qiov=qiov@entry=0x0, reply=0x7f1945fe8800, reply@entry=0x0, payload=payload@entry=0x0) at block/nbd-client.c:606 local_reply = {simple = {magic = 1732535960, error = 0, handle = 94174913593865}, structured = {magic = 1732535960, flags = 0, type = 0, handle = 94174913593865, length = 0}, {magic = 1732535960, _skip = 0, handle = 94174913593865}} chunk = 0x7f1945fe8800 local_err = 0x0 __func__ = "nbd_reply_chunk_iter_receive" __PRETTY_FUNCTION__ = "nbd_reply_chunk_iter_receive" #5 0x000055a6cba706d6 in nbd_co_request (errp=0x7f1945fe8888, handle=94174913593865, s=0x55a6ce458200) at block/nbd-client.c:634 iter = {ret = 0, fatal = false, err = 0x0, done = false, only_structured = true} ret = <optimized out> local_err = 0x0 client = 0x55a6ce458200 __PRETTY_FUNCTION__ = "nbd_co_request" #6 0x000055a6cba706d6 in nbd_co_request (bs=bs@entry=0x55a6ce450130, request=request@entry=0x7f1945fe88e0, write_qiov=write_qiov@entry=0x0) at block/nbd-client.c:772 ret = <optimized out> local_err = 0x0 client = 0x55a6ce458200 __PRETTY_FUNCTION__ = "nbd_co_request" #7 0x000055a6cba70cb5 in nbd_client_co_pwrite_zeroes (bs=0x55a6ce450130, offset=2483027968, bytes=16777216, flags=<optimized out>) at block/nbd-client.c:860 client = <optimized out> request = {handle = 94174913593865, from = 2483027968, len = 16777216, flags = 0, type = 6} __PRETTY_FUNCTION__ = "nbd_client_co_pwrite_zeroes" #8 0x000055a6cba67f44 in bdrv_co_do_pwrite_zeroes (bs=bs@entry=0x55a6ce450130, offset=offset@entry=2483027968, 
bytes=bytes@entry=16777216, flags=flags@entry=6) at block/io.c:1410 num = 16777216 drv = 0x55a6cc3b0600 <bdrv_nbd_unix> qiov = {iov = 0x100000, niov = -834338512, nalloc = 21926, size = 1831862272} iov = {iov_base = 0x0, iov_len = 0} ret = -95 need_flush = false head = 0 tail = 0 max_write_zeroes = 33554432 alignment = 512 max_transfer = 16777216 __PRETTY_FUNCTION__ = "bdrv_co_do_pwrite_zeroes" #9 0x000055a6cba68373 in bdrv_aligned_pwritev (req=req@entry=0x7f1945fe8b50, offset=offset@entry=2483027968, bytes=bytes@entry=16777216, align=align@entry=512, qiov=0x0, flags=6, child=0x55a6ce333f50, child=0x55a6ce333f50) at block/io.c:1522 bs = 0x55a6ce450130 drv = 0x55a6cc3b0600 <bdrv_nbd_unix> waited = <optimized out> ret = <optimized out> end_sector = 4882432 bytes_remaining = 16777216 max_transfer = 33554432 #10 0x000055a6cba69a42 in bdrv_co_pwritev (req=0x7f1945fe8b50, flags=6, bytes=16777216, offset=2483027968, child=0x55a6ce333f50) at block/io.c:1625 aligned_bytes = 16777216 bs = 0x55a6ce450130 buf = <optimized out> tail_padding_bytes = 0 ---Type <return> to continue, or q <return> to quit--- local_qiov = {iov = 0x0, niov = 0, nalloc = 0, size = 1825570816} align = 512 head_padding_bytes = <optimized out> ret = 0 iov = {iov_base = 0x7f1945fe8bc0, iov_len = 1} bs = 0x55a6ce450130 req = {bs = 0x55a6ce450130, offset = 2483027968, bytes = 16777216, type = BDRV_TRACKED_WRITE, serialising = false, overlap_offset = 2483027968, overlap_bytes = 16777216, list = {le_next = 0x0, le_prev = 0x7f19452dbb80}, co = 0x7f1a5c003030, wait_queue = {entries = {sqh_first = 0x0, sqh_last = 0x7f1945fe8b98}}, waiting_for = 0x0} align = <optimized out> head_buf = 0x0 tail_buf = 0x0 local_qiov = {iov = 0x7f1945fe8bc0, niov = 1, nalloc = 0, size = 94171452932608} use_local_qiov = false ret = <optimized out> __PRETTY_FUNCTION__ = "bdrv_co_pwritev" #11 0x000055a6cba69a42 in bdrv_co_pwritev (child=child@entry=0x55a6ce333f50, offset=offset@entry=2483027968, bytes=bytes@entry=16777216, 
qiov=qiov@entry=0x0, flags=flags@entry=6) at block/io.c:1698 bs = 0x55a6ce450130 req = {bs = 0x55a6ce450130, offset = 2483027968, bytes = 16777216, type = BDRV_TRACKED_WRITE, serialising = false, overlap_offset = 2483027968, overlap_bytes = 16777216, list = {le_next = 0x0, le_prev = 0x7f19452dbb80}, co = 0x7f1a5c003030, wait_queue = {entries = {sqh_first = 0x0, sqh_last = 0x7f1945fe8b98}}, waiting_for = 0x0} align = <optimized out> head_buf = 0x0 tail_buf = 0x0 local_qiov = {iov = 0x7f1945fe8bc0, niov = 1, nalloc = 0, size = 94171452932608} use_local_qiov = false ret = <optimized out> __PRETTY_FUNCTION__ = "bdrv_co_pwritev" #12 0x000055a6cba6a3be in bdrv_co_pwrite_zeroes (child=0x55a6ce333f50, offset=2483027968, bytes=16777216, flags=<optimized out>) at block/io.c:1822 #13 0x000055a6cba67f44 in bdrv_co_do_pwrite_zeroes (bs=bs@entry=0x55a6ce35dd80, offset=offset@entry=2483027968, bytes=bytes@entry=16777216, flags=flags@entry=6) at block/io.c:1410 num = 16777216 drv = 0x55a6cc3aa5a0 <bdrv_raw> qiov = {iov = 0x6d400000, niov = -878284120, nalloc = 21926, size = 1} iov = {iov_base = 0x0, iov_len = 0} ret = -95 need_flush = false head = 0 tail = 0 max_write_zeroes = 2147483647 alignment = 1 max_transfer = 16777216 __PRETTY_FUNCTION__ = "bdrv_co_do_pwrite_zeroes" #14 0x000055a6cba68373 in bdrv_aligned_pwritev (req=req@entry=0x7f1945fe8e90, offset=offset@entry=2483027968, bytes=bytes@entry=16777216, align=align@entry=1, qiov=0x0, flags=6, child=0x55a6ce457960, child=0x55a6ce457960) at block/io.c:1522 bs = 0x55a6ce35dd80 drv = 0x55a6cc3aa5a0 <bdrv_raw> waited = <optimized out> ret = <optimized out> end_sector = 4882432 bytes_remaining = 16777216 max_transfer = 33554432 #15 0x000055a6cba69a42 in bdrv_co_pwritev (req=0x7f1945fe8e90, flags=6, bytes=16777216, offset=2483027968, child=0x55a6ce457960) at block/io.c:1625 aligned_bytes = 16777216 bs = 0x55a6ce35dd80 buf = <optimized out> ---Type <return> to continue, or q <return> to quit--- tail_padding_bytes = 0 local_qiov = 
{iov = 0x55a6ce3f8b40, niov = 1543529136, nalloc = 32538, size = 139746525220544} align = 1 head_padding_bytes = <optimized out> ret = 0 iov = {iov_base = 0x7f1a5c003030, iov_len = 94174870259312} bs = 0x55a6ce35dd80 req = {bs = 0x55a6ce35dd80, offset = 2483027968, bytes = 16777216, type = BDRV_TRACKED_WRITE, serialising = false, overlap_offset = 2483027968, overlap_bytes = 16777216, list = {le_next = 0x0, le_prev = 0x7f19452dbec0}, co = 0x7f1a5c003030, wait_queue = {entries = {sqh_first = 0x0, sqh_last = 0x7f1945fe8ed8}}, waiting_for = 0x0} align = <optimized out> head_buf = 0x0 tail_buf = 0x0 local_qiov = {iov = 0x7f1a5c003030, niov = -877640080, nalloc = 21926, size = 139751189406384} use_local_qiov = false ret = <optimized out> __PRETTY_FUNCTION__ = "bdrv_co_pwritev" #16 0x000055a6cba69a42 in bdrv_co_pwritev (child=0x55a6ce457960, offset=offset@entry=2483027968, bytes=bytes@entry=16777216, qiov=qiov@entry=0x0, flags=6) at block/io.c:1698 bs = 0x55a6ce35dd80 req = {bs = 0x55a6ce35dd80, offset = 2483027968, bytes = 16777216, type = BDRV_TRACKED_WRITE, serialising = false, overlap_offset = 2483027968, overlap_bytes = 16777216, list = {le_next = 0x0, le_prev = 0x7f19452dbec0}, co = 0x7f1a5c003030, wait_queue = {entries = {sqh_first = 0x0, sqh_last = 0x7f1945fe8ed8}}, waiting_for = 0x0} align = <optimized out> head_buf = 0x0 tail_buf = 0x0 local_qiov = {iov = 0x7f1a5c003030, niov = -877640080, nalloc = 21926, size = 139751189406384} use_local_qiov = false ret = <optimized out> __PRETTY_FUNCTION__ = "bdrv_co_pwritev" #17 0x000055a6cba5898b in blk_co_pwritev (blk=0x55a6ce4576b0, offset=2483027968, bytes=16777216, qiov=0x0, flags=<optimized out>) at block/block-backend.c:1188 ret = <optimized out> bs = 0x55a6ce35dd80 #18 0x000055a6cba58a9b in blk_aio_write_entry (opaque=0x7f1a5c0084b0) at block/block-backend.c:1394 acb = 0x7f1a5c0084b0 rwco = 0x7f1a5c0084d8 qiov = <optimized out> #19 0x000055a6cbb046ba in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at 
util/coroutine-ucontext.c:116 co = 0x7f1a5c003030 #20 0x00007f1a6a346560 in __start_context () at /lib64/libc.so.6 #21 0x00007f1a617a3fb0 in () #22 0x0000000000000000 in () -- You received this bug notification because you are a member of qemu-devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1793791 Title: Crash with nbd_reply_chunk_iter_receive: Assertion `chunk->flags & NBD_REPLY_FLAG_DONE' failed Status in QEMU: New Bug description: Qemu version on both sides: 2.12.1 Host A Linux: 4.9.76 Host B Linux: 4.14.67 While calling from Host A: virsh migrate virtualmachine qemu+ssh://hostB/system --live --undefinesource --persistent --verbose --copy-storage-all I get a qemu crash with: 2018-09-21 16:12:23.073+0000: 14428: info : virObjectUnref:350 : OBJECT_UNREF: obj=0x7f922c03d990 qemu-system-x86_64: block/nbd-client.c:606: nbd_reply_chunk_iter_receive: Assertion `chunk->flags & NBD_REPLY_FLAG_DONE' failed. 2018-09-21 16:12:41.230+0000: shutting down, reason=crashed 2018-09-21 16:12:52.900+0000: shutting down, reason=failed The crash doesn't happen every time, but it does most of the time. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1793791/+subscriptions