[Qemu-devel] [PATCH] monitor: increase amount of data for monitor to read
Right now QMP and HMP monitors read 1 byte at a time from the socket, which is very inefficient. With 100+ VMs on the host this easily results in a lot of unnecessary system calls and CPU usage in the system. This patch changes the amount of data to read to 4096 bytes, which matches the buffer size at the channel level. Fortunately, the monitor protocol is synchronous right now, so we should not face side effects in practice. Signed-off-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- include/monitor/monitor.h | 2 +- monitor.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index c1b40a9cac..afa1ed34a4 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -14,7 +14,7 @@ extern __thread Monitor *cur_mon; #define MONITOR_USE_CONTROL 0x04 #define MONITOR_USE_PRETTY 0x08 -#define QMP_REQ_QUEUE_LEN_MAX 8 +#define QMP_REQ_QUEUE_LEN_MAX 4096 bool monitor_cur_is_qmp(void); diff --git a/monitor.c b/monitor.c index 4807bbe811..a08e020b61 100644 --- a/monitor.c +++ b/monitor.c @@ -4097,7 +4097,7 @@ static int monitor_can_read(void *opaque) { Monitor *mon = opaque; -return !atomic_mb_read(&mon->suspend_cnt); +return !atomic_mb_read(&mon->suspend_cnt) ? 4096 : 0; } /* -- 2.17.0
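For context, the value returned by the monitor's can_read callback is the number of bytes the character backend may hand over in the next read, so returning 1 forced a separate read() per byte. A minimal sketch of the patched callback, assuming the 4096 constant were given a name (MON_READ_CHUNK is made up here, not part of the patch):

#define MON_READ_CHUNK 4096   /* matches the channel-level buffer size */

static int monitor_can_read(void *opaque)
{
    Monitor *mon = opaque;

    /* 0 keeps the monitor suspended; otherwise report how many bytes
     * the frontend is willing to accept in one go. */
    return !atomic_mb_read(&mon->suspend_cnt) ? MON_READ_CHUNK : 0;
}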
Re: [Qemu-devel] [PATCH v5] qemu-io: add pattern file for write command
On 31.05.2019 18:13, Eric Blake wrote: > On 5/31/19 2:46 AM, Denis Plotnikov wrote: >> The patch allows to provide a pattern file for write >> command. There was no similar ability before. >> >> Signed-off-by: Denis Plotnikov >> --- >> v5: >>* file name initiated with null to make compilers happy >> > >> +static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len, >> + char *file_name) >> +{ >> +char *buf, *buf_pos; >> +FILE *f = fopen(file_name, "r"); >> +int l; >> + >> +if (!f) { >> +printf("'%s': %s\n", file_name, strerror(errno)); >> +return NULL; >> +} >> + >> +if (qemuio_misalign) { >> +len += MISALIGN_OFFSET; >> +} >> +buf = blk_blockalign(blk, len); >> +memset(buf, 0, len); >> + >> +buf_pos = buf; >> + >> +while (len > 0) { >> +l = fread(buf_pos, sizeof(char), len, f); >> + >> +if (feof(f)) { >> +rewind(f); >> +} > > Why are we reading the file more than once? Once we've read it once, > it's more efficient to switch to a loop that memcpy()s the prefix into > the rest of the buffer, rather than to perform repeated I/O. > Yes, it is. Will change it. Denis
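For reference, the change Eric suggests amounts to reading the file once and then tiling that prefix across the rest of the buffer with memcpy(); one possible shape of such a loop (a sketch only, doubling the filled region on each pass; buf, len and l as in the quoted code):

/* buf[0..l) already holds the file contents; replicate it up to len bytes */
size_t filled = l;
while (filled < len) {
    size_t chunk = (len - filled < filled) ? len - filled : filled;
    memcpy(buf + filled, buf, chunk);   /* copy the already-built prefix */
    filled += chunk;
}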
[Qemu-devel] [PATCH v6] qemu-io: add pattern file for write command
The patch allows to provide a pattern file for write command. There was no similar ability before. Signed-off-by: Denis Plotnikov --- v6: * the pattern file is read once to reduce io v5: * file name initiated with null to make compilers happy v4: * missing signed-off clause added v3: * missing file closing added * exclusive flags processing changed * buffer void* converted to char* to fix pointer arithmetics * file reading error processing added --- qemu-io-cmds.c | 88 ++ 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 09750a23ce..e27203f747 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -343,6 +343,69 @@ static void *qemu_io_alloc(BlockBackend *blk, size_t len, int pattern) return buf; } +static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len, + char *file_name) +{ +char *buf, *buf_origin; +FILE *f = fopen(file_name, "r"); +int l; + +if (!f) { +printf("'%s': %s\n", file_name, strerror(errno)); +return NULL; +} + +if (qemuio_misalign) { +len += MISALIGN_OFFSET; +} +buf_origin = blk_blockalign(blk, len); +memset(buf_origin, 0, len); + +buf = buf_origin; + +l = fread(buf, sizeof(char), len, f); + +if (ferror(f)) { +printf("'%s': %s\n", file_name, strerror(errno)); +goto error; +} + +if (l == 0) { +printf("'%s' is empty\n", file_name); +goto error; +} + +if (l < len) { +char *file_buf = g_malloc(sizeof(char) * l); +memcpy(file_buf, buf, l); +len -= l; +buf += l; + +while (len > 0) { +size_t len_to_copy = len > l ? l : len; + +memcpy(buf, file_buf, len_to_copy); + +len -= len_to_copy; +buf += len_to_copy; +} +qemu_vfree(file_buf); +} + +if (qemuio_misalign) { +buf_origin += MISALIGN_OFFSET; +} + +goto out; + +error: +qemu_vfree(buf); +buf_origin = NULL; +out: +fclose(f); +return buf_origin; +} + static void qemu_io_free(void *p) { if (qemuio_misalign) { @@ -965,7 +1028,7 @@ static const cmdinfo_t write_cmd = { .perm = BLK_PERM_WRITE, .argmin = 2, .argmax = -1, -.args = "[-bcCfnquz] [-P pattern] off len", +.args = "[-bcCfnquz] [-P pattern | -s source_file] off len", .oneline= "writes a number of bytes at a specified offset", .help = write_help, }; @@ -974,7 +1037,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv) { struct timeval t1, t2; bool Cflag = false, qflag = false, bflag = false; -bool Pflag = false, zflag = false, cflag = false; +bool Pflag = false, zflag = false, cflag = false, sflag = false; int flags = 0; int c, cnt, ret; char *buf = NULL; @@ -983,8 +1046,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) /* Some compilers get confused and warn if this is not initialized. 
*/ int64_t total = 0; int pattern = 0xcd; +char *file_name = NULL; -while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) { +while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) { switch (c) { case 'b': bflag = true; @@ -1020,6 +1084,10 @@ static int write_f(BlockBackend *blk, int argc, char **argv) case 'z': zflag = true; break; +case 's': +sflag = true; +file_name = g_strdup(optarg); +break; default: qemuio_command_usage(&write_cmd); return -EINVAL; @@ -1051,8 +1119,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } -if (zflag && Pflag) { -printf("-z and -P cannot be specified at the same time\n"); +if ((int)zflag + (int)Pflag + (int)sflag > 1) { +printf("Only one of -z, -P, and -s" + "can be specified at the same time\n"); return -EINVAL; } @@ -1088,7 +1157,14 @@ static int write_f(BlockBackend *blk, int argc, char **argv) } if (!zflag) { -buf = qemu_io_alloc(blk, count, pattern); +if (sflag) { +buf = qemu_io_alloc_from_file(blk, count, file_name); +if (!buf) { +return -EINVAL; +} +} else { +buf = qemu_io_alloc(blk, count, pattern); +} } gettimeofday(&t1, NULL); -- 2.17.0
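For illustration, the new option is driven like the existing -P pattern switch; the pattern file contents are tiled across the whole write as implemented above. The image and pattern file names below are invented:

qemu-io -c 'write -s /tmp/pattern.bin 0 1M' test.qcow2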
Re: [Qemu-devel] [PATCH 0/3] migration: add zstd compression
ping ping ping! On 04.03.2019 18:10, Denis Plotnikov wrote: > ping! > > On 26.02.2019 16:15, Denis Plotnikov wrote: >> The zstd data compression algorithm shows better compression performance. >> It might be useful to employ the algorithm in VM migration to reduce CPU >> usage. >> A user will be able to choose between those algorithms, therefore a >> compress-type >> migration parameter is added. >> >> Here are some results of a performance comparison of zstd vs gzip: >> >> host: i7-4790 8xCPU @ 3.60GHz, 16G RAM >> migration to the same host >> VM: 2xVCPU, 8G RAM total >> 5G RAM used, memory populated with postgresql data >> produced by the pgbench performance benchmark >> >> >> Threads: 1 compress – 1 decompress >> >> zstd provides a slightly lower compression ratio with almost the same >> CPU usage but copes with RAM compression roughly 2 times faster >> >> compression type zlib | zstd >> - >> compression level 1 5 | 1 5 >> compression ratio 6.92 7.05 | 6.69 6.89 >> cpu idle, % 82 83 | 86 80 >> time, sec 49 71 | 26 31 >> time diff to zlib, sec -25 -41 >> >> >> Threads: 8 compress – 2 decompress >> >> zstd provides the same migration time with less cpu consumption >> >> compression type none | gzip(zlib) | zstd >> -- >> compression level - | 1 5 9 | 1 5 15 >> compression ratio - | 6.94 6.99 7.14 | 6.64 6.89 6.93 >> time, sec 154 | 22 23 27 | 23 23 25 >> cpu idle, % 99 | 45 30 12 | 70 52 23 >> cpu idle diff to zlib | | -25% -22% -11% >> >> >> Denis Plotnikov (3): >> migration: rework compression code for adding more data compressors >> hmp: add compress-type parameter to migration parameters >> migration: add zstd compression >> >> configure | 26 >> hmp.c | 8 ++ >> migration/migration.c | 45 ++- >> migration/migration.h | 1 + >> migration/qemu-file.c | 39 ++ >> migration/qemu-file.h | 18 ++- >> migration/ram.c | 291 ++ >> qapi/migration.json | 26 +++- >> 8 files changed, 369 insertions(+), 85 deletions(-) >> > -- Best, Denis
[PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839
From: "Denis V. Lunev" Linux guests submit IO requests no longer than PAGE_SIZE * max_seg field reported by SCSI controler. Thus typical sequential read with 1 MB size results in the following pattern of the IO from the guest: 8,16 115754 2.766095122 2071 D R 2095104 + 1008 [dd] 8,16 115755 2.766108785 2071 D R 2096112 + 1008 [dd] 8,16 115756 2.766113486 2071 D R 2097120 + 32 [dd] 8,16 115757 2.767668961 0 C R 2095104 + 1008 [0] 8,16 115758 2.768534315 0 C R 2096112 + 1008 [0] 8,16 115759 2.768539782 0 C R 2097120 + 32 [0] The IO was generated by dd if=/dev/sda of=/dev/null bs=1024 iflag=direct This effectively means that on rotational disks we will observe 3 IOPS for each 2 MBs processed. This definitely negatively affects both guest and host IO performance. The cure is relatively simple - we should report lengthy scatter-gather ability of the SCSI controller. Fortunately the situation here is very good. VirtIO transport layer can accomodate 1024 items in one request while we are using only 128. This situation is present since almost very beginning. 2 items are dedicated for request metadata thus we should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg. The following pattern is observed after the patch: 8,16 1 9921 2.662721340 2063 D R 2095104 + 1024 [dd] 8,16 1 9922 2.662737585 2063 D R 2096128 + 1024 [dd] 8,16 1 9923 2.665188167 0 C R 2095104 + 1024 [0] 8,16 1 9924 2.665198777 0 C R 2096128 + 1024 [0] which is much better. The dark side of this patch is that we are tweaking guest visible parameter, though this should be relatively safe as above transport layer support is present in QEMU/host Linux for a very long time. The patch adds configurable property for VirtIO SCSI with a new default and hardcode option for VirtBlock which does not provide good configurable framework. Unfortunately the commit can not be applied as is. For the real cure we need guest to be fixed to accomodate that queue length, which is done only in the latest 4.14 kernel. Thus we are going to expose the property and tweak it on machine type level. The problem with the old kernels is that they have max_segments <= virtqueue_size restriction which cause the guest crashing in the case of violation. To fix the case described above in the old kernels we can increase virtqueue_size to 256 and max_segments to 254. The pitfall here is that seabios allows the virtqueue_size-s < 128, however, the seabios patch extending that value to 256 is pending. CC: "Michael S. Tsirkin" CC: Stefan Hajnoczi CC: Kevin Wolf CC: Max Reitz CC: Gerd Hoffmann Signed-off-by: Denis V. 
Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 3 ++- hw/scsi/vhost-scsi.c| 2 ++ hw/scsi/virtio-scsi.c | 4 +++- include/hw/virtio/virtio-blk.h | 1 + include/hw/virtio/virtio-scsi.h | 1 + 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 06e57a4d39..b2eaeeaf67 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -903,7 +903,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) blk_get_geometry(s->blk, &capacity); memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); -virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2); +virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.max_segments); virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls); virtio_stl_p(vdev, &blkcfg.blk_size, blk_size); virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size); @@ -1240,6 +1240,7 @@ static Property virtio_blk_properties[] = { conf.max_discard_sectors, BDRV_REQUEST_MAX_SECTORS), DEFINE_PROP_UINT32("max-write-zeroes-sectors", VirtIOBlock, conf.max_write_zeroes_sectors, BDRV_REQUEST_MAX_SECTORS), +DEFINE_PROP_UINT32("max_segments", VirtIOBlock, conf.max_segments, 126), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 61e2e57da9..fa3b377807 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -242,6 +242,8 @@ static Property vhost_scsi_properties[] = { DEFINE_PROP_BIT64("t10_pi", VHostSCSICommon, host_features, VIRTIO_SCSI_F_T10_PI, false), +DEFINE_PROP_UINT32("max_segments", VirtIOSCSICommon, conf.max_segments, + 126), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 839f120256..8b070ddeed 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -650,7 +650,
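To spell out the arithmetic behind the commit message: the virtio transport allows VIRTQUEUE_MAX_SIZE = 1024 descriptors per request, 2 of which are reserved for request metadata (header/footer), and old guest kernels additionally require max_segments <= virtqueue_size:

fixed guests (>= 4.14):  max_seg = VIRTQUEUE_MAX_SIZE - 2 = 1024 - 2 = 1022
old guests, workaround:  virtqueue_size = 256, max_seg = 256 - 2 = 254
previous behaviour:      virtqueue_size = 128, max_seg = 128 - 2 = 126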
[PATCH] blockdev: modify blockdev-change-medium to change non-removable device
The modification is useful to workaround exclusive file access restrictions, e.g. to implement VM migration with shared disk stored on a storage with the exclusive file opening model: a destination VM is started waiting for incomming migration with a fake image drive, and later, on the last migration phase, the fake image file is replaced with the real one. Signed-off-by: Denis Plotnikov --- blockdev.c | 69 +++- hmp.c| 2 ++ qapi/block-core.json | 7 +++-- qmp.c| 3 +- 4 files changed, 57 insertions(+), 24 deletions(-) diff --git a/blockdev.c b/blockdev.c index d358169995..23f3465cfc 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2609,6 +2609,8 @@ void qmp_blockdev_change_medium(bool has_device, const char *device, bool has_format, const char *format, bool has_read_only, BlockdevChangeReadOnlyMode read_only, +bool has_medium_name, +const char *medium_name, Error **errp) { BlockBackend *blk; @@ -2667,29 +2669,56 @@ void qmp_blockdev_change_medium(bool has_device, const char *device, goto fail; } -rc = do_open_tray(has_device ? device : NULL, - has_id ? id : NULL, - false, &err); -if (rc && rc != -ENOSYS) { -error_propagate(errp, err); -goto fail; -} -error_free(err); -err = NULL; +if (blk_dev_has_removable_media(blk)) { +rc = do_open_tray(has_device ? device : NULL, + has_id ? id : NULL, + false, &err); +if (rc && rc != -ENOSYS) { +error_propagate(errp, err); +goto fail; +} +error_free(err); +err = NULL; -blockdev_remove_medium(has_device, device, has_id, id, &err); -if (err) { -error_propagate(errp, err); -goto fail; -} +blockdev_remove_medium(has_device, device, has_id, id, &err); +if (err) { +error_propagate(errp, err); +goto fail; +} -qmp_blockdev_insert_anon_medium(blk, medium_bs, &err); -if (err) { -error_propagate(errp, err); -goto fail; -} +qmp_blockdev_insert_anon_medium(blk, medium_bs, &err); +if (err) { +error_propagate(errp, err); +goto fail; +} + +qmp_blockdev_close_tray(has_device, device, has_id, id, errp); +} else { +if (!medium_name) { +error_setg(errp, "A medium name should be given"); +goto fail; +} -qmp_blockdev_close_tray(has_device, device, has_id, id, errp); +if (runstate_is_running()) { +error_setg(errp, "Can't set a medium for non-removable device " +"in a running VM"); +goto fail; +} + +if (strlen(blk_name(blk))) { +error_setg(errp, "The device already has a medium"); +goto fail; +} + +if (blk_insert_bs(blk, medium_bs, &err) < 0) { +error_propagate(errp, err); +goto fail; +} + +if (!monitor_add_blk(blk, medium_name, &err)) { +error_propagate(errp, err); +} +} fail: /* If the medium has been inserted, the device has its own reference, so diff --git a/hmp.c b/hmp.c index 8eec768088..fc7bac5b4b 100644 --- a/hmp.c +++ b/hmp.c @@ -1948,6 +1948,7 @@ void hmp_change(Monitor *mon, const QDict *qdict) const char *target = qdict_get_str(qdict, "target"); const char *arg = qdict_get_try_str(qdict, "arg"); const char *read_only = qdict_get_try_str(qdict, "read-only-mode"); +const char *target_name = qdict_get_try_str(qdict, "target-name"); BlockdevChangeReadOnlyMode read_only_mode = 0; Error *err = NULL; @@ -1982,6 +1983,7 @@ void hmp_change(Monitor *mon, const QDict *qdict) qmp_blockdev_change_medium(true, device, false, NULL, target, !!arg, arg, !!read_only, read_only_mode, + !!target_name, target_name, &err); } diff --git a/qapi/block-core.json b/qapi/block-core.json index 7ccbfff9d0..f493a7c737 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4769,6 +4769,8 @@ # @read-only-mode: change the read-only mode of the device; defaults # to 'retain' # +# @medium-name: 
drive-name when changing the media in non-removable devices +# ignored when changing media in removable devices # Since: 2.5 # # Examples: @@ -4807,9 +4809,8 @@ '*id'
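As a sketch of how the extended command would be used over QMP for a non-removable device (all values below are invented for illustration; with a removable device the command keeps its old tray-based behaviour and @medium-name is ignored):

{ "execute": "blockdev-change-medium",
  "arguments": { "id": "virtio0",
                 "filename": "/images/real-disk.qcow2",
                 "format": "qcow2",
                 "medium-name": "drive-virtio0" } }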
Re: [PATCH] blockdev: modify blockdev-change-medium to change non-removable device
On 18.10.2019 18:02, Max Reitz wrote: > On 18.10.19 14:09, Denis Plotnikov wrote: >> The modification is useful to workaround exclusive file access restrictions, >> e.g. to implement VM migration with shared disk stored on a storage with >> the exclusive file opening model: a destination VM is started waiting for >> incomming migration with a fake image drive, and later, on the last migration >> phase, the fake image file is replaced with the real one. >> >> Signed-off-by: Denis Plotnikov > Isn’t this what we would want to use reopen for? > > Max Could you please explain what is "use reopen"? Denis >
Re: [PATCH] blockdev: modify blockdev-change-medium to change non-removable device
On 22.10.2019 14:05, Max Reitz wrote: > On 21.10.19 08:50, Denis Plotnikov wrote: >> On 18.10.2019 18:02, Max Reitz wrote: >>> On 18.10.19 14:09, Denis Plotnikov wrote: >>>> The modification is useful to workaround exclusive file access >>>> restrictions, >>>> e.g. to implement VM migration with shared disk stored on a storage with >>>> the exclusive file opening model: a destination VM is started waiting for >>>> incomming migration with a fake image drive, and later, on the last >>>> migration >>>> phase, the fake image file is replaced with the real one. >>>> >>>> Signed-off-by: Denis Plotnikov >>> Isn’t this what we would want to use reopen for? >>> >>> Max >> Could you please explain what is "use reopen"? > I was thinking of using (x-)blockdev-reopen to change the file that is > used by the format node (e.g. from a null-co node to a real file); or to > change the filename of the protocol node. > > Kevin has pointed out (on IRC) that this will not allow you to change > the node that is directly attached to the device. While I don’t know > whether that’s really necessary in this case, if it were indeed > necessary, I’d prefer a method to change a guest device’s @drive option > because that seems more natural to me. > > In contrast, the approach taken in this patch seems not quite right to > me, because it overloads the whole blockdev-change-medium command with a > completely new and different implementation based on whether there’s a > removable medium or not. If the implementation is so different (and the > interface is, too, because in one path you must give @medium whereas the > other doesn’t evaluate it at all), it should be a new command. > > I don’t know whether we need a new command at all, though. On the node > level, we have (x-)blockdev-reopen. So assuming we need something to > change the link between the guest device and the block layer, I wonder > whether there isn’t something similar; specifically, I’d prefer > something to simply change the device’s @drive option. > > Kevin has pointed out (on IRC again) that there is indeed one such > command, and that’s qom-set. Unfortunately, this is what happens if you > try to use it for @drive: > > {"error": {"class": "GenericError", "desc": "Attempt to set property > 'drive' on anonymous device (type 'virtio-blk-device') after it was > realized"}} > > However, Kevin has claimed it would be technically possible to make an > exception for @drive. Maybe this is worth investigating? Is there any guess how complex it might be? In the case if it's quite complex may be it's worth to make the separate command? > > > (As for blockdev-change-medium, as I’ve said, I don’t really think this > fits there. Furthermore, blockdev-change-medium is kind of a legacy > command because I think every command but blockdev-add that does a > bdrv_open() kind of is a legacy command. Out of curiosity, could you please explain why it's decided to be so? > So if anything, it should be a > new command that then takes a node-name. > But OTOH, it would be a bit strange to add a separate command for > something that in theory should be covered by qom-set @drive.) > > Max >
Re: [PATCH] blockdev: modify blockdev-change-medium to change non-removable device
On 22.10.2019 16:18, Max Reitz wrote: > On 22.10.19 14:53, Denis Plotnikov wrote: >> On 22.10.2019 14:05, Max Reitz wrote: >>> On 21.10.19 08:50, Denis Plotnikov wrote: >>>> On 18.10.2019 18:02, Max Reitz wrote: >>>>> On 18.10.19 14:09, Denis Plotnikov wrote: >>>>>> The modification is useful to workaround exclusive file access >>>>>> restrictions, >>>>>> e.g. to implement VM migration with shared disk stored on a storage with >>>>>> the exclusive file opening model: a destination VM is started waiting for >>>>>> incomming migration with a fake image drive, and later, on the last >>>>>> migration >>>>>> phase, the fake image file is replaced with the real one. >>>>>> >>>>>> Signed-off-by: Denis Plotnikov >>>>> Isn’t this what we would want to use reopen for? >>>>> >>>>> Max >>>> Could you please explain what is "use reopen"? >>> I was thinking of using (x-)blockdev-reopen to change the file that is >>> used by the format node (e.g. from a null-co node to a real file); or to >>> change the filename of the protocol node. >>> >>> Kevin has pointed out (on IRC) that this will not allow you to change >>> the node that is directly attached to the device. While I don’t know >>> whether that’s really necessary in this case, if it were indeed >>> necessary, I’d prefer a method to change a guest device’s @drive option >>> because that seems more natural to me. >>> >>> In contrast, the approach taken in this patch seems not quite right to >>> me, because it overloads the whole blockdev-change-medium command with a >>> completely new and different implementation based on whether there’s a >>> removable medium or not. If the implementation is so different (and the >>> interface is, too, because in one path you must give @medium whereas the >>> other doesn’t evaluate it at all), it should be a new command. >>> >>> I don’t know whether we need a new command at all, though. On the node >>> level, we have (x-)blockdev-reopen. So assuming we need something to >>> change the link between the guest device and the block layer, I wonder >>> whether there isn’t something similar; specifically, I’d prefer >>> something to simply change the device’s @drive option. >>> >>> Kevin has pointed out (on IRC again) that there is indeed one such >>> command, and that’s qom-set. Unfortunately, this is what happens if you >>> try to use it for @drive: >>> >>> {"error": {"class": "GenericError", "desc": "Attempt to set property >>> 'drive' on anonymous device (type 'virtio-blk-device') after it was >>> realized"}} >>> >>> However, Kevin has claimed it would be technically possible to make an >>> exception for @drive. Maybe this is worth investigating? >> Is there any guess how complex it might be? In the case if it's quite >> complex may be it's worth to make the separate command? > I can translate the chat log for you: > > In theory that’s called qom-set > However, I believe it doesn’t support qdev properties > Hm, but that could be changed specifically for the drive property > qdev keeps confusing me. Drive isn’t supposed to call > qdev_prop_set_after_realize(), but the error message’s still there. > Where is that hidden call...? > Ah, set_pointer() does > Yes, then it should be possible to make that work rather locally > > And that took him about 10 minutes. > > So I suppose it would be to check in set_drive() and > set_drive_iothread() whether the device is already realized, and if so, > divert it to some other function that does the runtime change? ok, that might be a good starting point for me. Thanks. 
> > (No idea how the qdev maintainers think about doing that in set_drive() > and set_drive_iothread(), though) > >>> (As for blockdev-change-medium, as I’ve said, I don’t really think this >>> fits there. Furthermore, blockdev-change-medium is kind of a legacy >>> command because I think every command but blockdev-add that does a >>> bdrv_open() kind of is a legacy command. >> Out of curiosity, could you please explain why it's decided to be so? > Because we have blockdev-add, which supports all block device options > there are and so on. blockdev-change-medium (which is basically just a > more rigid “change”) only gets filename, which isn’t as expressive. > > We generally want users to add new nodes with blockdev-add and let all > other commands only take node-names. > > (There’s also the fact that historically we’ve used filenames to > identify BlockDriverStates, but that doesn’t work so well. Thus I think > we should get away from using filenames as much as we can so people > don’t use them for identification again.) > > Max Thanks for the explanation, Max! Denis >
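To visualize the direction discussed above, the local change in hw/core/qdev-properties-system.c would be roughly of this shape (a sketch only, assuming a separate runtime path exists; set_drive_realized() is a made-up name — the series posted later in this thread implements it as do_parse_drive_realized()):

static void set_drive(Object *obj, Visitor *v, const char *name,
                      void *opaque, Error **errp)
{
    DeviceState *dev = DEVICE(obj);

    if (dev->realized) {
        /* swap the block backend of an already realized device instead
         * of failing with "after it was realized" */
        set_drive_realized(obj, v, name, opaque, errp);
        return;
    }

    /* ... existing not-yet-realized parse/attach path ... */
}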
Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839
On 21.10.2019 16:24, Stefan Hajnoczi wrote: > On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote: >> From: "Denis V. Lunev" >> >> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg >> field reported by SCSI controler. Thus typical sequential read with >> 1 MB size results in the following pattern of the IO from the guest: >>8,16 115754 2.766095122 2071 D R 2095104 + 1008 [dd] >>8,16 115755 2.766108785 2071 D R 2096112 + 1008 [dd] >>8,16 115756 2.766113486 2071 D R 2097120 + 32 [dd] >>8,16 115757 2.767668961 0 C R 2095104 + 1008 [0] >>8,16 115758 2.768534315 0 C R 2096112 + 1008 [0] >>8,16 115759 2.768539782 0 C R 2097120 + 32 [0] >> The IO was generated by >>dd if=/dev/sda of=/dev/null bs=1024 iflag=direct >> >> This effectively means that on rotational disks we will observe 3 IOPS >> for each 2 MBs processed. This definitely negatively affects both >> guest and host IO performance. >> >> The cure is relatively simple - we should report lengthy scatter-gather >> ability of the SCSI controller. Fortunately the situation here is very >> good. VirtIO transport layer can accomodate 1024 items in one request >> while we are using only 128. This situation is present since almost >> very beginning. 2 items are dedicated for request metadata thus we >> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg. >> >> The following pattern is observed after the patch: >>8,16 1 9921 2.662721340 2063 D R 2095104 + 1024 [dd] >>8,16 1 9922 2.662737585 2063 D R 2096128 + 1024 [dd] >>8,16 1 9923 2.665188167 0 C R 2095104 + 1024 [0] >>8,16 1 9924 2.665198777 0 C R 2096128 + 1024 [0] >> which is much better. >> >> The dark side of this patch is that we are tweaking guest visible >> parameter, though this should be relatively safe as above transport >> layer support is present in QEMU/host Linux for a very long time. >> The patch adds configurable property for VirtIO SCSI with a new default >> and hardcode option for VirtBlock which does not provide good >> configurable framework. >> >> Unfortunately the commit can not be applied as is. For the real cure we >> need guest to be fixed to accomodate that queue length, which is done >> only in the latest 4.14 kernel. Thus we are going to expose the property >> and tweak it on machine type level. >> >> The problem with the old kernels is that they have >> max_segments <= virtqueue_size restriction which cause the guest >> crashing in the case of violation. >> To fix the case described above in the old kernels we can increase >> virtqueue_size to 256 and max_segments to 254. The pitfall here is >> that seabios allows the virtqueue_size-s < 128, however, the seabios >> patch extending that value to 256 is pending. > If I understand correctly you are relying on Indirect Descriptor support > in the guest driver in order to exceed the Virtqueue Descriptor Table > size. > > Unfortunately the "max_segments <= virtqueue_size restriction" is > required by the VIRTIO 1.1 specification: > >2.6.5.3.1 Driver Requirements: Indirect Descriptors > >A driver MUST NOT create a descriptor chain longer than the Queue >Size of the device. > > So this idea seems to be in violation of the specification? > > There is a bug in hw/block/virtio-blk.c:virtio_blk_update_config() and > hw/scsi/virtio-scsi.c:virtio_scsi_get_config(): > >virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2); > > This number should be the minimum of blk_get_max_iov() and > virtio_queue_get_num(), minus 2 for the header and footer. 
Stefan, It seems VirtIOSCSI doesn't have a direct link to a blk, apart from VirtIOBlock->blk, and the link to a blk comes with each scsi request. I suspect the idea here is that a single virtio-scsi controller can serve several blk-s. If my assumption is correct, then we can't get blk_get_max_iov() at the virtio-scsi configuration stage, and we shouldn't take max_iov into account but limit max_segments with virtio_queue_get_num()-2 only. Is that so, or are there any other details to take into account? Thanks! Denis > > I looked at the Linux SCSI driver code and it seems each HBA has a > single max_segments number - it does not vary on a per-device basis. > This could be a problem if two host block device with different > max_segments are exposed to the guest through the same virtio-scsi > controller. Another bug? :( > > Anyway, if you want ~1024 descriptors you should set Queue Size to 1024. > I don't see a spec-compliant way of doing it otherwise. Hopefully I > have overlooked something and there is a nice way to solve this. > > Stefan
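For virtio-blk, where the BlockBackend is known at configuration time, the clamp Stefan describes would look something like the following (sketch only; per the question above it is not obvious what to clamp against for virtio-scsi, which has no single blk):

/* seg_max bounded by both the backend iovec limit and the queue size,
 * minus 2 descriptors reserved for the request header/footer */
uint32_t seg_max = MIN(blk_get_max_iov(s->blk),
                       virtio_queue_get_num(vdev, 0)) - 2;
virtio_stl_p(vdev, &blkcfg.seg_max, seg_max);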
Re: [PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 12.02.2020 18:43, Stefan Hajnoczi wrote: On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote: The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 4 ++-- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 09f46ed85f..6df3a7a6df 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); virtio_stl_p(vdev, &blkcfg.seg_max, - s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2); + s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2); This value must not change on older machine types. Yes, that's true, but .. So does this patch need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine types get 126 instead of 254? If we set seg-max-adjust "on" in older machine types, the setups using them and having queue_sizes set , for example, 1024 will also set seg_max to 1024 - 2 which isn't the expected behavior: older mt didn't change seg_max in that case and stuck with 128 - 2. So, should we, instead, leave the default 128 - 2, for seg_max? Denis virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls); virtio_stl_p(vdev, &blkcfg.blk_size, blk_size); virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size); @@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = { DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), -DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), +DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), diff --git a/hw/core/machine.c b/hw/core/machine.c index 2501b540ec..3427d6cf4c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,8 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, { "virtio-blk-device", "seg-max-adjust", "off"}, { "virtio-scsi-device", "seg_max_adjust", "off"}, diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 3b61563609..b38f50a429 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -660,7 +660,7 @@ static void virtio_scsi_get_config(VirtIODevice *vdev, virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues); virtio_stl_p(vdev, &scsiconf->seg_max, - s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 2); + s->conf.seg_max_adjust ? 
s->conf.virtqueue_size - 2 : 256 - 2); virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors); virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun); virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent)); @@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp) static Property virtio_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI, - parent_obj.conf.virtqueue_size, 128), + parent_obj.conf.virtqueue_size, 256), DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI, parent_obj.conf.seg_max_adjust, true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors, -- 2.17.0
Re: [PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 13.02.2020 12:08, Stefan Hajnoczi wrote: On Thu, Feb 13, 2020 at 11:08:35AM +0300, Denis Plotnikov wrote: On 12.02.2020 18:43, Stefan Hajnoczi wrote: On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote: The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 4 ++-- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 09f46ed85f..6df3a7a6df 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); virtio_stl_p(vdev, &blkcfg.seg_max, - s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2); + s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2); This value must not change on older machine types. Yes, that's true, but .. So does this patch need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine types get 126 instead of 254? If we set seg-max-adjust "on" in older machine types, the setups using them and having queue_sizes set , for example, 1024 will also set seg_max to 1024 - 2 which isn't the expected behavior: older mt didn't change seg_max in that case and stuck with 128 - 2. So, should we, instead, leave the default 128 - 2, for seg_max? Argh! Good point :-). How about a seg_max_default property that is initialized to 254 for modern machines and 126 to old machines? Hmm, but we'll achieve the same but with more code changes, don't we? 254 is because the queue-size is 256. We gonna leave 128-2 for older machine types just for not breaking anything. All other seg_max adjustment is provided by seg_max_adjust which is "on" by default in modern machine types. to summarize: modern mt defaults: seg_max_adjust = on queue_size = 256 => default seg_max = 254 => changing queue-size will change seg_max = queue_size - 2 old mt defaults: seg_max_adjust = off queue_size = 128 => default seg_max = 126 => changing queue-size won't change seg_max, it's always = 126 like it was before Denis Stefan
Re: [PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 13.02.2020 14:45, Stefan Hajnoczi wrote: On Thu, Feb 13, 2020 at 12:28:25PM +0300, Denis Plotnikov wrote: On 13.02.2020 12:08, Stefan Hajnoczi wrote: On Thu, Feb 13, 2020 at 11:08:35AM +0300, Denis Plotnikov wrote: On 12.02.2020 18:43, Stefan Hajnoczi wrote: On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote: The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 4 ++-- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 09f46ed85f..6df3a7a6df 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); virtio_stl_p(vdev, &blkcfg.seg_max, - s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2); + s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2); This value must not change on older machine types. Yes, that's true, but .. So does this patch need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine types get 126 instead of 254? If we set seg-max-adjust "on" in older machine types, the setups using them and having queue_sizes set , for example, 1024 will also set seg_max to 1024 - 2 which isn't the expected behavior: older mt didn't change seg_max in that case and stuck with 128 - 2. So, should we, instead, leave the default 128 - 2, for seg_max? Argh! Good point :-). How about a seg_max_default property that is initialized to 254 for modern machines and 126 to old machines? Hmm, but we'll achieve the same but with more code changes, don't we? 254 is because the queue-size is 256. We gonna leave 128-2 for older machine types just for not breaking anything. All other seg_max adjustment is provided by seg_max_adjust which is "on" by default in modern machine types. to summarize: modern mt defaults: seg_max_adjust = on queue_size = 256 => default seg_max = 254 => changing queue-size will change seg_max = queue_size - 2 old mt defaults: seg_max_adjust = off queue_size = 128 => default seg_max = 126 => changing queue-size won't change seg_max, it's always = 126 like it was before You're right! The only strange case is a modern machine type with seg_max_adjust=off, where queue_size will be 256 but seg_max will be 126. But no user would want to disable seg_max_adjust, so it's okay. I agree with you that the line of code can remain unchanged: /* * Only old machine types use seg_max_adjust=off and there the default * value of queue_size is 128. */ virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2); Stefan Ok, I'll resend the patch sortly Thanks! Denis
[PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
v1: * seg_max default value changing removed --- The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 2 +- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 09f46ed85f..142863a3b2 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = { DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), -DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), +DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), diff --git a/hw/core/machine.c b/hw/core/machine.c index 2501b540ec..3427d6cf4c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,8 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, { "virtio-blk-device", "seg-max-adjust", "off"}, { "virtio-scsi-device", "seg_max_adjust", "off"}, diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 3b61563609..472bbd233b 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp) static Property virtio_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI, - parent_obj.conf.virtqueue_size, 128), + parent_obj.conf.virtqueue_size, 256), DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI, parent_obj.conf.seg_max_adjust, true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors, -- 2.17.0
[PATCH v3] virtio: increase virtqueue size for virtio-scsi and virtio-blk
The goal is to reduce the number of requests issued by a guest on 1M reads/writes. This raises performance by up to 4% on that kind of disk access pattern. The maximum chunk size to be used for guest disk access is limited by the seg_max parameter, which represents the maximum number of pieces in the scatter-gather list in one guest disk request. Since seg_max is virtqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read from or written to a guest disk. More details are in the original problem statement: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- v3: * typos fixed v2: * seg_max default value changing removed --- hw/block/virtio-blk.c | 2 +- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 09f46ed85f..142863a3b2 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = { DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), -DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), +DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), diff --git a/hw/core/machine.c b/hw/core/machine.c index 2501b540ec..3427d6cf4c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,8 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, { "virtio-blk-device", "seg-max-adjust", "off"}, { "virtio-scsi-device", "seg_max_adjust", "off"}, diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 3b61563609..472bbd233b 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp) static Property virtio_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI, - parent_obj.conf.virtqueue_size, 128), + parent_obj.conf.virtqueue_size, 256), DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI, parent_obj.conf.seg_max_adjust, true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors, -- 2.17.0
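With seg_max_adjust left at its new default of true, the advertised seg_max simply tracks the queue size, so larger requests can be enabled per device from the command line; an illustrative fragment (device/drive ids are invented):

-device virtio-blk-pci,drive=drive0,queue-size=1024      # seg_max = 1022
-device virtio-scsi-pci,id=scsi0,virtqueue_size=1024     # seg_max = 1022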
Re: [PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 18.02.2020 16:53, Stefan Hajnoczi wrote: On Thu, Feb 13, 2020 at 05:59:27PM +0300, Denis Plotnikov wrote: v1: * seg_max default value changing removed --- The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 2 +- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) I fixed up the "virtuqueue" typo in the commit message and the mis-formatted commit description (git-am(1) stops including lines after the first "---"). Actually, I sent the corrected version v3 of the patch last week. But it seems it got lost among that gigantic patch flow in the mailing list :) Thanks for applying! Denis Thanks, applied to my block tree: https://github.com/stefanha/qemu/commits/block Stefan
Re: [PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 18.02.2020 16:59, Denis Plotnikov wrote: On 18.02.2020 16:53, Stefan Hajnoczi wrote: On Thu, Feb 13, 2020 at 05:59:27PM +0300, Denis Plotnikov wrote: v1: * seg_max default value changing removed --- The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 2 +- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) I fixed up the "virtuqueue" typo in the commit message and the mis-formatted commit description (git-am(1) stops including lines after the first "---"). Actually, I sent the corrected version v3 of the patch last week. But it seems it got lost among that gigantic patch flow in the mailing list :) Thanks for applying! Denis Thanks, applied to my block tree: https://github.com/stefanha/qemu/commits/block Stefan I'm going to send the test checking the virtqueue-sizes for machine types a little bit later. Denis
Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839
Ping! On 25.11.2019 12:16, Denis Plotnikov wrote: > > > On 06.11.2019 15:03, Michael S. Tsirkin wrote: >> On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote: >>> On 10/24/19 12:28 AM, Michael S. Tsirkin wrote: >>>> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote: >>>>> From: "Denis V. Lunev" >>>>> >>>>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg >>>>> field reported by SCSI controler. Thus typical sequential read with >>>>> 1 MB size results in the following pattern of the IO from the guest: >>>>> 8,16 1 15754 2.766095122 2071 D R 2095104 + 1008 [dd] >>>>> 8,16 1 15755 2.766108785 2071 D R 2096112 + 1008 [dd] >>>>> 8,16 1 15756 2.766113486 2071 D R 2097120 + 32 [dd] >>>>> 8,16 1 15757 2.767668961 0 C R 2095104 + 1008 [0] >>>>> 8,16 1 15758 2.768534315 0 C R 2096112 + 1008 [0] >>>>> 8,16 1 15759 2.768539782 0 C R 2097120 + 32 [0] >>>>> The IO was generated by >>>>> dd if=/dev/sda of=/dev/null bs=1024 iflag=direct >>>>> >>>>> This effectively means that on rotational disks we will observe 3 >>>>> IOPS >>>>> for each 2 MBs processed. This definitely negatively affects both >>>>> guest and host IO performance. >>>>> >>>>> The cure is relatively simple - we should report lengthy >>>>> scatter-gather >>>>> ability of the SCSI controller. Fortunately the situation here is >>>>> very >>>>> good. VirtIO transport layer can accomodate 1024 items in one request >>>>> while we are using only 128. This situation is present since almost >>>>> very beginning. 2 items are dedicated for request metadata thus we >>>>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg. >>>>> >>>>> The following pattern is observed after the patch: >>>>> 8,16 1 9921 2.662721340 2063 D R 2095104 + 1024 [dd] >>>>> 8,16 1 9922 2.662737585 2063 D R 2096128 + 1024 [dd] >>>>> 8,16 1 9923 2.665188167 0 C R 2095104 + 1024 [0] >>>>> 8,16 1 9924 2.665198777 0 C R 2096128 + 1024 [0] >>>>> which is much better. >>>>> >>>>> The dark side of this patch is that we are tweaking guest visible >>>>> parameter, though this should be relatively safe as above transport >>>>> layer support is present in QEMU/host Linux for a very long time. >>>>> The patch adds configurable property for VirtIO SCSI with a new >>>>> default >>>>> and hardcode option for VirtBlock which does not provide good >>>>> configurable framework. >>>>> >>>>> Unfortunately the commit can not be applied as is. For the real >>>>> cure we >>>>> need guest to be fixed to accomodate that queue length, which is done >>>>> only in the latest 4.14 kernel. Thus we are going to expose the >>>>> property >>>>> and tweak it on machine type level. >>>>> >>>>> The problem with the old kernels is that they have >>>>> max_segments <= virtqueue_size restriction which cause the guest >>>>> crashing in the case of violation. >>>> This isn't just in the guests: virtio spec also seems to imply this, >>>> or at least be vague on this point. >>>> >>>> So I think it'll need a feature bit. >>>> Doing that in a safe way will also allow being compatible with old >>>> guests. >>>> >>>> The only downside is it's a bit more work as we need to >>>> spec this out and add guest support. >>>> >>>>> To fix the case described above in the old kernels we can increase >>>>> virtqueue_size to 256 and max_segments to 254. The pitfall here is >>>>> that seabios allows the virtqueue_size-s < 128, however, the seabios >>>>> patch extending that value to 256 is pending. >>>> And the fix here is just to limit large vq size to virtio 1.0. 
>>>> In that mode it's fine I think: >>>> >>>> >>>> /* check if the queue is available */ >>>> if (vp->use_modern) { >>>> num
[PING] Re: [PATCH v0 2/2] block: allow to set 'drive' property on a realized block device
On 18.11.2019 13:50, Denis Plotnikov wrote: > > > On 10.11.2019 22:08, Denis Plotnikov wrote: >> >> On 10.11.2019 22:03, Denis Plotnikov wrote: >>> This allows to change (replace) the file on a block device and is >>> useful >>> to workaround exclusive file access restrictions, e.g. to implement VM >>> migration with a shared disk stored on some storage with the exclusive >>> file opening model: a destination VM is started waiting for incomming >>> migration with a fake image drive, and later, on the last migration >>> phase, the fake image file is replaced with the real one. >>> >>> Signed-off-by: Denis Plotnikov >>> --- >>> hw/core/qdev-properties-system.c | 89 >>> +++- >>> 1 file changed, 77 insertions(+), 12 deletions(-) >>> >>> diff --git a/hw/core/qdev-properties-system.c >>> b/hw/core/qdev-properties-system.c >>> index c534590dcd..aaab1370a4 100644 >>> --- a/hw/core/qdev-properties-system.c >>> +++ b/hw/core/qdev-properties-system.c >>> @@ -79,8 +79,55 @@ static void set_pointer(Object *obj, Visitor *v, >>> Property *prop, >>> /* --- drive --- */ >>> -static void do_parse_drive(DeviceState *dev, const char *str, >>> void **ptr, >>> - const char *propname, bool iothread, >>> Error **errp) >>> +static void do_parse_drive_realized(DeviceState *dev, const char *str, >>> + void **ptr, const char *propname, >>> + bool iothread, Error **errp) >>> +{ >>> + BlockBackend *blk = *ptr; >>> + BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL); >>> + int ret; >>> + bool blk_created = false; >>> + >>> + if (!bs) { >>> + error_setg(errp, "Can't find blockdev '%s'", str); >>> + return; >>> + } >>> + >>> + if (!blk) { >>> + AioContext *ctx = iothread ? bdrv_get_aio_context(bs) : >>> + qemu_get_aio_context(); >>> + blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL); >>> + blk_created = true; >> >> Actually, I have concerns about situation where blk=null. >> >> Is there any case when scsi-hd (or others) doesn't have a blk >> assigned and it's legal? >> >>> + } else { >>> + if (blk_bs(blk)) { >>> + blk_remove_bs(blk); >>> + } >>> + } >>> + >>> + ret = blk_insert_bs(blk, bs, errp); >>> + >>> + if (!ret && blk_created) { >>> + if (blk_attach_dev(blk, dev) < 0) { >>> + /* >>> + * Shouldn't be any errors here since we just created >>> + * the new blk because the device doesn't have any. >>> + * Leave the message here in case blk_attach_dev is >>> changed >>> + */ >>> + error_setg(errp, "Can't attach drive '%s' to device >>> '%s'", >>> + str, object_get_typename(OBJECT(dev))); >>> + } else { >>> + *ptr = blk; >>> + } >>> + } > Another problem here, is that the "size" of the device dev may not > match after setting a drive. > So, we should update it after the drive setting. > It was found, that it could be done by calling > BlockDevOps.bdrv_parent_cb_resize. > > But I have some concerns about doing it so. In the case of virtio scsi > disk we have the following callstack > > bdrv_parent_cb_resize calls() -> > scsi_device_report_change(dev, SENSE_CODE(CAPACITY_CHANGED)) -> > virtio_scsi_change -> > virtio_scsi_push_event(s, dev, > VIRTIO_SCSI_T_PARAM_CHANGE, > sense.asc | > (sense.ascq << 8)); > > > virtio_scsi_change pushes the event to the guest to make the guest > ask for size refreshing. > If I'm not mistaken, here we can get a race condition when some > another request is processed with an unchanged > size and then the size changing request is processed. > > I didn't find a better way to update device size so any comments are > welcome. > > Thanks! 
> > Denis >>> + >>> + if (blk_created) { >>> + blk_unref(blk); >&
[PING] [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839
On 05.12.2019 10:59, Denis Plotnikov wrote: > Ping! > > On 25.11.2019 12:16, Denis Plotnikov wrote: >> >> >> On 06.11.2019 15:03, Michael S. Tsirkin wrote: >>> On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote: >>>> On 10/24/19 12:28 AM, Michael S. Tsirkin wrote: >>>>> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote: >>>>>> From: "Denis V. Lunev" >>>>>> >>>>>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg >>>>>> field reported by SCSI controler. Thus typical sequential read with >>>>>> 1 MB size results in the following pattern of the IO from the guest: >>>>>> 8,16 1 15754 2.766095122 2071 D R 2095104 + 1008 >>>>>> [dd] >>>>>> 8,16 1 15755 2.766108785 2071 D R 2096112 + 1008 >>>>>> [dd] >>>>>> 8,16 1 15756 2.766113486 2071 D R 2097120 + 32 [dd] >>>>>> 8,16 1 15757 2.767668961 0 C R 2095104 + 1008 [0] >>>>>> 8,16 1 15758 2.768534315 0 C R 2096112 + 1008 [0] >>>>>> 8,16 1 15759 2.768539782 0 C R 2097120 + 32 [0] >>>>>> The IO was generated by >>>>>> dd if=/dev/sda of=/dev/null bs=1024 iflag=direct >>>>>> >>>>>> This effectively means that on rotational disks we will observe 3 >>>>>> IOPS >>>>>> for each 2 MBs processed. This definitely negatively affects both >>>>>> guest and host IO performance. >>>>>> >>>>>> The cure is relatively simple - we should report lengthy >>>>>> scatter-gather >>>>>> ability of the SCSI controller. Fortunately the situation here is >>>>>> very >>>>>> good. VirtIO transport layer can accomodate 1024 items in one >>>>>> request >>>>>> while we are using only 128. This situation is present since almost >>>>>> very beginning. 2 items are dedicated for request metadata thus we >>>>>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg. >>>>>> >>>>>> The following pattern is observed after the patch: >>>>>> 8,16 1 9921 2.662721340 2063 D R 2095104 + 1024 >>>>>> [dd] >>>>>> 8,16 1 9922 2.662737585 2063 D R 2096128 + 1024 >>>>>> [dd] >>>>>> 8,16 1 9923 2.665188167 0 C R 2095104 + 1024 [0] >>>>>> 8,16 1 9924 2.665198777 0 C R 2096128 + 1024 [0] >>>>>> which is much better. >>>>>> >>>>>> The dark side of this patch is that we are tweaking guest visible >>>>>> parameter, though this should be relatively safe as above transport >>>>>> layer support is present in QEMU/host Linux for a very long time. >>>>>> The patch adds configurable property for VirtIO SCSI with a new >>>>>> default >>>>>> and hardcode option for VirtBlock which does not provide good >>>>>> configurable framework. >>>>>> >>>>>> Unfortunately the commit can not be applied as is. For the real >>>>>> cure we >>>>>> need guest to be fixed to accomodate that queue length, which is >>>>>> done >>>>>> only in the latest 4.14 kernel. Thus we are going to expose the >>>>>> property >>>>>> and tweak it on machine type level. >>>>>> >>>>>> The problem with the old kernels is that they have >>>>>> max_segments <= virtqueue_size restriction which cause the guest >>>>>> crashing in the case of violation. >>>>> This isn't just in the guests: virtio spec also seems to imply this, >>>>> or at least be vague on this point. >>>>> >>>>> So I think it'll need a feature bit. >>>>> Doing that in a safe way will also allow being compatible with old >>>>> guests. >>>>> >>>>> The only downside is it's a bit more work as we need to >>>>> spec this out and add guest support. >>>>> >>>>>> To fix the case described above in the old kernels we can increase >>>>>> virtqueue_size to 256 an
[PATCH v4 0/2] virtio: make seg_max virtqueue size dependent
v4: * rebased on 4.2 [MST] v3: * add property to set in machine type [MST] * add min queue size check [Stefan] * add avocado based test [Max, Stefan, Eduardo, Cleber] v2: * the standalone patch to make seg_max virtqueue size dependent * other patches are postponed v1: the initial series Denis Plotnikov (2): virtio: make seg_max virtqueue size dependent tests: add virtio-scsi and virtio-blk seg_max_adjust test hw/block/virtio-blk.c | 9 +- hw/core/machine.c | 3 + hw/scsi/vhost-scsi.c | 2 + hw/scsi/virtio-scsi.c | 10 +- include/hw/virtio/virtio-blk.h| 1 + include/hw/virtio/virtio-scsi.h | 1 + tests/acceptance/virtio_seg_max_adjust.py | 135 ++ 7 files changed, 159 insertions(+), 2 deletions(-) create mode 100755 tests/acceptance/virtio_seg_max_adjust.py -- 2.17.0
[PATCH v4 2/2] tests: add virtio-scsi and virtio-blk seg_max_adjust test
It tests proper seg_max_adjust settings for all machine types except 'none', 'isapc', 'microvm' Signed-off-by: Denis Plotnikov --- tests/acceptance/virtio_seg_max_adjust.py | 135 ++ 1 file changed, 135 insertions(+) create mode 100755 tests/acceptance/virtio_seg_max_adjust.py diff --git a/tests/acceptance/virtio_seg_max_adjust.py b/tests/acceptance/virtio_seg_max_adjust.py new file mode 100755 index 00..00cf2565d9 --- /dev/null +++ b/tests/acceptance/virtio_seg_max_adjust.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# +# Test virtio-scsi and virtio-blk queue settings for all machine types +# +# Copyright (c) 2019 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import sys +import os +import re + +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'python')) +from qemu.machine import QEMUMachine +from avocado_qemu import Test + +#list of machine types and virtqueue properties to test +VIRTIO_SCSI_PROPS = {'seg_max_adjust': 'seg_max_adjust'} +VIRTIO_BLK_PROPS = {'seg_max_adjust': 'seg-max-adjust'} + +DEV_TYPES = {'virtio-scsi-pci': VIRTIO_SCSI_PROPS, + 'virtio-blk-pci': VIRTIO_BLK_PROPS} + +VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'], + 'virtio-blk-pci': ['-device', +'virtio-blk-pci,id=scsi0,drive=drive0', +'-drive', +'driver=null-co,id=drive0,if=none']} + + +class VirtioMaxSegSettingsCheck(Test): +@staticmethod +def make_pattern(props): +pattern_items = ['{0} = \w+'.format(prop) for prop in props] +return '|'.join(pattern_items) + +def query_virtqueue(self, vm, dev_type_name): +query_ok = False +error = None +props = None + +output = vm.command('human-monitor-command', +command_line = 'info qtree') +props_list = DEV_TYPES[dev_type_name].values(); +pattern = self.make_pattern(props_list) +res = re.findall(pattern, output) + +if len(res) != len(props_list): +props_list = set(props_list) +res = set(res) +not_found = props_list.difference(res) +not_found = ', '.join(not_found) +error = '({0}): The following properties not found: {1}'\ + .format(dev_type_name, not_found) +else: +query_ok = True +props = dict() +for prop in res: +p = prop.split(' = ') +props[p[0]] = p[1] +return query_ok, props, error + +def check_mt(self, mt, dev_type_name): +with QEMUMachine(self.qemu_bin) as vm: +vm.set_machine(mt["name"]) +for s in VM_DEV_PARAMS[dev_type_name]: +vm.add_args(s) +vm.launch() +query_ok, props, error = self.query_virtqueue(vm, dev_type_name) + +if not query_ok: +self.fail('machine type {0}: {1}'.format(mt['name'], error)) + +for prop_name, prop_val in props.items(): +expected_val = mt[prop_name] +self.assertEqual(expected_val, prop_val) + +@staticmethod +def seg_max_adjust_enabled(mt): +# machine types > 4.2 should have seg_max_adjust = true +# others seg_max_adjust = false +mt = mt.split("-") + +# machine types with one line name and name like pc-x.x +if len(mt) <= 2: +return False + +# machine types like pc--x.x[.x] +ver = 
mt[2] +ver = ver.split("."); + +# all versions greater than 4.2 goes with seg_max_adjust enabled +major = int(ver[0]) +minor = int(ver[1]) + +if major > 4 or (major == 4 and minor > 2): +return True +return False + +def test_machine_types(self): +# collect all machine types except 'none', 'isapc', 'microvm' +with QEMUMachine(self.qemu_
[PATCH v4 1/2] virtio: make seg_max virtqueue size dependent
Before the patch, seg_max parameter was immutable and hardcoded to 126 (128 - 2) without respect to queue size. This has two negative effects: 1. when queue size is < 128, we have Virtio 1.1 specfication violation: (2.6.5.3.1 Driver Requirements) seq_max must be <= queue_size. This violation affects the old Linux guests (ver < 4.14). These guests crash on these queue_size setups. 2. when queue_size > 128, as was pointed out by Denis Lunev , seg_max restrics guest's block request length which affects guests' performance making them issues more block request than needed. https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html To mitigate this two effects, the patch adds the property adjusting seg_max to queue size automaticaly. Since seg_max is a guest visible parameter, the property is machine type managable and allows to choose between old (seg_max = 126 always) and new (seg_max = queue_size - 2) behaviors. Not to change the behavior of the older VMs, prevent setting the default seg_max_adjust value for older machine types. Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 9 - hw/core/machine.c | 3 +++ hw/scsi/vhost-scsi.c| 2 ++ hw/scsi/virtio-scsi.c | 10 +- include/hw/virtio/virtio-blk.h | 1 + include/hw/virtio/virtio-scsi.h | 1 + 6 files changed, 24 insertions(+), 2 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index d62e6377c2..0f6f8113b7 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -908,7 +908,8 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) blk_get_geometry(s->blk, &capacity); memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); -virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2); +virtio_stl_p(vdev, &blkcfg.seg_max, + s->conf.seg_max_adjust ? 
s->conf.queue_size - 2 : 128 - 2); virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls); virtio_stl_p(vdev, &blkcfg.blk_size, blk_size); virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size); @@ -1133,6 +1134,11 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) error_setg(errp, "num-queues property must be larger than 0"); return; } +if (conf->queue_size <= 2) { +error_setg(errp, "invalid queue-size property (%" PRIu16 "), " + "must be > 2", conf->queue_size); +return; +} if (!is_power_of_2(conf->queue_size) || conf->queue_size > VIRTQUEUE_MAX_SIZE) { error_setg(errp, "invalid queue-size property (%" PRIu16 "), " @@ -1262,6 +1268,7 @@ static Property virtio_blk_properties[] = { true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), +DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features, diff --git a/hw/core/machine.c b/hw/core/machine.c index 023548b4f3..bfa320387e 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -29,6 +29,9 @@ GlobalProperty hw_compat_4_2[] = { { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, +{ "virtio-blk-device", "seg-max-adjust", "off"}, +{ "virtio-scsi-device", "seg_max_adjust", "off"}, +{ "vhost-blk-device", "seg_max_adjust", "off"}, }; const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2); diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index c693fc748a..26f710d3ec 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -275,6 +275,8 @@ static Property vhost_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size, 128), +DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSICommon, conf.seg_max_adjust, + true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors, 0x), DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSICommon, conf.cmd_per_lun, 128), diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index e8b2b64d09..405cb6c953 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -654,7 +654,8 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
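In short, with the hunks above the guest-visible value becomes the following (hypothetical helper, not part of the patch, only mirroring the code it quotes):

/* Hypothetical helper mirroring the hunk above: the guest-visible
 * seg_max for a given queue-size / seg-max-adjust combination. */
static unsigned effective_seg_max(unsigned queue_size, bool seg_max_adjust)
{
    return seg_max_adjust ? queue_size - 2 : 128 - 2;
}
/* queue-size=256, seg-max-adjust=on  -> 254
 * queue-size=256, seg-max-adjust=off -> 126 (hw_compat_4_2 machine types)
 * queue-size=4 is the smallest legal value now that queue_size <= 2 is
 * rejected, giving seg_max = 2. */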
Re: [PATCH v0 2/2] block: allow to set 'drive' property on a realized block device
On 13.12.2019 13:32, Kevin Wolf wrote: > Am 18.11.2019 um 11:50 hat Denis Plotnikov geschrieben: >> >> On 10.11.2019 22:08, Denis Plotnikov wrote: >>> On 10.11.2019 22:03, Denis Plotnikov wrote: >>>> This allows to change (replace) the file on a block device and is useful >>>> to workaround exclusive file access restrictions, e.g. to implement VM >>>> migration with a shared disk stored on some storage with the exclusive >>>> file opening model: a destination VM is started waiting for incomming >>>> migration with a fake image drive, and later, on the last migration >>>> phase, the fake image file is replaced with the real one. >>>> >>>> Signed-off-by: Denis Plotnikov >>>> --- >>>> hw/core/qdev-properties-system.c | 89 +++- >>>> 1 file changed, 77 insertions(+), 12 deletions(-) >>>> >>>> diff --git a/hw/core/qdev-properties-system.c >>>> b/hw/core/qdev-properties-system.c >>>> index c534590dcd..aaab1370a4 100644 >>>> --- a/hw/core/qdev-properties-system.c >>>> +++ b/hw/core/qdev-properties-system.c >>>> @@ -79,8 +79,55 @@ static void set_pointer(Object *obj, Visitor *v, >>>> Property *prop, >>>> /* --- drive --- */ >>>> -static void do_parse_drive(DeviceState *dev, const char *str, void >>>> **ptr, >>>> - const char *propname, bool iothread, >>>> Error **errp) >>>> +static void do_parse_drive_realized(DeviceState *dev, const char *str, >>>> + void **ptr, const char *propname, >>>> + bool iothread, Error **errp) >>>> +{ >>>> + BlockBackend *blk = *ptr; >>>> + BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL); >>>> + int ret; >>>> + bool blk_created = false; >>>> + >>>> + if (!bs) { >>>> + error_setg(errp, "Can't find blockdev '%s'", str); >>>> + return; >>>> + } >>>> + >>>> + if (!blk) { >>>> + AioContext *ctx = iothread ? bdrv_get_aio_context(bs) : >>>> + qemu_get_aio_context(); >>>> + blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL); >>>> + blk_created = true; >>> Actually, I have concerns about situation where blk=null. >>> >>> Is there any case when scsi-hd (or others) doesn't have a blk assigned >>> and it's legal? > No, block devices will always have a BlockBackend, even if it doesn't > have a root node inserted. > >>>> + } else { >>>> + if (blk_bs(blk)) { >>>> + blk_remove_bs(blk); >>>> + } >>>> + } >>>> + >>>> + ret = blk_insert_bs(blk, bs, errp); >>>> + >>>> + if (!ret && blk_created) { >>>> + if (blk_attach_dev(blk, dev) < 0) { >>>> + /* >>>> + * Shouldn't be any errors here since we just created >>>> + * the new blk because the device doesn't have any. >>>> + * Leave the message here in case blk_attach_dev is changed >>>> + */ >>>> + error_setg(errp, "Can't attach drive '%s' to device '%s'", >>>> + str, object_get_typename(OBJECT(dev))); >>>> + } else { >>>> + *ptr = blk; >>>> + } >>>> + } >> Another problem here, is that the "size" of the device dev may not match >> after setting a drive. >> So, we should update it after the drive setting. >> It was found, that it could be done by calling >> BlockDevOps.bdrv_parent_cb_resize. >> >> But I have some concerns about doing it so. In the case of virtio scsi >> disk we have the following callstack >> >> bdrv_parent_cb_resize calls() -> >> scsi_device_report_change(dev, SENSE_CODE(CAPACITY_CHANGED)) -> >> virtio_scsi_change -> >> virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_PARAM_CHANGE, >> sense.asc | >> (sense.ascq << 8)); > I think the safest option for now (and which should solve the case you > want to address) is checking whether old and new size match and > returning an error otherwise. 
> >> virtio_scsi_change pushes the event to the guest to make the guest >> ask for size refreshing. If I'm not mistaken, here we can get a race >> condition when some another request is processed with an unchanged >> size and then the size changing request is processed. > I think this is actually a problem even without resizing: We need to > quiesce the device between removing the old root and inserting the new > one. They way to achieve this is probably by splitting blk_drain() into > a blk_drain_begin()/end() and then draining the BlockBackend here while > we're working on it. > > Kevin Why don't we use bdrv_drained_begin/end directly? This is what blk_drain does. If we want to split blk_drain we must keep track if blk's brdv isn't change otherwise we can end up with drain_begin one and drain end another bdrv if we do remove/insert in between. Another thing is should we really care about this if we have VM stopped and the sizes matched? Denis >
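A minimal sketch of the size check suggested above, assuming it sits in do_parse_drive_realized() from the patch before the new root is inserted (variable names follow the quoted hunk):

int64_t old_len = blk_getlength(blk);   /* negative if there is no medium yet */
int64_t new_len = bdrv_getlength(bs);

if (old_len >= 0 && new_len >= 0 && old_len != new_len) {
    error_setg(errp, "Node '%s' has a different size than the current medium "
               "(%" PRId64 " != %" PRId64 ")", str, new_len, old_len);
    return;
}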
Re: [PATCH v0 2/2] block: allow to set 'drive' property on a realized block device
On 16.12.2019 18:38, Kevin Wolf wrote: > Am 16.12.2019 um 15:51 hat Denis Plotnikov geschrieben: >> On 13.12.2019 13:32, Kevin Wolf wrote: >>> Am 18.11.2019 um 11:50 hat Denis Plotnikov geschrieben: >>>> Another problem here, is that the "size" of the device dev may not match >>>> after setting a drive. >>>> So, we should update it after the drive setting. >>>> It was found, that it could be done by calling >>>> BlockDevOps.bdrv_parent_cb_resize. >>>> >>>> But I have some concerns about doing it so. In the case of virtio scsi >>>> disk we have the following callstack >>>> >>>> bdrv_parent_cb_resize calls() -> >>>> scsi_device_report_change(dev, SENSE_CODE(CAPACITY_CHANGED)) -> >>>> virtio_scsi_change -> >>>> virtio_scsi_push_event(s, dev, >>>> VIRTIO_SCSI_T_PARAM_CHANGE, >>>> sense.asc | >>>> (sense.ascq << 8)); >>> I think the safest option for now (and which should solve the case you >>> want to address) is checking whether old and new size match and >>> returning an error otherwise. >>> >>>> virtio_scsi_change pushes the event to the guest to make the guest >>>> ask for size refreshing. If I'm not mistaken, here we can get a race >>>> condition when some another request is processed with an unchanged >>>> size and then the size changing request is processed. >>> I think this is actually a problem even without resizing: We need to >>> quiesce the device between removing the old root and inserting the new >>> one. They way to achieve this is probably by splitting blk_drain() into >>> a blk_drain_begin()/end() and then draining the BlockBackend here while >>> we're working on it. >>> >>> Kevin >> Why don't we use bdrv_drained_begin/end directly? This is what >> blk_drain does. >> If we want to split blk_drain we must keep track if blk's brdv isn't >> change otherwise we can end up with drain_begin one and drain end >> another bdrv if we do remove/insert in between. > Hmm, true, we would have to keep track of draining at the BlockBackend > level and consider it in blk_remove_bs() and blk_insert_bs(). Maybe > that's not worth it. > > If we use bdrv_drained_begin/end directly, I think we need to drain both > the old and the new root node during the process. > >> Another thing is should we really care about this if we have VM >> stopped and the sizes matched? > How do we know that the VM is stopped? And why would we require this? I implied the scenario of VM migration over a shared storage with an exclusive file access model. The VM is stopped on drive changing phase. If there is no use to require it, than ok. Denis > Your patch doesn't implement or at least check this, and it seems a bit > impractical for example when all you want is inserting a filter node. > > Kevin
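A rough sketch of the "drain both nodes" variant being discussed, assuming it wraps the blk_remove_bs()/blk_insert_bs() sequence from the patch (untested, for discussion only):

BlockDriverState *old_bs = blk_bs(blk);   /* may be NULL */

if (old_bs) {
    bdrv_ref(old_bs);                     /* keep it alive until undrained */
    bdrv_drained_begin(old_bs);
}
bdrv_drained_begin(bs);                   /* the new root node */

if (old_bs) {
    blk_remove_bs(blk);
}
ret = blk_insert_bs(blk, bs, errp);

bdrv_drained_end(bs);
if (old_bs) {
    bdrv_drained_end(old_bs);
    bdrv_unref(old_bs);
}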
Re: [PATCH v0 1/2] qdev-properties-system: extend set_pointer for unrealized devices
On 18.11.2019 21:54, Eduardo Habkost wrote: > On Sun, Nov 10, 2019 at 10:03:09PM +0300, Denis Plotnikov wrote: >> Some device's property can be changed if the device has been already >> realized. For example, it could be "drive" property of a scsi disk device. >> >> So far, set_pointer could operate only on a relized device. The patch >> extends its interface for operation on an unrealized device. >> >> Signed-off-by: Denis Plotnikov >> --- >> hw/core/qdev-properties-system.c | 32 +--- >> 1 file changed, 21 insertions(+), 11 deletions(-) >> >> diff --git a/hw/core/qdev-properties-system.c >> b/hw/core/qdev-properties-system.c >> index ba412dd2ca..c534590dcd 100644 >> --- a/hw/core/qdev-properties-system.c >> +++ b/hw/core/qdev-properties-system.c >> @@ -38,9 +38,14 @@ static void get_pointer(Object *obj, Visitor *v, Property >> *prop, >> } >> >> static void set_pointer(Object *obj, Visitor *v, Property *prop, >> -void (*parse)(DeviceState *dev, const char *str, >> - void **ptr, const char *propname, >> - Error **errp), >> +void (*parse_realized)(DeviceState *dev, >> + const char *str, void **ptr, >> + const char *propname, >> + Error **errp), >> +void (*parse_unrealized)(DeviceState *dev, >> + const char *str, void >> **ptr, >> + const char *propname, >> + Error **errp), >> const char *name, Error **errp) > Wouldn't it be simpler to just add a PropertyInfo::allow_set_after_realize > bool field, and call the same setter function? Then you can > simply change do_parse_drive() to check if realized is true. May be, but I thought It would be more clear to have a separate callback for all the devices supporting the property setting when realized. Also the "drive" property setting on realized and non-realized device a little bit different: in the realized case the setter function expects to get BlockDriverState only, when in the unrealized case the setter can accept both BlockBackend and BlockDriverState. Also, in the unrealized case the setter function doesn't expect to have a device with an empty BlockBackend. I decided that extending do_parse_drive would make it more complex for understanding. That's why I made two separate functions for both cases. I'd like to mention that I have a few concerns about do_parse_drive_realized (please see the next patch from the series) and I'd like them to be reviewed as well. After that, may be it would be better to go the way you suggested. Thanks for reviewing! 
Denis > >> { >> DeviceState *dev = DEVICE(obj); >> @@ -48,11 +53,6 @@ static void set_pointer(Object *obj, Visitor *v, Property >> *prop, >> void **ptr = qdev_get_prop_ptr(dev, prop); >> char *str; >> >> -if (dev->realized) { >> -qdev_prop_set_after_realize(dev, name, errp); >> -return; >> -} >> - >> visit_type_str(v, name, &str, &local_err); >> if (local_err) { >> error_propagate(errp, local_err); >> @@ -63,7 +63,17 @@ static void set_pointer(Object *obj, Visitor *v, Property >> *prop, >> *ptr = NULL; >> return; >> } >> -parse(dev, str, ptr, prop->name, errp); >> + >> +if (dev->realized) { >> +if (parse_realized) { >> +parse_realized(dev, str, ptr, prop->name, errp); >> +} else { >> +qdev_prop_set_after_realize(dev, name, errp); >> +} >> +} else { >> +parse_unrealized(dev, str, ptr, prop->name, errp); >> +} >> + >> g_free(str); >> } >> >> @@ -178,13 +188,13 @@ static void get_drive(Object *obj, Visitor *v, const >> char *name, void *opaque, >> static void set_drive(Object *obj, Visitor *v, const char *name, void >> *opaque, >> Error **errp) >> { >> -set_pointer(obj, v, opaque, parse_drive, name, errp); >> +set_pointer(obj, v, opaque, NULL, parse_drive, name, errp); >> } >> >> static void set_drive_iothread(Object *obj, Visitor *v, const char *name, >> void *opaque, Error **errp) >> { >> -set_pointer(obj, v, opaque, parse_drive_iothread, name, errp); >> +set_pointer(obj, v, opaque, NULL, parse_drive_iothread, name, errp); >> } >> >> const PropertyInfo qdev_prop_drive = { >> -- >> 2.17.0 >>
Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839
On 06.11.2019 15:03, Michael S. Tsirkin wrote: > On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote: >> On 10/24/19 12:28 AM, Michael S. Tsirkin wrote: >>> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote: >>>> From: "Denis V. Lunev" >>>> >>>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg >>>> field reported by SCSI controler. Thus typical sequential read with >>>> 1 MB size results in the following pattern of the IO from the guest: >>>>8,16 115754 2.766095122 2071 D R 2095104 + 1008 [dd] >>>>8,16 115755 2.766108785 2071 D R 2096112 + 1008 [dd] >>>>8,16 115756 2.766113486 2071 D R 2097120 + 32 [dd] >>>>8,16 115757 2.767668961 0 C R 2095104 + 1008 [0] >>>>8,16 115758 2.768534315 0 C R 2096112 + 1008 [0] >>>>8,16 115759 2.768539782 0 C R 2097120 + 32 [0] >>>> The IO was generated by >>>>dd if=/dev/sda of=/dev/null bs=1024 iflag=direct >>>> >>>> This effectively means that on rotational disks we will observe 3 IOPS >>>> for each 2 MBs processed. This definitely negatively affects both >>>> guest and host IO performance. >>>> >>>> The cure is relatively simple - we should report lengthy scatter-gather >>>> ability of the SCSI controller. Fortunately the situation here is very >>>> good. VirtIO transport layer can accomodate 1024 items in one request >>>> while we are using only 128. This situation is present since almost >>>> very beginning. 2 items are dedicated for request metadata thus we >>>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg. >>>> >>>> The following pattern is observed after the patch: >>>>8,16 1 9921 2.662721340 2063 D R 2095104 + 1024 [dd] >>>>8,16 1 9922 2.662737585 2063 D R 2096128 + 1024 [dd] >>>>8,16 1 9923 2.665188167 0 C R 2095104 + 1024 [0] >>>>8,16 1 9924 2.665198777 0 C R 2096128 + 1024 [0] >>>> which is much better. >>>> >>>> The dark side of this patch is that we are tweaking guest visible >>>> parameter, though this should be relatively safe as above transport >>>> layer support is present in QEMU/host Linux for a very long time. >>>> The patch adds configurable property for VirtIO SCSI with a new default >>>> and hardcode option for VirtBlock which does not provide good >>>> configurable framework. >>>> >>>> Unfortunately the commit can not be applied as is. For the real cure we >>>> need guest to be fixed to accomodate that queue length, which is done >>>> only in the latest 4.14 kernel. Thus we are going to expose the property >>>> and tweak it on machine type level. >>>> >>>> The problem with the old kernels is that they have >>>> max_segments <= virtqueue_size restriction which cause the guest >>>> crashing in the case of violation. >>> This isn't just in the guests: virtio spec also seems to imply this, >>> or at least be vague on this point. >>> >>> So I think it'll need a feature bit. >>> Doing that in a safe way will also allow being compatible with old guests. >>> >>> The only downside is it's a bit more work as we need to >>> spec this out and add guest support. >>> >>>> To fix the case described above in the old kernels we can increase >>>> virtqueue_size to 256 and max_segments to 254. The pitfall here is >>>> that seabios allows the virtqueue_size-s < 128, however, the seabios >>>> patch extending that value to 256 is pending. >>> And the fix here is just to limit large vq size to virtio 1.0. 
>>> In that mode it's fine I think: >>> >>> >>> /* check if the queue is available */ >>> if (vp->use_modern) { >>> num = vp_read(&vp->common, virtio_pci_common_cfg, queue_size); >>> if (num > MAX_QUEUE_NUM) { >>> vp_write(&vp->common, virtio_pci_common_cfg, queue_size, >>> MAX_QUEUE_NUM); >>> num = vp_read(&vp->common, virtio_pci_common_cfg, queue_size); >>> } >>> } else { >>> num = vp_read(&vp->legacy, virtio_pci_legacy, queue_num); >>>
Re: [PATCH v0 1/2] qdev-properties-system: extend set_pointer for unrealized devices
On 25.11.2019 18:30, Eduardo Habkost wrote: > On Fri, Nov 22, 2019 at 11:36:30AM +0000, Denis Plotnikov wrote: >> >> On 18.11.2019 21:54, Eduardo Habkost wrote: >>> On Sun, Nov 10, 2019 at 10:03:09PM +0300, Denis Plotnikov wrote: >>>> Some device's property can be changed if the device has been already >>>> realized. For example, it could be "drive" property of a scsi disk device. >>>> >>>> So far, set_pointer could operate only on a relized device. The patch >>>> extends its interface for operation on an unrealized device. >>>> >>>> Signed-off-by: Denis Plotnikov >>>> --- >>>>hw/core/qdev-properties-system.c | 32 +--- >>>>1 file changed, 21 insertions(+), 11 deletions(-) >>>> >>>> diff --git a/hw/core/qdev-properties-system.c >>>> b/hw/core/qdev-properties-system.c >>>> index ba412dd2ca..c534590dcd 100644 >>>> --- a/hw/core/qdev-properties-system.c >>>> +++ b/hw/core/qdev-properties-system.c >>>> @@ -38,9 +38,14 @@ static void get_pointer(Object *obj, Visitor *v, >>>> Property *prop, >>>>} >>>> >>>>static void set_pointer(Object *obj, Visitor *v, Property *prop, >>>> -void (*parse)(DeviceState *dev, const char *str, >>>> - void **ptr, const char *propname, >>>> - Error **errp), >>>> +void (*parse_realized)(DeviceState *dev, >>>> + const char *str, void >>>> **ptr, >>>> + const char *propname, >>>> + Error **errp), >>>> +void (*parse_unrealized)(DeviceState *dev, >>>> + const char *str, void >>>> **ptr, >>>> + const char *propname, >>>> + Error **errp), >>>>const char *name, Error **errp) >>> Wouldn't it be simpler to just add a PropertyInfo::allow_set_after_realize >>> bool field, and call the same setter function? Then you can >>> simply change do_parse_drive() to check if realized is true. >> May be, but I thought It would be more clear to have a separate callback >> for all the devices supporting the property setting when realized. >> Also the "drive" property setting on realized and non-realized device a >> little bit different: in the realized case the setter function expects >> to get >> BlockDriverState only, when in the unrealized case the setter can accept >> both BlockBackend and BlockDriverState. Also, in the unrealized case the >> setter function doesn't expect to have a device with an empty BlockBackend. >> I decided that extending do_parse_drive would make it more complex for >> understanding. That's why I made two separate functions for both cases. > I understand you might want two separate functions in the > specific case of drive. You can still call different > functions after checking dev->realized inside do_parse_drive(). > > My point was that you don't need to make set_pointer() require > two separate function pointers just to propagate 1 bit of > information that is already available in DeviceState. In patch > 2/2 you had to create 4 different copies of parse_drive*() > because of this. Yes, that's true. I wanted to suggest a more general way to deal with a device on realized and non-realized state. I may be too much and not necessary. May be we should wait for a feedback from the block maintainers? > > >> I'd like to mention that I have a few concerns about >> do_parse_drive_realized (please see the next patch from the series) and >> I'd like them to be reviewed as well. After that, may be it would be >> better to go the way you suggested. > In the case if your questions in patch 2/2, I'm afraid I don't > know the answers and we need help from the block maintainers. Anyway, thanks for taking a glance. >
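A sketch of the flag-based alternative suggested above; the field does not exist yet, and its name is taken from the review comment:

/* hypothetical addition to struct PropertyInfo */
bool allow_set_after_realize;

/* set_pointer() then keeps a single parse callback: */
if (dev->realized && !prop->info->allow_set_after_realize) {
    qdev_prop_set_after_realize(dev, name, errp);
    return;
}
/* ... */
parse(dev, str, ptr, prop->name, errp);
/* do_parse_drive() would branch on dev->realized internally where the
 * realized and unrealized paths need to differ. */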
[Qemu-devel] [PATCH v3 1/3] qcow2: introduce compression type feature
The patch adds some preparation parts for incompatible compression type feature to QCOW2 header that indicates that *all* compressed clusters must be (de)compressed using a certain compression type. It is implied that the compression type is set on the image creation and can be changed only later by image conversion, thus compression type defines the only compression algorithm used for the image. The goal of the feature is to add support of other compression algorithms to qcow2. For example, ZSTD which is more effective on compression than ZLIB. It works roughly 2x faster than ZLIB providing a comparable compression ratio and therefore provides a performance advantage in backup scenarios. The default compression is ZLIB. Images created with ZLIB compression type are backward compatible with older qemu versions. Signed-off-by: Denis Plotnikov --- block/qcow2.c | 94 +++ block/qcow2.h | 26 --- docs/interop/qcow2.txt| 19 +++- include/block/block_int.h | 1 + qapi/block-core.json | 22 - 5 files changed, 152 insertions(+), 10 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 039bdc2f7e..4e07b7e9ec 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1197,6 +1197,32 @@ static int qcow2_update_options(BlockDriverState *bs, QDict *options, return ret; } +static int check_compression_type(BDRVQcow2State *s, Error **errp) +{ +switch (s->compression_type) { +case QCOW2_COMPRESSION_TYPE_ZLIB: +break; + +default: +error_setg(errp, "qcow2: unknown compression type: %u", + s->compression_type); +return -ENOTSUP; +} + +/* + * if the compression type differs from QCOW2_COMPRESSION_TYPE_ZLIB + * the incompatible feature flag must be set + */ + +if (s->compression_type != QCOW2_COMPRESSION_TYPE_ZLIB && +!(s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION_TYPE)) { +error_setg(errp, "qcow2: Invalid compression type setting"); +return -EINVAL; +} + +return 0; +} + /* Called with s->lock held. */ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) @@ -1312,6 +1338,35 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options, s->compatible_features = header.compatible_features; s->autoclear_features = header.autoclear_features; +/* + * Handle compression type + * Older qcow2 images don't contain the compression type header. 
+ * Distinguish them by the header length and use + * the only valid (default) compression type in that case + */ +if (header.header_length > offsetof(QCowHeader, compression_type)) { +/* sanity check that we can read a compression type */ +size_t min_len = offsetof(QCowHeader, compression_type) + + sizeof(header.compression_type); +if (header.header_length < min_len) { +error_setg(errp, + "Could not read compression type, " + "qcow2 header is too short"); +ret = -EINVAL; +goto fail; +} + +header.compression_type = be32_to_cpu(header.compression_type); +s->compression_type = header.compression_type; +} else { +s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB; +} + +ret = check_compression_type(s, errp); +if (ret) { +goto fail; +} + if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { void *feature_table = NULL; qcow2_read_extensions(bs, header.header_length, ext_end, @@ -2516,6 +2571,12 @@ int qcow2_update_header(BlockDriverState *bs) total_size = bs->total_sectors * BDRV_SECTOR_SIZE; refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); +ret = check_compression_type(s, NULL); + +if (ret) { +goto fail; +} + *header = (QCowHeader) { /* Version 2 fields */ .magic = cpu_to_be32(QCOW_MAGIC), @@ -2538,6 +2599,7 @@ int qcow2_update_header(BlockDriverState *bs) .autoclear_features = cpu_to_be64(s->autoclear_features), .refcount_order = cpu_to_be32(s->refcount_order), .header_length = cpu_to_be32(header_length), +.compression_type = cpu_to_be32(s->compression_type), }; /* For older versions, write a shorter header */ @@ -2635,6 +2697,11 @@ int qcow2_update_header(BlockDriverState *bs) .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, .name = "lazy refcounts", }, +{ +.type = QCOW2_FEAT_TYPE_INCOMPATIBLE, +.bit = QCOW2_INCOMPAT_C
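Concretely, with the field layout of this patch version the detection boils down to a header-length comparison (a sketch only, assuming the standard 104-byte v3 header with the new 32-bit field appended at byte 104):

#include <stdbool.h>
#include <stdint.h>

#define QCOW2_V3_MIN_HEADER_LEN  104u   /* ends right after header_length */

/* illustration of the rule implemented by qcow2_do_open() above */
static bool qcow2_has_compression_type(uint32_t header_length)
{
    /* header_length == 104: old image, field absent, zlib implied
     * header_length >= 108: this patch, 32-bit compression_type present */
    return header_length > QCOW2_V3_MIN_HEADER_LEN;
}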
[Qemu-devel] [PATCH v3 0/3] qcow2: add zstd cluster compression
v3: * relax the compression type setting requirement when the compression type is not zlib [Eric, Kevin] * add compression type values to the spec [Eric] * fix wording in the spec and descriptions [Eric] * fix functions descriptions [Max] * fix zstd (de)compression functions flaws [Max] * fix zstd related parts of configure file [Max] * rebased to v4.1.0-rc5 and chenged the series version aiming to 4.2 v2: * relax the compression type setting restriction in the spec * fix qcow2 header size checking * fix error processing and messaging * fix qcow2 image specific info reporting * set Qcow2CompressionType zstd config dependant * add zstd compressed cluster format description to the spec v1: * extend qcow2 header instead of adding a new incompatible extension header specification re-written accordingly * enable zstd compression via config * fix zstd (de)compression functions * fix comments/description * fix function naming --- The goal of the patch-set is to enable qcow2 to use zstd compression for clusters. ZSTD provides better (de)compression performance than currently used ZLIB. Using it will improve perforamnce (reduce compression time) when the compressed clusters is used, e.g backup scenarios. Also, the patch-set extends qcow2 specification by adding compression_type feature. The feature enables adding ZSTD and another compression algorithms in the future. Here is some measurements ZSTD vs ZLIB: The test: Test compresses and decompresses qemu qcow2 image with just installed rhel-7.6 guest. Image cluster size: 64K. Image on disk size: 2.2G The test was conducted with brd disk to reduce the influence of disk subsystem to the test results. The results is given in seconds. compress cmd: time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd] src.img [zlib|zstd]_compressed.img decompress cmd time ./qemu-img convert -O qcow2 [zlib|zstd]_compressed.img uncompressed.img The results: compression decompression zlib zstd zlib zstd real 65.5 16.3 (-75 %) 1.9 1.6 (-16 %) user 65.0 15.8 5.3 2.5 sys 3.3 0.2 2.0 2.0 Both ZLIB and ZSTD gave the same compression ratio: ~1.5 compressed image size in both cases: ~1.4G Denis Plotnikov (3): qcow2: introduce compression type feature qcow2: rework the cluster compression routine qcow2: add zstd cluster compression block/qcow2-threads.c | 172 ++ block/qcow2.c | 100 ++ block/qcow2.h | 26 -- configure | 34 docs/interop/qcow2.txt| 39 - include/block/block_int.h | 1 + qapi/block-core.json | 23 - 7 files changed, 371 insertions(+), 24 deletions(-) -- 2.17.0
[Qemu-devel] [PATCH v3 3/3] qcow2: add zstd cluster compression
zstd significantly reduces cluster compression time. It provides better compression performance maintaining the same level of compression ratio in comparison with zlib, which, at the moment, has been the only compression method available. The performance test results: Test compresses and decompresses qemu qcow2 image with just installed rhel-7.6 guest. Image cluster size: 64K. Image on disk size: 2.2G The test was conducted with brd disk to reduce the influence of disk subsystem to the test results. The results is given in seconds. compress cmd: time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd] src.img [zlib|zstd]_compressed.img decompress cmd time ./qemu-img convert -O qcow2 [zlib|zstd]_compressed.img uncompressed.img compression decompression zlib zstd zlib zstd real 65.5 16.3 (-75 %)1.9 1.6 (-16 %) user 65.0 15.85.3 2.5 sys 3.30.22.0 2.0 Both ZLIB and ZSTD gave the same compression ratio: 1.57 compressed image size in both cases: 1.4G Signed-off-by: Denis Plotnikov --- block/qcow2-threads.c | 94 ++ block/qcow2.c | 6 +++ configure | 34 +++ docs/interop/qcow2.txt | 20 + qapi/block-core.json | 3 +- 5 files changed, 156 insertions(+), 1 deletion(-) diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 14b5bd76fb..85d04e6c2e 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -28,6 +28,11 @@ #define ZLIB_CONST #include +#ifdef CONFIG_ZSTD +#include +#include +#endif + #include "qcow2.h" #include "block/thread-pool.h" #include "crypto.h" @@ -165,6 +170,85 @@ static ssize_t qcow2_zlib_decompress(void *dest, size_t dest_size, return ret; } +#ifdef CONFIG_ZSTD +/* + * qcow2_zstd_compress() + * + * Compress @src_size bytes of data using zstd compression method + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: compressed size on success + * -ENOMEM destination buffer is not enough to store compressed data + * -EIOon any other error + */ + +static ssize_t qcow2_zstd_compress(void *dest, size_t dest_size, + const void *src, size_t src_size) +{ +ssize_t ret; +uint32_t *c_size = dest; +/* steal some bytes to store compressed chunk size */ +char *d_buf = ((char *) dest) + sizeof(*c_size); + +if (dest_size < sizeof(*c_size)) { +return -ENOMEM; +} + +dest_size -= sizeof(*c_size); + +ret = ZSTD_compress(d_buf, dest_size, src, src_size, 5); + +if (ZSTD_isError(ret)) { +if (ZSTD_getErrorCode(ret) == ZSTD_error_dstSize_tooSmall) { +return -ENOMEM; +} else { +return -EIO; +} +} + +/* store the compressed chunk size in the very beginning of the buffer */ +*c_size = ret; + +return ret + sizeof(*c_size); +} + +/* + * qcow2_zstd_decompress() + * + * Decompress some data (not more than @src_size bytes) to produce exactly + * @dest_size bytes using zstd compression method + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: 0 on success + * -EIO on any error + */ + +static ssize_t qcow2_zstd_decompress(void *dest, size_t dest_size, + const void *src, size_t src_size) +{ +ssize_t ret; +/* + * zstd decompress wants to know the exact length of the data + * for that purpose, on the compression the length is stored in + * the very beginning of the compressed buffer + */ +const uint32_t *s_size = src; +const char *s_buf = ((const char *) src) + sizeof(*s_size); + +ret = ZSTD_decompress(dest, dest_size, s_buf, *s_size); + +if (ZSTD_isError(ret)) { +return -EIO; +} + +return 0; +} +#endif + static int qcow2_compress_pool_func(void *opaque) { Qcow2CompressData 
*data = opaque; @@ -216,6 +300,11 @@ qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, fn = qcow2_zlib_compress; break; +#ifdef CONFIG_ZSTD +case QCOW2_COMPRESSION_TYPE_ZSTD: +fn = qcow2_zstd_compress; +break; +#endif default: return -ENOTSUP; } @@ -248,6 +337,11 @@ qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, fn = qcow2_zlib_decompress; break; +#ifdef CONFIG_ZSTD +case QCOW2_COMPRESSION_TYPE_ZSTD: +fn = qcow2_zstd_decompress; +break; +#endif default: return -ENOTSUP; } diff --git a/block/qcow2.c b/block/qcow2.c index 4e07b7e9ec..dfb7b52033 100644 --- a/block/qcow2.c +++ b/block/qcow2.c
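Since zstd, unlike zlib, needs to know the exact length of the compressed input, the patch steals the first four bytes of each compressed cluster for a length prefix. A small standalone program (illustration only, not QEMU code; compression level 5 as in the patch) shows the same framing round-trip:

/* build with: cc zstd_frame_demo.c -lzstd */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <zstd.h>

int main(void)
{
    char src[4096];
    memset(src, 'A', sizeof(src));                 /* compressible payload */

    size_t bound = ZSTD_compressBound(sizeof(src));
    char *frame = malloc(4 + bound);
    uint32_t *c_size = (uint32_t *)frame;          /* length prefix        */

    size_t n = ZSTD_compress(frame + 4, bound, src, sizeof(src), 5);
    if (ZSTD_isError(n)) {
        fprintf(stderr, "compress: %s\n", ZSTD_getErrorName(n));
        return 1;
    }
    *c_size = n;

    char out[4096];
    size_t d = ZSTD_decompress(out, sizeof(out), frame + 4, *c_size);
    if (ZSTD_isError(d) || d != sizeof(src) || memcmp(src, out, d) != 0) {
        fprintf(stderr, "round trip failed\n");
        return 1;
    }
    printf("4096 -> %zu compressed bytes (+4 byte length prefix)\n", n);
    free(frame);
    return 0;
}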
[Qemu-devel] [PATCH v3 2/3] qcow2: rework the cluster compression routine
The patch allow to process image compression type defined in the image header and choose an appropriate method for image clusters (de)compression. Signed-off-by: Denis Plotnikov --- block/qcow2-threads.c | 78 +++ 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 3b1e63fe41..14b5bd76fb 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -73,8 +73,11 @@ typedef struct Qcow2CompressData { Qcow2CompressFunc func; } Qcow2CompressData; + /* - * qcow2_compress() + * qcow2_zlib_compress() + * + * Compress @src_size bytes of data using zlib compression method * * @dest - destination buffer, @dest_size bytes * @src - source buffer, @src_size bytes @@ -83,8 +86,8 @@ typedef struct Qcow2CompressData { * -ENOMEM destination buffer is not enough to store compressed data * -EIOon any other error */ -static ssize_t qcow2_compress(void *dest, size_t dest_size, - const void *src, size_t src_size) +static ssize_t qcow2_zlib_compress(void *dest, size_t dest_size, + const void *src, size_t src_size) { ssize_t ret; z_stream strm; @@ -119,19 +122,19 @@ static ssize_t qcow2_compress(void *dest, size_t dest_size, } /* - * qcow2_decompress() + * qcow2_zlib_decompress() * * Decompress some data (not more than @src_size bytes) to produce exactly - * @dest_size bytes. + * @dest_size bytes using zlib compression method * * @dest - destination buffer, @dest_size bytes * @src - source buffer, @src_size bytes * * Returns: 0 on success - * -1 on fail + * -EIO on fail */ -static ssize_t qcow2_decompress(void *dest, size_t dest_size, -const void *src, size_t src_size) +static ssize_t qcow2_zlib_decompress(void *dest, size_t dest_size, + const void *src, size_t src_size) { int ret = 0; z_stream strm; @@ -144,7 +147,7 @@ static ssize_t qcow2_decompress(void *dest, size_t dest_size, ret = inflateInit2(&strm, -12); if (ret != Z_OK) { -return -1; +return -EIO; } ret = inflate(&strm, Z_FINISH); @@ -154,7 +157,7 @@ static ssize_t qcow2_decompress(void *dest, size_t dest_size, * @src buffer may be processed partly (because in qcow2 we know size of * compressed data with precision of one sector) */ -ret = -1; +ret = -EIO; } inflateEnd(&strm); @@ -189,20 +192,67 @@ qcow2_co_do_compress(BlockDriverState *bs, void *dest, size_t dest_size, return arg.ret; } +/* + * qcow2_co_compress() + * + * Compress @src_size bytes of data using the compression + * method defined by the image compression type + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: 0 on success + * a negative error code on fail + */ ssize_t coroutine_fn qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size, const void *src, size_t src_size) { -return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, -qcow2_compress); +BDRVQcow2State *s = bs->opaque; +Qcow2CompressFunc fn; + +switch (s->compression_type) { +case QCOW2_COMPRESSION_TYPE_ZLIB: +fn = qcow2_zlib_compress; +break; + +default: +return -ENOTSUP; +} + +return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, fn); } +/* + * qcow2_co_decompress() + * + * Decompress some data (not more than @src_size bytes) to produce exactly + * @dest_size bytes using the compression method defined by the image + * compression type + * + * @dest - destination buffer, @dest_size bytes + * @src - source buffer, @src_size bytes + * + * Returns: 0 on success + * a negative error code on fail + */ ssize_t coroutine_fn qcow2_co_decompress(BlockDriverState *bs, void *dest, 
size_t dest_size, const void *src, size_t src_size) { -return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, -qcow2_decompress); +BDRVQcow2State *s = bs->opaque; +Qcow2CompressFunc fn; + +switch (s->compression_type) { +case QCOW2_COMPRESSION_TYPE_ZLIB: +fn = qcow2_zlib_decompress; +break; + +default: +return -ENOTSUP; +} + +return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, fn); } -- 2.17.0
[Qemu-devel] [PATCH v9] qemu-io: add pattern file for write command
The patch allows to provide a pattern file for write command. There was no similar ability before. Signed-off-by: Denis Plotnikov --- v9: * replace flag cast to int with bool [Eric] * fix the error message [Eric] * use qemu_io_free instead of qemu_vfree [Eric] * add function description [Eric] v8: fix according to Max's comments * get rid of unnecessary buffer for the pattern * buffer allocation just in bytes * take into account the missalign offset * don't copy file name * changed char* to const char* in input params v7: * fix variable naming * make code more readable * extend help for write command v6: * the pattern file is read once to reduce io v5: * file name initiated with null to make compilers happy v4: * missing signed-off clause added v3: * missing file closing added * exclusive flags processing changed * buffer void* converted to char* to fix pointer arithmetics * file reading error processing added --- qemu-io-cmds.c | 97 ++ 1 file changed, 91 insertions(+), 6 deletions(-) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 09750a23ce..f7bdfe673b 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -351,6 +351,77 @@ static void qemu_io_free(void *p) qemu_vfree(p); } +/* + * qemu_io_alloc_from_file() + * + * Allocates the buffer and populates it with the content of the given file + * up to @len bytes. If the file length is less then @len, then the buffer + * is populated with then file content cyclically. + * + * @blk - the block backend where the buffer content is going to be written to + * @len - the buffer length + * @file_name - the file to copy the content from + * + * Returns: the buffer pointer on success + * NULL on error + */ +static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len, + const char *file_name) +{ +char *buf, *buf_origin; +FILE *f = fopen(file_name, "r"); +int pattern_len; + +if (!f) { +perror(file_name); +return NULL; +} + +if (qemuio_misalign) { +len += MISALIGN_OFFSET; +} + +buf_origin = buf = blk_blockalign(blk, len); + +if (qemuio_misalign) { +buf_origin += MISALIGN_OFFSET; +} + +pattern_len = fread(buf_origin, 1, len, f); + +if (ferror(f)) { +perror(file_name); +goto error; +} + +if (pattern_len == 0) { +fprintf(stderr, "%s: file is empty\n", file_name); +goto error; +} + +fclose(f); + +if (len > pattern_len) { +len -= pattern_len; +buf += pattern_len; + +while (len > 0) { +size_t len_to_copy = MIN(pattern_len, len); + +memcpy(buf, buf_origin, len_to_copy); + +len -= len_to_copy; +buf += len_to_copy; +} +} + +return buf_origin; + +error: +qemu_io_free(buf_origin); +return NULL; +} + static void dump_buffer(const void *buffer, int64_t offset, int64_t len) { uint64_t i; @@ -949,6 +1020,7 @@ static void write_help(void) " -n, -- with -z, don't allow slow fallback\n" " -p, -- ignored for backwards compatibility\n" " -P, -- use different pattern to fill file\n" +" -s, -- use a pattern file to fill the write buffer\n" " -C, -- report statistics in a machine parsable format\n" " -q, -- quiet mode, do not show I/O statistics\n" " -u, -- with -z, allow unmapping\n" @@ -965,7 +1037,7 @@ static const cmdinfo_t write_cmd = { .perm = BLK_PERM_WRITE, .argmin = 2, .argmax = -1, -.args = "[-bcCfnquz] [-P pattern] off len", +.args = "[-bcCfnquz] [-P pattern | -s source_file] off len", .oneline= "writes a number of bytes at a specified offset", .help = write_help, }; @@ -974,7 +1046,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv) { struct timeval t1, t2; bool Cflag = false, qflag = false, bflag = false; -bool Pflag = false, zflag = false, 
cflag = false; +bool Pflag = false, zflag = false, cflag = false, sflag = false; int flags = 0; int c, cnt, ret; char *buf = NULL; @@ -983,8 +1055,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) /* Some compilers get confused and warn if this is not initialized. */ int64_t total = 0; int pattern = 0xcd; +const char *file_name = NULL; -while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) { +while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) { switch (c) { case 'b': bflag = true; @@ -1020,6 +1093,10 @@ static int write_f(BlockBackend *blk, int argc, char **argv) case 'z': zflag = true; break; +case 's': +sf
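The core idea of qemu_io_alloc_from_file() above is to read the pattern file once and then tile it into the write buffer with memcpy(). A standalone demo of the same cyclic fill (illustration only, not QEMU code):

#include <stdio.h>
#include <string.h>

static void tile_pattern(char *buf, size_t len,
                         const char *pattern, size_t pattern_len)
{
    /* in the real code the first copy is done by fread(); here it is an
     * explicit memcpy() to keep the demo self-contained */
    size_t first = pattern_len < len ? pattern_len : len;
    memcpy(buf, pattern, first);

    char *p = buf + first;
    len -= first;

    while (len > 0) {
        size_t chunk = pattern_len < len ? pattern_len : len;
        memcpy(p, buf, chunk);     /* replicate from the start of the buffer */
        p += chunk;
        len -= chunk;
    }
}

int main(void)
{
    char out[26];
    tile_pattern(out, sizeof(out), "abcdefghij", 10);
    printf("%.*s\n", (int)sizeof(out), out);   /* abcdefghijabcdefghijabcdef */
    return 0;
}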
Re: [Qemu-devel] [PATCH v7] qemu-io: add pattern file for write command
Ping! On Jul 5 2019, at 1:21 pm, Denis Plotnikov wrote: The patch allows to provide a pattern file for write command. There was no similar ability before. Signed-off-by: Denis Plotnikov --- v7: * fix variable naming * make code more readable * extend help for write command v6: * the pattern file is read once to reduce io v5: * file name initiated with null to make compilers happy v4: * missing signed-off clause added v3: * missing file closing added * exclusive flags processing changed * buffer void* converted to char* to fix pointer arithmetics * file reading error processing added --- qemu-io-cmds.c | 86 ++ 1 file changed, 80 insertions(+), 6 deletions(-) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 09750a23ce..495170380a 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -343,6 +343,66 @@ static void *qemu_io_alloc(BlockBackend *blk, size_t len, int pattern) return buf; } +static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len, + char *file_name) +{ + char *buf, *buf_origin; + FILE *f = fopen(file_name, "r"); + int pattern_len; + + if (!f) { + printf("'%s': %s\n", file_name, strerror(errno)); + return NULL; + } + + if (qemuio_misalign) { + len += MISALIGN_OFFSET; + } + + buf_origin = buf = blk_blockalign(blk, len); + + pattern_len = fread(buf, sizeof(char), len, f); + + if (ferror(f)) { + printf("'%s': %s\n", file_name, strerror(errno)); + goto error; + } + + if (pattern_len == 0) { + printf("'%s' is empty\n", file_name); + goto error; + } + + fclose(f); + + if (len > pattern_len) { + char *file_buf = g_malloc(sizeof(char) * pattern_len); + memcpy(file_buf, buf, pattern_len); + len -= pattern_len; + buf += pattern_len; + + while (len > 0) { + size_t len_to_copy = MIN(pattern_len, len); + + memcpy(buf, file_buf, len_to_copy); + + len -= len_to_copy; + buf += len_to_copy; + } + qemu_vfree(file_buf); + } + + if (qemuio_misalign) { + buf_origin += MISALIGN_OFFSET; + } + + return buf_origin; + +error: + qemu_vfree(buf_origin); + return NULL; +} + static void qemu_io_free(void *p) { if (qemuio_misalign) { @@ -949,6 +1009,7 @@ static void write_help(void) " -n, -- with -z, don't allow slow fallback\n" " -p, -- ignored for backwards compatibility\n" " -P, -- use different pattern to fill file\n" +" -s, -- use a pattern file to fill the write buffer\n" " -C, -- report statistics in a machine parsable format\n" " -q, -- quiet mode, do not show I/O statistics\n" " -u, -- with -z, allow unmapping\n" @@ -965,7 +1026,7 @@ static const cmdinfo_t write_cmd = { .perm = BLK_PERM_WRITE, .argmin = 2, .argmax = -1, - .args = "[-bcCfnquz] [-P pattern] off len", + .args = "[-bcCfnquz] [-P pattern | -s source_file] off len", .oneline = "writes a number of bytes at a specified offset", .help = write_help, }; @@ -974,7 +1035,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv) { struct timeval t1, t2; bool Cflag = false, qflag = false, bflag = false; - bool Pflag = false, zflag = false, cflag = false; + bool Pflag = false, zflag = false, cflag = false, sflag = false; int flags = 0; int c, cnt, ret; char *buf = NULL; @@ -983,8 +1044,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) /* Some compilers get confused and warn if this is not initialized. 
*/ int64_t total = 0; int pattern = 0xcd; + char *file_name = NULL; - while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) { + while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) { switch (c) { case 'b': bflag = true; @@ -1020,6 +1082,10 @@ static int write_f(BlockBackend *blk, int argc, char **argv) case 'z': zflag = true; break; + case 's': + sflag = true; + file_name = g_strdup(optarg); + break; default: qemuio_command_usage(&write_cmd); return -EINVAL; @@ -1051,8 +1117,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } - if (zflag && Pflag) { - printf("-z and -P cannot be specified at the same time\n"); + if ((int)zflag + (int)Pflag + (int)sflag > 1) { + printf("Only one of -z, -P, and -s" + "can be specified at the same time\n"); return -EINVAL; } @@ -1088,7 +1155,14 @@ static int write_f(BlockBackend *blk, int argc, char **argv) } if (!zflag) { - buf = qemu_io_alloc(blk, count, pattern); + if (sflag) { + buf = qemu_io_alloc_from_file(blk, count, file_name); + if (!buf) { + return -EINVAL; + } + } else { + buf = qemu_io_alloc(blk, count, pattern); + } } gettimeofday(&t1, NULL); -- 2.17.0
Re: [Qemu-devel] [PATCH v2 3/3] qcow2: add zstd cluster compression
On Jul 9 2019, at 9:18 am, Markus Armbruster wrote: Denis Plotnikov writes: zstd significantly reduces cluster compression time. It provides better compression performance maintaining the same level of compression ratio in comparison with zlib, which, by the moment, has been the only compression method available. The performance test results: Test compresses and decompresses qemu qcow2 image with just installed rhel-7.6 guest. Image cluster size: 64K. Image on disk size: 2.2G The test was conducted with brd disk to reduce the influence of disk subsystem to the test results. The results is given in seconds. compress cmd: time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd] src.img [zlib|zstd]_compressed.img decompress cmd time ./qemu-img convert -O qcow2 [zlib|zstd]_compressed.img uncompressed.img compression decompression zlib zstd zlib zstd real 65.5 16.3 (-75 %) 1.9 1.6 (-16 %) user 65.0 15.8 5.3 2.5 sys 3.3 0.2 2.0 2.0 Both ZLIB and ZSTD gave the same compression ratio: 1.57 compressed image size in both cases: 1.4G Signed-off-by: Denis Plotnikov [...] diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index 7cf068f814..4344e858cb 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -538,6 +538,9 @@ Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)): Another compressed cluster may map to the tail of the final sector used by this compressed cluster. + The layout of the compressed data depends on the compression + type used for the image (see compressed cluster layout). + If a cluster is unallocated, read requests shall read the data from the backing file (except if bit 0 in the Standard Cluster Descriptor is set). If there is no backing file or the backing file is smaller than the image, they shall read @@ -790,3 +793,19 @@ In the image file the 'enabled' state is reflected by the 'auto' flag. If this flag is set, the software must consider the bitmap as 'enabled' and start tracking virtual disk changes to this bitmap from the first write to the virtual disk. If this flag is not set then the bitmap is disabled. + +=== Compressed cluster layout === + +The compressed cluster data may have a different layout depending on the +compression type used for the image, and store specific data for the particular +compression type. + +Compressed data layout for the available compression types: +(x = data_space_length - 1) + + zlib: + Byte 0 - x: the compressed data content + all the space provided used for compressed data + zstd: + Byte 0 - 3: the length of compressed data + 4 - x: the compressed data content Adding <http://zlib.net/> and <http://github.com/facebook/zstd> here as well wouldn't hurt, would it? ok diff --git a/qapi/block-core.json b/qapi/block-core.json index 835dd3c37f..2021e03a84 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4215,11 +4215,12 @@ # Compression type used in qcow2 image file # # @zlib: zlib compression, see <http://zlib.net/> +# @zstd: zstd compression, see <http://github.com/facebook/zstd> # # Since: 4.1 ## { 'enum': 'Qcow2CompressionType', - 'data': [ 'zlib' ] } + 'data': [ 'zlib', { 'name': 'zstd', 'if': 'defined(CONFIG_ZSTD)' } ] } ## # @BlockdevCreateOptionsQcow2: QAPI schema Acked-by: Markus Armbruster
Re: [Qemu-devel] [PATCH v2 0/3] add zstd cluster compression
Hi all! Is there any other comments besides Markus's one about adding zlib/zstd links to compressed cluster layout description? On Jul 4 2019, at 4:09 pm, Denis Plotnikov wrote: change log: v2: * relax the compression type setting restriction in the spec * fix qcow2 header size checking * fix error processing and messaging * fix qcow2 image specific info reporting * set Qcow2CompressionType zstd config dependant * add zstd compressed cluster format description to the spec v1: * extend qcow2 header instead of adding a new incompatible extension header specification re-written accordingly * enable zstd compression via config * fix zstd (de)compression functions * fix comments/description * fix function naming --- The goal of the patch-set is to enable qcow2 to use zstd compression for clusters. ZSTD provides better (de)compression performance than currently used ZLIB. Using it will improve perforamnce (reduce compression time) when the compressed clusters is used, e.g backup scenarios. Also, the patch-set extends qcow2 specification by adding compression_type feature. The feature enables adding ZSTD and another compression algorithms in the future. Here is some measurements ZSTD vs ZLIB: The test: Test compresses and decompresses qemu qcow2 image with just installed rhel-7.6 guest. Image cluster size: 64K. Image on disk size: 2.2G The test was conducted with brd disk to reduce the influence of disk subsystem to the test results. The results is given in seconds. compress cmd: time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd] src.img [zlib|zstd]_compressed.img decompress cmd time ./qemu-img convert -O qcow2 [zlib|zstd]_compressed.img uncompressed.img The results: compression decompression zlib zstd zlib zstd real 65.5 16.3 (-75 %) 1.9 1.6 (-16 %) user 65.0 15.8 5.3 2.5 sys 3.3 0.2 2.0 2.0 Both ZLIB and ZSTD gave the same compression ratio: ~1.5 compressed image size in both cases: ~1.4G Denis Plotnikov (3): qcow2: introduce compression type feature qcow2: rework the cluster compression routine qcow2: add zstd cluster compression block/qcow2.c | 287 +++--- block/qcow2.h | 26 +++- configure | 32 + docs/interop/qcow2.txt | 40 +- include/block/block_int.h | 1 + qapi/block-core.json | 23 ++- 6 files changed, 379 insertions(+), 30 deletions(-) -- 2.17.0
[Qemu-devel] [PATCH v8] qemu-io: add pattern file for write command
The patch allows to provide a pattern file for write command. There was no similar ability before. Signed-off-by: Denis Plotnikov --- v8: fix according to Max's comments * get rid of unnecessary buffer for the pattern * buffer allocation just in bytes * take into account the missalign offset * don't copy file name * changed char* to const char* in input params v7: * fix variable naming * make code more readable * extend help for write command v6: * the pattern file is read once to reduce io v5: * file name initiated with null to make compilers happy v4: * missing signed-off clause added v3: * missing file closing added * exclusive flags processing changed * buffer void* converted to char* to fix pointer arithmetics * file reading error processing added --- qemu-io-cmds.c | 83 ++ 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 09750a23ce..940271ea00 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -343,6 +343,63 @@ static void *qemu_io_alloc(BlockBackend *blk, size_t len, int pattern) return buf; } +static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len, + const char *file_name) +{ +char *buf, *buf_origin; +FILE *f = fopen(file_name, "r"); +int pattern_len; + +if (!f) { +perror(file_name); +return NULL; +} + +if (qemuio_misalign) { +len += MISALIGN_OFFSET; +} + +buf_origin = buf = blk_blockalign(blk, len); + +if (qemuio_misalign) { +buf_origin += MISALIGN_OFFSET; +} + +pattern_len = fread(buf_origin, 1, len, f); + +if (ferror(f)) { +perror(file_name); +goto error; +} + +if (pattern_len == 0) { +fprintf(stderr, "%s: file is empty\n", file_name); +goto error; +} + +fclose(f); + +if (len > pattern_len) { +len -= pattern_len; +buf += pattern_len; + +while (len > 0) { +size_t len_to_copy = MIN(pattern_len, len); + +memcpy(buf, buf_origin, len_to_copy); + +len -= len_to_copy; +buf += len_to_copy; +} +} + +return buf_origin; + +error: +qemu_vfree(buf_origin); +return NULL; +} + static void qemu_io_free(void *p) { if (qemuio_misalign) { @@ -949,6 +1006,7 @@ static void write_help(void) " -n, -- with -z, don't allow slow fallback\n" " -p, -- ignored for backwards compatibility\n" " -P, -- use different pattern to fill file\n" +" -s, -- use a pattern file to fill the write buffer\n" " -C, -- report statistics in a machine parsable format\n" " -q, -- quiet mode, do not show I/O statistics\n" " -u, -- with -z, allow unmapping\n" @@ -965,7 +1023,7 @@ static const cmdinfo_t write_cmd = { .perm = BLK_PERM_WRITE, .argmin = 2, .argmax = -1, -.args = "[-bcCfnquz] [-P pattern] off len", +.args = "[-bcCfnquz] [-P pattern | -s source_file] off len", .oneline= "writes a number of bytes at a specified offset", .help = write_help, }; @@ -974,7 +1032,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv) { struct timeval t1, t2; bool Cflag = false, qflag = false, bflag = false; -bool Pflag = false, zflag = false, cflag = false; +bool Pflag = false, zflag = false, cflag = false, sflag = false; int flags = 0; int c, cnt, ret; char *buf = NULL; @@ -983,8 +1041,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) /* Some compilers get confused and warn if this is not initialized. 
*/ int64_t total = 0; int pattern = 0xcd; +const char *file_name = NULL; -while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) { +while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) { switch (c) { case 'b': bflag = true; @@ -1020,6 +1079,10 @@ static int write_f(BlockBackend *blk, int argc, char **argv) case 'z': zflag = true; break; +case 's': +sflag = true; +file_name = optarg; +break; default: qemuio_command_usage(&write_cmd); return -EINVAL; @@ -1051,8 +1114,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } -if (zflag && Pflag) { -printf("-z and -P cannot be specified at the same time\n"); +if ((int)zflag + (int)Pflag + (int)sflag > 1) { +printf("Only one of -z, -P, and -s" + "can be specified at the same time\n"); return -EINVAL; } @@ -1088,7 +1152,14 @@ static int writ
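For reference, with this patch applied the new option would be used roughly as follows (the image and pattern file names are made up for the example); as the updated usage string says, only one of -P, -z and -s can be given for a single write:

qemu-io -c 'write -s /path/to/pattern.bin 0 1M' test.img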
Re: [Qemu-devel] [PATCH v2 0/3] add zstd cluster compression
Ping! On Jul 30 2019, at 5:45 pm, Denis Plotnikov wrote: Hi all! Are there any other comments besides Markus's one about adding the zlib/zstd links to the compressed cluster layout description? On Jul 4 2019, at 4:09 pm, Denis Plotnikov wrote: change log: v2: * relax the compression type setting restriction in the spec * fix qcow2 header size checking * fix error processing and messaging * fix qcow2 image specific info reporting * set Qcow2CompressionType zstd config dependent * add zstd compressed cluster format description to the spec v1: * extend qcow2 header instead of adding a new incompatible extension; header specification re-written accordingly * enable zstd compression via config * fix zstd (de)compression functions * fix comments/description * fix function naming --- The goal of the patch-set is to enable qcow2 to use zstd compression for clusters. ZSTD provides better (de)compression performance than the currently used ZLIB. Using it will improve performance (reduce compression time) when compressed clusters are used, e.g. in backup scenarios. Also, the patch-set extends the qcow2 specification by adding the compression_type feature. The feature enables adding ZSTD and other compression algorithms in the future. Here are some measurements of ZSTD vs ZLIB: The test: compress and decompress a qemu qcow2 image with a freshly installed rhel-7.6 guest. Image cluster size: 64K. Image on disk size: 2.2G The test was conducted with a brd disk to reduce the influence of the disk subsystem on the test results. The results are given in seconds. compress cmd: time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd] src.img [zlib|zstd]_compressed.img decompress cmd: time ./qemu-img convert -O qcow2 [zlib|zstd]_compressed.img uncompressed.img The results: compression decompression zlib zstd zlib zstd real 65.5 16.3 (-75 %) 1.9 1.6 (-16 %) user 65.0 15.8 5.3 2.5 sys 3.3 0.2 2.0 2.0 Both ZLIB and ZSTD gave the same compression ratio: ~1.5 compressed image size in both cases: ~1.4G Denis Plotnikov (3): qcow2: introduce compression type feature qcow2: rework the cluster compression routine qcow2: add zstd cluster compression block/qcow2.c | 287 +++--- block/qcow2.h | 26 +++- configure | 32 + docs/interop/qcow2.txt | 40 +- include/block/block_int.h | 1 + qapi/block-core.json | 23 ++- 6 files changed, 379 insertions(+), 30 deletions(-) -- 2.17.0
Re: [PATCH v1 2/4] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 30.01.2020 17:58, Stefan Hajnoczi wrote: On Wed, Jan 29, 2020 at 05:07:00PM +0300, Denis Plotnikov wrote: The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/core/machine.c | 3 +++ include/hw/virtio/virtio.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 3e288bfceb..8bc401d8b7 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,9 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, +{ "vhost-blk-device", "virtqueue_size", "128"}, vhost-blk-device?! Who has this? It's not in qemu.git so please omit this line. ;-) So in this case the line: { "vhost-blk-device", "seg_max_adjust", "off"}, introduced by my patch: commit 1bf8a989a566b2ba41c197004ec2a02562a766a4 Author: Denis Plotnikov Date: Fri Dec 20 17:09:04 2019 +0300 virtio: make seg_max virtqueue size dependent is also wrong. It should be: { "vhost-scsi-device", "seg_max_adjust", "off"}, Am I right? On the other hand, do you want to do this for the vhost-user-blk, vhost-user-scsi, and vhost-scsi devices that exist in qemu.git? Those devices would benefit from better performance too. It seems to be so. We also have the test checking those settings: tests/acceptance/virtio_seg_max_adjust.py For now it checks virtio-scsi-pci and virtio-blk-pci. I'm going to extend it for the virtqueue size checking. If I change vhost-user-blk, vhost-user-scsi and vhost-scsi it's worth to check those devices too. But I don't know how to form a command line for that 3 devices since they should involve some third party components as backends (kernel modules, DPDK, etc.) and they seems to be not available in the qemu git. Is there any way to do it with some qit.qemu available stubs or something else? If so, could you please point out the proper way to do it? Thanks! Denis
Re: [PATCH v1 2/4] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 05.02.2020 14:19, Stefan Hajnoczi wrote: On Tue, Feb 04, 2020 at 12:59:04PM +0300, Denis Plotnikov wrote: On 30.01.2020 17:58, Stefan Hajnoczi wrote: On Wed, Jan 29, 2020 at 05:07:00PM +0300, Denis Plotnikov wrote: The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/core/machine.c | 3 +++ include/hw/virtio/virtio.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 3e288bfceb..8bc401d8b7 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,9 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, +{ "vhost-blk-device", "virtqueue_size", "128"}, vhost-blk-device?! Who has this? It's not in qemu.git so please omit this line. ;-) So in this case the line: { "vhost-blk-device", "seg_max_adjust", "off"}, introduced by my patch: commit 1bf8a989a566b2ba41c197004ec2a02562a766a4 Author: Denis Plotnikov Date: Fri Dec 20 17:09:04 2019 +0300 virtio: make seg_max virtqueue size dependent is also wrong. It should be: { "vhost-scsi-device", "seg_max_adjust", "off"}, Am I right? It's just called "vhost-scsi": include/hw/virtio/vhost-scsi.h:#define TYPE_VHOST_SCSI "vhost-scsi" On the other hand, do you want to do this for the vhost-user-blk, vhost-user-scsi, and vhost-scsi devices that exist in qemu.git? Those devices would benefit from better performance too. After thinking about that for a while, I think we shouldn't extend queue sizes for vhost-user-blk, vhost-user-scsi and vhost-scsi. This is because increasing the queue sizes seems to be just useless for them: the all thing is about increasing the queue sizes for increasing seg_max (it limits the max block query size from the guest). For virtio-blk-device and virtio-scsi-device it makes sense, since they have seg-max-adjust property which, if true, sets seg_max to virtqueue_size-2. vhost-scsi also have this property but it seems the property just doesn't affect anything (remove it?). Also vhost-user-blk, vhost-user-scsi and vhost-scsi don't do any seg_max settings. If I understand correctly, their backends are ment to be responsible for doing that. So, what about changing the queue sizes just for virtio-blk-device and virtio-scsi-device? Denis It seems to be so. We also have the test checking those settings: tests/acceptance/virtio_seg_max_adjust.py For now it checks virtio-scsi-pci and virtio-blk-pci. I'm going to extend it for the virtqueue size checking. If I change vhost-user-blk, vhost-user-scsi and vhost-scsi it's worth to check those devices too. But I don't know how to form a command line for that 3 devices since they should involve some third party components as backends (kernel modules, DPDK, etc.) and they seems to be not available in the qemu git. 
Is there any way to do it with some stubs available in qemu.git, or something else? If so, could you please point out the proper way to do it? qemu.git has contrib/vhost-user-blk/ and contrib/vhost-user-scsi/ if you need to test those vhost-user devices without external dependencies. Stefan
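To make the seg_max arithmetic from the discussion above concrete, here is a small sketch. It assumes 4 KiB guest pages and the worst case of one page per scatter-gather element; seg_max itself is the virtqueue size minus 2, as described above. The helper name and the numbers in the trailing comment are only illustrative.

#include <stdint.h>

#define GUEST_PAGE_SIZE 4096  /* assumption: 4 KiB guest pages */

/* Largest request payload if every segment maps a single page. */
static uint64_t max_request_bytes(uint16_t virtqueue_size)
{
    uint32_t seg_max = virtqueue_size - 2;  /* seg-max-adjust behaviour */
    return (uint64_t)seg_max * GUEST_PAGE_SIZE;
}

/* virtqueue_size = 128 -> 126 * 4 KiB = 504 KiB, so a 1 MiB guest request
 * is split into at least three virtio requests;
 * virtqueue_size = 256 -> 254 * 4 KiB = 1016 KiB, so two are enough. */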
Re: [PATCH v1 2/4] virtio: increase virtqueue size for virtio-scsi and virtio-blk
On 09.02.2020 10:49, Michael S. Tsirkin wrote: On Fri, Feb 07, 2020 at 11:48:05AM +0300, Denis Plotnikov wrote: On 05.02.2020 14:19, Stefan Hajnoczi wrote: On Tue, Feb 04, 2020 at 12:59:04PM +0300, Denis Plotnikov wrote: On 30.01.2020 17:58, Stefan Hajnoczi wrote: On Wed, Jan 29, 2020 at 05:07:00PM +0300, Denis Plotnikov wrote: The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/core/machine.c | 3 +++ include/hw/virtio/virtio.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 3e288bfceb..8bc401d8b7 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,9 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, +{ "vhost-blk-device", "virtqueue_size", "128"}, vhost-blk-device?! Who has this? It's not in qemu.git so please omit this line. ;-) So in this case the line: { "vhost-blk-device", "seg_max_adjust", "off"}, introduced by my patch: commit 1bf8a989a566b2ba41c197004ec2a02562a766a4 Author: Denis Plotnikov Date: Fri Dec 20 17:09:04 2019 +0300 virtio: make seg_max virtqueue size dependent is also wrong. It should be: { "vhost-scsi-device", "seg_max_adjust", "off"}, Am I right? It's just called "vhost-scsi": include/hw/virtio/vhost-scsi.h:#define TYPE_VHOST_SCSI "vhost-scsi" On the other hand, do you want to do this for the vhost-user-blk, vhost-user-scsi, and vhost-scsi devices that exist in qemu.git? Those devices would benefit from better performance too. After thinking about that for a while, I think we shouldn't extend queue sizes for vhost-user-blk, vhost-user-scsi and vhost-scsi. This is because increasing the queue sizes seems to be just useless for them: the all thing is about increasing the queue sizes for increasing seg_max (it limits the max block query size from the guest). For virtio-blk-device and virtio-scsi-device it makes sense, since they have seg-max-adjust property which, if true, sets seg_max to virtqueue_size-2. vhost-scsi also have this property but it seems the property just doesn't affect anything (remove it?). Also vhost-user-blk, vhost-user-scsi and vhost-scsi don't do any seg_max settings. If I understand correctly, their backends are ment to be responsible for doing that. The queue size is set by qemu IIRC. So, what about changing the queue sizes just for virtio-blk-device and virtio-scsi-device? Hmm that would break ability to migrate between userspace and vhost backends, would it not? I'm not sure I've understood what you meant. Just for the record, I was going to change virtqueue-size for virtio-blk-device and virtio-scsi-device since they can adjust seg_max to the specified queue size and I don't want to touch vhost-s and vhost-user-s since they don't have adjustable seg_max for now. Denis Denis It seems to be so. 
We also have the test checking those settings: tests/acceptance/virtio_seg_max_adjust.py For now it checks virtio-scsi-pci and virtio-blk-pci. I'm going to extend it to check the virtqueue size as well. If I change vhost-user-blk, vhost-user-scsi and vhost-scsi it's worth checking those devices too. But I don't know how to form a command line for those 3 devices, since they involve some third-party components as backends (kernel modules, DPDK, etc.) which seem to be unavailable in the qemu git. Is there any way to do it with some stubs available in qemu.git, or something else? If so, could you please point out the proper way to do it? qemu.git has contrib/vhost-user-blk/ and contrib/vhost-user-scsi/ if you need to test those vhost-user devices without external dependencies. Stefan
[PATCH] pc: remove erroneous seg_max_adjust setting for vhost-blk-device
vhost-blk-device isn't a part of qemu.git Signed-off-by: Denis Plotnikov --- hw/core/machine.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index d8e30e4895..2501b540ec 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -31,7 +31,6 @@ GlobalProperty hw_compat_4_2[] = { { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, { "virtio-blk-device", "seg-max-adjust", "off"}, { "virtio-scsi-device", "seg_max_adjust", "off"}, -{ "vhost-blk-device", "seg_max_adjust", "off"}, { "usb-host", "suppress-remote-wake", "off" }, { "usb-redir", "suppress-remote-wake", "off" }, }; -- 2.17.0
[PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk
The goal is to reduce the amount of requests issued by a guest on 1M reads/writes. This rises the performance up to 4% on that kind of disk access pattern. The maximum chunk size to be used for the guest disk accessing is limited with seg_max parameter, which represents the max amount of pices in the scatter-geather list in one guest disk request. Since seg_max is virqueue_size dependent, increasing the virtqueue size increases seg_max, which, in turn, increases the maximum size of data to be read/write from a guest disk. More details in the original problem statment: https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html Suggested-by: Denis V. Lunev Signed-off-by: Denis Plotnikov --- hw/block/virtio-blk.c | 4 ++-- hw/core/machine.c | 2 ++ hw/scsi/virtio-scsi.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 09f46ed85f..6df3a7a6df 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) memset(&blkcfg, 0, sizeof(blkcfg)); virtio_stq_p(vdev, &blkcfg.capacity, capacity); virtio_stl_p(vdev, &blkcfg.seg_max, - s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2); + s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2); virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls); virtio_stl_p(vdev, &blkcfg.blk_size, blk_size); virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size); @@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = { DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, true), DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), -DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128), +DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true), DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD, IOThread *), diff --git a/hw/core/machine.c b/hw/core/machine.c index 2501b540ec..3427d6cf4c 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -28,6 +28,8 @@ #include "hw/mem/nvdimm.h" GlobalProperty hw_compat_4_2[] = { +{ "virtio-blk-device", "queue-size", "128"}, +{ "virtio-scsi-device", "virtqueue_size", "128"}, { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" }, { "virtio-blk-device", "seg-max-adjust", "off"}, { "virtio-scsi-device", "seg_max_adjust", "off"}, diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 3b61563609..b38f50a429 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -660,7 +660,7 @@ static void virtio_scsi_get_config(VirtIODevice *vdev, virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues); virtio_stl_p(vdev, &scsiconf->seg_max, - s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 2); + s->conf.seg_max_adjust ? 
s->conf.virtqueue_size - 2 : 256 - 2); virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors); virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun); virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent)); @@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp) static Property virtio_scsi_properties[] = { DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1), DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI, - parent_obj.conf.virtqueue_size, 128), + parent_obj.conf.virtqueue_size, 256), DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI, parent_obj.conf.seg_max_adjust, true), DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors, -- 2.17.0
[PATCH v1 2/2] tests/acceptance/virtio_check_params: prepare to check different params
Signed-off-by: Denis Plotnikov --- tests/acceptance/virtio_check_params.py | 38 ++--- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tests/acceptance/virtio_check_params.py b/tests/acceptance/virtio_check_params.py index deec89bf86..e578952a97 100644 --- a/tests/acceptance/virtio_check_params.py +++ b/tests/acceptance/virtio_check_params.py @@ -43,7 +43,7 @@ VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'], EXCLUDED_MACHINES = ['none', 'isapc', 'microvm'] -class VirtioMaxSegSettingsCheck(Test): +class VirtioParamsCheck(Test): @staticmethod def make_pattern(props): pattern_items = ['{0} = \w+'.format(prop) for prop in props] @@ -75,12 +75,12 @@ class VirtioMaxSegSettingsCheck(Test): props[p[0]] = p[1] return query_ok, props, error -def check_mt(self, mt, dev_type_name): -mt['device'] = dev_type_name # Only for the debug() call. +def check_mt(self, mt, expected_vals, dev_type_name): +msg = "mt: %s dev: %s" % (mt, dev_type_name) # For debug() call only. logger = logging.getLogger('machine') -logger.debug(mt) +logger.debug(msg) with QEMUMachine(self.qemu_bin) as vm: -vm.set_machine(mt["name"]) +vm.set_machine(mt) vm.add_args('-nodefaults') for s in VM_DEV_PARAMS[dev_type_name]: vm.add_args(s) @@ -92,11 +92,15 @@ class VirtioMaxSegSettingsCheck(Test): error = sys.exc_info()[0] if not query_ok: -self.fail('machine type {0}: {1}'.format(mt['name'], error)) +self.fail('machine type {0}: {1}'.format(mt, error)) for prop_name, prop_val in props.items(): -expected_val = mt[prop_name] -self.assertEqual(expected_val, prop_val) +expected_val = expected_vals[prop_name] +msg = 'Property value mismatch for (MT: {0}, '\ + 'property name: {1}): expected value: "{2}" '\ + 'actual value: "{3}"'\ + .format(mt, prop_name, expected_val, prop_val) +self.assertEqual(expected_val, prop_val, msg) @staticmethod def seg_max_adjust_enabled(mt): @@ -128,25 +132,27 @@ class VirtioMaxSegSettingsCheck(Test): @skip("break multi-arch CI") def test_machine_types(self): -# collect all machine types except 'none', 'isapc', 'microvm' +# collect all machine types with QEMUMachine(self.qemu_bin) as vm: vm.launch() machines = [m['name'] for m in vm.command('query-machines')] vm.shutdown() +# ..and exclude non-relevant ones machines = self.filter_machines(machines) for dev_type in DEV_TYPES: -# create the list of machine types and their parameters. -mtypes = list() +# define expected parameters for each machine type +mt_expected_vals = dict() for m in machines: if self.seg_max_adjust_enabled(m): enabled = 'true' else: enabled = 'false' -mtypes.append({'name': m, - DEV_TYPES[dev_type]['seg_max_adjust']: enabled}) -# test each machine type for a device type -for mt in mtypes: -self.check_mt(mt, dev_type) +mt_expected_vals[m] = { +DEV_TYPES[dev_type]['seg_max_adjust']: enabled } + +# test each machine type +for mt in mt_expected_vals: +self.check_mt(mt, mt_expected_vals[mt], dev_type) -- 2.17.0
[PATCH v1 1/2] tests/acceptance/virtio_check_params: remove excluded machine types carefully
Before, the test failed if an excluded machine type was absent in the machine types lists. Signed-off-by: Denis Plotnikov --- tests/acceptance/virtio_check_params.py | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/acceptance/virtio_check_params.py b/tests/acceptance/virtio_check_params.py index 87e6c839d1..deec89bf86 100644 --- a/tests/acceptance/virtio_check_params.py +++ b/tests/acceptance/virtio_check_params.py @@ -40,6 +40,8 @@ VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'], '-drive', 'driver=null-co,id=drive0,if=none']} +EXCLUDED_MACHINES = ['none', 'isapc', 'microvm'] + class VirtioMaxSegSettingsCheck(Test): @staticmethod @@ -117,6 +119,13 @@ class VirtioMaxSegSettingsCheck(Test): return True return False +@staticmethod +def filter_machines(machines): +for mt in EXCLUDED_MACHINES: +if mt in machines: +machines.remove(mt) +return machines + @skip("break multi-arch CI") def test_machine_types(self): # collect all machine types except 'none', 'isapc', 'microvm' @@ -124,9 +133,8 @@ class VirtioMaxSegSettingsCheck(Test): vm.launch() machines = [m['name'] for m in vm.command('query-machines')] vm.shutdown() -machines.remove('none') -machines.remove('isapc') -machines.remove('microvm') + +machines = self.filter_machines(machines) for dev_type in DEV_TYPES: # create the list of machine types and their parameters. -- 2.17.0
[PATCH v1 0/2] Improve virtio_check_params test
* fixed failing on removal of a non-existent machine type * refactored the test to allow checking more parameters General questions left: how to restrict the test to run: 1. on a set of target OSes 2. on a set of target architectures Denis Plotnikov (2): tests/acceptance/virtio_check_params: remove excluded machine types carefully tests/acceptance/virtio_check_params: prepare to check different params tests/acceptance/virtio_check_params.py | 52 - 1 file changed, 33 insertions(+), 19 deletions(-) -- 2.17.0
Re: [PATCH v1 0/2] Improve virtio_check_params test
On 11.02.2020 17:37, Philippe Mathieu-Daudé wrote: Hi Denis, On 2/11/20 3:25 PM, Denis Plotnikov wrote: * fixed failing on removal of a non-existent machine type * refactored the test to allow checking more parameters General questions left: how to restrict the test to run: 1. on a set of target OSes 2. on a set of target architectures Denis Plotnikov (2): tests/acceptance/virtio_check_params: remove excluded machine types carefully tests/acceptance/virtio_check_params: prepare to check different params tests/acceptance/virtio_check_params.py | 52 - 1 file changed, 33 insertions(+), 19 deletions(-) Have you noticed my other series suggested by Cornelia? It runs your test on S390X and PPC: https://www.mail-archive.com/qemu-devel@nongnu.org/msg675092.html https://www.mail-archive.com/qemu-devel@nongnu.org/msg675095.html Hi Philippe, It seems I've missed them; I made my patches on top of the fresh master. Can I get a git tree which has those patches applied? Or should I wait until the patches land in qemu master and then rebase on them? Denis
Re: [PATCH v4] qapi/qmp: Add timestamps to qmp command responses
[ping] On 01.11.2022 18:37, Denis Plotnikov wrote: Add "start" & "end" time values to QMP command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This helps to look for problems poactively from the management layer side. The management layer would be able to detect problem cases by calculating QMP command execution time: 1. execution_time_from_mgmt_perspective - execution_time_of_qmp_command > some_threshold This detects problems with management layer or internal qemu QMP command dispatching 2. current_qmp_command_execution_time > avg_qmp_command_execution_time This detects that a certain QMP command starts to execute longer than usual In both these cases more thorough investigation of the root cases should be done by using some qemu tracepoints depending on particular QMP command under investigation or by other means. The timestamps help to avoid excessive log output when qemu tracepoints are used to address similar cases. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The response of the QMP command contains the start & end time of the QMP command processing. Also, "start" & "end" timestaps are added to qemu guest agent responses as qemu-ga shares the same code for request dispatching. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé --- v3->v4 - rewrite commit message [Markus] - use new fileds description in doc [Markus] - change type to int64_t [Markus] - simplify tests [Markus] v2->v3: - fix typo "timestaps -> timestamps" [Marc-André] v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description docs/interop/qmp-spec.txt | 28 ++-- qapi/qmp-dispatch.c| 18 ++ tests/qtest/qmp-test.c | 32 tests/unit/test-qga.c | 29 + tests/unit/test-qmp-cmds.c | 4 ++-- 5 files changed, 107 insertions(+), 4 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..0dd8e716c02f0 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. 
+ It is a json-object with the number of seconds and microseconds + since the Unix epoch +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -184,6
[PING] [PATCH v4] qapi/qmp: Add timestamps to qmp command responses
On 10.01.2023 13:32, Denis Plotnikov wrote: [ping] On 01.11.2022 18:37, Denis Plotnikov wrote: Add "start" & "end" time values to QMP command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This helps to look for problems poactively from the management layer side. The management layer would be able to detect problem cases by calculating QMP command execution time: 1. execution_time_from_mgmt_perspective - execution_time_of_qmp_command > some_threshold This detects problems with management layer or internal qemu QMP command dispatching 2. current_qmp_command_execution_time > avg_qmp_command_execution_time This detects that a certain QMP command starts to execute longer than usual In both these cases more thorough investigation of the root cases should be done by using some qemu tracepoints depending on particular QMP command under investigation or by other means. The timestamps help to avoid excessive log output when qemu tracepoints are used to address similar cases. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The response of the QMP command contains the start & end time of the QMP command processing. Also, "start" & "end" timestaps are added to qemu guest agent responses as qemu-ga shares the same code for request dispatching. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé --- v3->v4 - rewrite commit message [Markus] - use new fileds description in doc [Markus] - change type to int64_t [Markus] - simplify tests [Markus] v2->v3: - fix typo "timestaps -> timestamps" [Marc-André] v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description docs/interop/qmp-spec.txt | 28 ++-- qapi/qmp-dispatch.c| 18 ++ tests/qtest/qmp-test.c | 32 tests/unit/test-qga.c | 29 + tests/unit/test-qmp-cmds.c | 4 ++-- 5 files changed, 107 insertions(+), 4 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..0dd8e716c02f0 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. 
+ It is a json-object with the number of seconds and microseconds + since the Unix epoch +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds&quo
[PATCH v1 0/2] vl: flush all tasks from rcu queue before exiting
v0 -> v1: * move monitor cleanup to the very end of qemu cleanup [Paolo] The goal is to notify the management layer about device destruction on qemu shutdown. Without this series the DEVICE_DELETED event may not be sent because of tasks stuck in the rcu thread. The rcu tasks may get stuck on qemu shutdown because the rcu thread does not always have enough time to run them. Denis Plotnikov (2): monitor: move monitor destruction to the very end of qemu cleanup vl: flush all tasks from rcu queue before exiting include/qemu/rcu.h | 1 + monitor/monitor.c | 6 ++ softmmu/runstate.c | 4 +++- util/rcu.c | 12 4 files changed, 22 insertions(+), 1 deletion(-) -- 2.25.1
[PATCH v1 1/2] monitor: move monitor destruction to the very end of qemu cleanup
This is needed to keep sending DEVICE_DELETED events on qemu cleanup. The event may happen in the rcu thread and we're going to flush the rcu queue explicitly before qemu exiting in the next patch. So move the monitor destruction to the very end of qemu cleanup to be able to send all the events. Signed-off-by: Denis Plotnikov --- monitor/monitor.c | 6 ++ softmmu/runstate.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/monitor/monitor.c b/monitor/monitor.c index 21c7a68758f5..b04ae4850db2 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -605,11 +605,17 @@ void monitor_data_init(Monitor *mon, bool is_qmp, bool skip_flush, mon->outbuf = g_string_new(NULL); mon->skip_flush = skip_flush; mon->use_io_thread = use_io_thread; +/* + * take an extra ref to prevent monitor's chardev + * from destroying in qemu_chr_cleanup() + */ +object_ref(OBJECT(mon->chr.chr)); } void monitor_data_destroy(Monitor *mon) { g_free(mon->mon_cpu_path); +object_unref(OBJECT(mon->chr.chr)); qemu_chr_fe_deinit(&mon->chr, false); if (monitor_is_qmp(mon)) { monitor_data_destroy_qmp(container_of(mon, MonitorQMP, common)); diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 10d9b7365aa7..8d29dd2c00e2 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -819,8 +819,8 @@ void qemu_cleanup(void) tpm_cleanup(); net_cleanup(); audio_cleanup(); -monitor_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); +monitor_cleanup(); /* TODO: unref root container, check all devices are ok */ } -- 2.25.1
[PATCH v1 2/2] vl: flush all tasks from rcu queue before exiting
The device destruction may superimpose over qemu shutdown. In this case some management layer, requested a device unplug and waiting for DEVICE_DELETED event, may never get this event. This happens because device_finalize() may never be called on qemu shutdown for some devices using address_space_destroy(). The later is called from the rcu thread. On qemu shutdown, not all rcu callbacks may be called because the rcu thread may not have enough time to converge before qemu main thread exit. To resolve this issue this patch makes rcu thread to finish all its callbacks explicitly by calling a new rcu intreface function right before qemu main thread exit. Signed-off-by: Denis Plotnikov --- include/qemu/rcu.h | 1 + softmmu/runstate.c | 2 ++ util/rcu.c | 12 3 files changed, 15 insertions(+) diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h index 515d327cf11c..f7fbdc3781e5 100644 --- a/include/qemu/rcu.h +++ b/include/qemu/rcu.h @@ -134,6 +134,7 @@ struct rcu_head { extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func); extern void drain_call_rcu(void); +extern void flush_rcu(void); /* The operands of the minus operator must have the same type, * which must be the one that we specify in the cast. diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 8d29dd2c00e2..3f833678f6eb 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -821,6 +821,8 @@ void qemu_cleanup(void) audio_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); +/* finish all the tasks from rcu queue before exiting */ +flush_rcu(); monitor_cleanup(); /* TODO: unref root container, check all devices are ok */ } diff --git a/util/rcu.c b/util/rcu.c index 13ac0f75cb2a..f047f8ee8d16 100644 --- a/util/rcu.c +++ b/util/rcu.c @@ -348,6 +348,18 @@ void drain_call_rcu(void) } +/* + * This function drains rcu queue until there are no tasks to do left + * and aims to the cases when one needs to ensure that no work hang + * in rcu thread before proceeding, e.g. on qemu shutdown. + */ +void flush_rcu(void) +{ +while (qatomic_read(&rcu_call_count) > 0) { +drain_call_rcu(); +} +} + void rcu_register_thread(void) { assert(rcu_reader.ctr == 0); -- 2.25.1
[Ping] [PATCH v1 0/2] vl: flush all tasks from rcu queue before exiting
Ping! On 15.11.2021 12:41, Denis Plotnikov wrote: v0 -> v1: * move monitor cleanup to the very end of qemu cleanup [Paolo] The goal is to notify the management layer about device destruction on qemu shutdown. Without this series the DEVICE_DELETED event may not be sent because of tasks stuck in the rcu thread. The rcu tasks may get stuck on qemu shutdown because the rcu thread does not always have enough time to run them. Denis Plotnikov (2): monitor: move monitor destruction to the very end of qemu cleanup vl: flush all tasks from rcu queue before exiting include/qemu/rcu.h | 1 + monitor/monitor.c | 6 ++ softmmu/runstate.c | 4 +++- util/rcu.c | 12 4 files changed, 22 insertions(+), 1 deletion(-)
[PING][Ping] [PATCH v1 0/2] vl: flush all tasks from rcu queue before exiting
ping ping On 19.11.2021 12:42, Denis Plotnikov wrote: Ping! On 15.11.2021 12:41, Denis Plotnikov wrote: v0 -> v1: * move monitor cleanup to the very end of qemu cleanup [Paolo] The goal is to notify the management layer about device destruction on qemu shutdown. Without this series the DEVICE_DELETED event may not be sent because of tasks stuck in the rcu thread. The rcu tasks may get stuck on qemu shutdown because the rcu thread does not always have enough time to run them. Denis Plotnikov (2): monitor: move monitor destruction to the very end of qemu cleanup vl: flush all tasks from rcu queue before exiting include/qemu/rcu.h | 1 + monitor/monitor.c | 6 ++ softmmu/runstate.c | 4 +++- util/rcu.c | 12 4 files changed, 22 insertions(+), 1 deletion(-)
[PING][PATCH v5] qapi/qmp: Add timestamps to qmp command responses
Hi all! It seems that this series has come through a number of reviews and got some "reviewed-by". Is there any flaws to fix preventing to merge this series? Thanks, Denis On 26.04.2023 17:08, Denis Plotnikov wrote: Add "start" & "end" time values to QMP command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This helps to look for problems poactively from the management layer side. The management layer would be able to detect problem cases by calculating QMP command execution time: 1. execution_time_from_mgmt_perspective - execution_time_of_qmp_command > some_threshold This detects problems with management layer or internal qemu QMP command dispatching 2. current_qmp_command_execution_time > avg_qmp_command_execution_time This detects that a certain QMP command starts to execute longer than usual In both these cases more thorough investigation of the root cases should be done by using some qemu tracepoints depending on particular QMP command under investigation or by other means. The timestamps help to avoid excessive log output when qemu tracepoints are used to address similar cases. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The response of the QMP command contains the start & end time of the QMP command processing. Also, "start" & "end" timestaps are added to qemu guest agent responses as qemu-ga shares the same code for request dispatching. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé --- v4->v5: - use json-number instead of json-value for time values [Vladimir] - use a new util function for timestamp printing [Vladimir] v3->v4: - rewrite commit message [Markus] - use new fileds description in doc [Markus] - change type to int64_t [Markus] - simplify tests [Markus] v2->v3: - fix typo "timestaps -> timestamps" [Marc-André] v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description --- docs/interop/qmp-spec.txt | 28 ++-- include/qapi/util.h| 2 ++ qapi/qapi-util.c | 11 +++ qapi/qmp-dispatch.c| 11 +++ qapi/qmp-event.c | 6 +- tests/qtest/qmp-test.c | 32 tests/unit/test-qga.c | 29 + tests/unit/test-qmp-cmds.c | 4 ++-- 8 files changed, 114 insertions(+), 9 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..ed204b53373e5 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. 
The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-number, "microseconds": json-number}, + "end": {"seconds": json-number, "microseconds": json-number} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch 2.4.2 error --- The format of an error response is: -{ &q
[PATCH v5] qapi/qmp: Add timestamps to qmp command responses
Add "start" & "end" time values to QMP command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This helps to look for problems poactively from the management layer side. The management layer would be able to detect problem cases by calculating QMP command execution time: 1. execution_time_from_mgmt_perspective - execution_time_of_qmp_command > some_threshold This detects problems with management layer or internal qemu QMP command dispatching 2. current_qmp_command_execution_time > avg_qmp_command_execution_time This detects that a certain QMP command starts to execute longer than usual In both these cases more thorough investigation of the root cases should be done by using some qemu tracepoints depending on particular QMP command under investigation or by other means. The timestamps help to avoid excessive log output when qemu tracepoints are used to address similar cases. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The response of the QMP command contains the start & end time of the QMP command processing. Also, "start" & "end" timestaps are added to qemu guest agent responses as qemu-ga shares the same code for request dispatching. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé --- v4->v5: - use json-number instead of json-value for time values [Vladimir] - use a new util function for timestamp printing [Vladimir] v3->v4: - rewrite commit message [Markus] - use new fileds description in doc [Markus] - change type to int64_t [Markus] - simplify tests [Markus] v2->v3: - fix typo "timestaps -> timestamps" [Marc-André] v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description --- docs/interop/qmp-spec.txt | 28 ++-- include/qapi/util.h| 2 ++ qapi/qapi-util.c | 11 +++ qapi/qmp-dispatch.c| 11 +++ qapi/qmp-event.c | 6 +- tests/qtest/qmp-test.c | 32 tests/unit/test-qga.c | 29 + tests/unit/test-qmp-cmds.c | 4 ++-- 8 files changed, 114 insertions(+), 9 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..ed204b53373e5 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-number, "microseconds": json-number}, + "end": {"seconds": json-number, "microseconds": json-number} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. 
This excludes any time the + command request spent queued, after reading it off the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-number, &q
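As a rough sketch of check (1) from the commit message, assume the management layer records its own timestamps around the QMP round trip and has already converted the "start"/"end" members of the response into microseconds; the function and variable names here are made up for illustration.

#include <stdbool.h>
#include <stdint.h>

/* All values are microseconds since the Unix epoch. */
static bool qmp_dispatch_suspicious(int64_t mgmt_sent, int64_t mgmt_received,
                                    int64_t qmp_start, int64_t qmp_end,
                                    int64_t threshold)
{
    int64_t total = mgmt_received - mgmt_sent; /* as seen by the mgmt layer */
    int64_t in_command = qmp_end - qmp_start;  /* as reported by qemu */

    /* A large gap points at the transport, the management layer itself or
     * qemu's QMP queueing/dispatching rather than the command handler. */
    return total - in_command > threshold;
}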
[PATCH v0 0/2] virtio-blk and vhost-user-blk cross-device migration
This might be useful when a slow block layer should be replaced with a more performant one on a running VM without stopping it, i.e. with a very low downtime comparable to that of migration. It is possible to achieve this for two reasons: 1. The VMStates of "virtio-blk" and "vhost-user-blk" are almost the same. They consist of the identical VMSTATE_VIRTIO_DEVICE and differ from each other only in the values of migration service fields. 2. The device driver used in the guest is the same: virtio-blk. In this series cross-migration is achieved by adding a new type. The new type uses the virtio-blk VMState instead of the vhost-user-blk specific VMState, and it also implements migration save/load callbacks to be compatible with the migration stream produced by the "virtio-blk" device. Adding a new type instead of modifying the existing one is convenient. It makes it easy to distinguish the new virtio-blk-compatible vhost-user-blk device from the existing non-compatible one using qemu machinery, without any other modifications. That gives all the variety of qemu device-related constraints out of the box. 0001: adds the new type "vhost-user-virtio-blk" 0002: adds the new type "vhost-user-virtio-blk-pci" Denis Plotnikov (2): vhost-user-blk: add a new vhost-user-virtio-blk type vhost-user-blk-pci: add new pci device type to support vhost-user-virtio-blk hw/block/vhost-user-blk.c | 63 ++ hw/virtio/vhost-user-blk-pci.c | 43 ++-- include/hw/virtio/vhost-user-blk.h | 2 + 3 files changed, 105 insertions(+), 3 deletions(-) -- 2.25.1
[PATCH v0 2/2] vhost-user-blk-pci: add new pci device type to support vhost-user-virtio-blk
To allow the recently added vhost-user-virtio-blk work via virtio-pci. This patch refactors the vhost-user-blk-pci object model to reuse the existing code. Signed-off-by: Denis Plotnikov --- hw/virtio/vhost-user-blk-pci.c | 43 +++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/hw/virtio/vhost-user-blk-pci.c b/hw/virtio/vhost-user-blk-pci.c index 33b404d8a225..2f68296af22f 100644 --- a/hw/virtio/vhost-user-blk-pci.c +++ b/hw/virtio/vhost-user-blk-pci.c @@ -34,10 +34,18 @@ typedef struct VHostUserBlkPCI VHostUserBlkPCI; /* * vhost-user-blk-pci: This extends VirtioPCIProxy. */ +#define TYPE_VHOST_USER_BLK_PCI_ABSTRACT "vhost-user-blk-pci-abstract-base" +#define VHOST_USER_BLK_PCI_ABSTRACT(obj) \ +OBJECT_CHECK(VHostUserBlkPCI, (obj), TYPE_VHOST_USER_BLK_PCI_ABSTRACT) + #define TYPE_VHOST_USER_BLK_PCI "vhost-user-blk-pci-base" DECLARE_INSTANCE_CHECKER(VHostUserBlkPCI, VHOST_USER_BLK_PCI, TYPE_VHOST_USER_BLK_PCI) +#define TYPE_VHOST_USER_VIRTIO_BLK_PCI "vhost-user-virtio-blk-pci-base" +#define VHOST_USER_VIRTIO_BLK_PCI(obj) \ +OBJECT_CHECK(VHostUserBlkPCI, (obj), TYPE_VHOST_USER_VIRTIO_BLK_PCI) + struct VHostUserBlkPCI { VirtIOPCIProxy parent_obj; VHostUserBlk vdev; @@ -52,7 +60,7 @@ static Property vhost_user_blk_pci_properties[] = { static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) { -VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(vpci_dev); +VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI_ABSTRACT(vpci_dev); DeviceState *vdev = DEVICE(&dev->vdev); if (dev->vdev.num_queues == VHOST_USER_BLK_AUTO_NUM_QUEUES) { @@ -66,7 +74,8 @@ static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) qdev_realize(vdev, BUS(&vpci_dev->bus), errp); } -static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data) +static void vhost_user_blk_pci_abstract_class_init(ObjectClass *klass, + void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); @@ -81,6 +90,12 @@ static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data) pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; } +static const VirtioPCIDeviceTypeInfo vhost_user_blk_pci_abstract_info = { +.base_name = TYPE_VHOST_USER_BLK_PCI_ABSTRACT, +.instance_size = sizeof(VHostUserBlkPCI), +.class_init = vhost_user_blk_pci_abstract_class_init, +}; + static void vhost_user_blk_pci_instance_init(Object *obj) { VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(obj); @@ -92,18 +107,40 @@ static void vhost_user_blk_pci_instance_init(Object *obj) } static const VirtioPCIDeviceTypeInfo vhost_user_blk_pci_info = { +.parent = TYPE_VHOST_USER_BLK_PCI_ABSTRACT, .base_name = TYPE_VHOST_USER_BLK_PCI, .generic_name= "vhost-user-blk-pci", .transitional_name = "vhost-user-blk-pci-transitional", .non_transitional_name = "vhost-user-blk-pci-non-transitional", .instance_size = sizeof(VHostUserBlkPCI), .instance_init = vhost_user_blk_pci_instance_init, -.class_init = vhost_user_blk_pci_class_init, +}; + +static void vhost_user_virtio_blk_pci_instance_init(Object *obj) +{ +VHostUserBlkPCI *dev = VHOST_USER_VIRTIO_BLK_PCI(obj); + +virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), +TYPE_VHOST_USER_VIRTIO_BLK); +object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_virtio_blk_pci_info = { +.parent = TYPE_VHOST_USER_BLK_PCI_ABSTRACT, +.base_name = TYPE_VHOST_USER_VIRTIO_BLK_PCI, +.generic_name= "vhost-user-virtio-blk-pci", +.transitional_name = "vhost-user-virtio-blk-pci-transitional", 
+.non_transitional_name = "vhost-user-virtio-blk-pci-non-transitional", +.instance_size = sizeof(VHostUserBlkPCI), +.instance_init = vhost_user_virtio_blk_pci_instance_init, }; static void vhost_user_blk_pci_register(void) { +virtio_pci_types_register(&vhost_user_blk_pci_abstract_info); virtio_pci_types_register(&vhost_user_blk_pci_info); +virtio_pci_types_register(&vhost_user_virtio_blk_pci_info); } type_init(vhost_user_blk_pci_register) -- 2.25.1
[PATCH v0 1/2] vhost-user-blk: add a new vhost-user-virtio-blk type
The main reason for adding a new type is to make cross-device live migration between "virtio-blk" and "vhost-user-blk" devices possible in both directions. It might be useful for the cases when a slow block layer should be replaced with a more performant one on a running VM without stopping it, i.e. with very low downtime comparable with the one on migration. It's possible to achieve that for two reasons: 1. The VMStates of "virtio-blk" and "vhost-user-blk" are almost the same. They consist of the identical VMSTATE_VIRTIO_DEVICE and differ from each other in the values of migration service fields only. 2. The device driver used in the guest is the same: virtio-blk. The new type uses the virtio-blk VMState instead of the vhost-user-blk specific VMState; it also implements migration save/load callbacks to be compatible with the migration stream produced by the "virtio-blk" device. Adding the new vhost-user-blk type instead of modifying the existing one is convenient. It eases distinguishing the new virtio-blk-compatible vhost-user-blk device from the existing non-compatible one using qemu machinery without any other modifications. That gives all the variety of qemu device related constraints out of the box. Signed-off-by: Denis Plotnikov --- hw/block/vhost-user-blk.c | 63 ++ include/hw/virtio/vhost-user-blk.h | 2 + 2 files changed, 65 insertions(+) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index ba13cb87e520..877fe54e891f 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -30,6 +30,7 @@ #include "hw/virtio/virtio-access.h" #include "sysemu/sysemu.h" #include "sysemu/runstate.h" +#include "migration/qemu-file-types.h" #define REALIZE_CONNECTION_RETRIES 3 @@ -612,9 +613,71 @@ static const TypeInfo vhost_user_blk_info = { .class_init = vhost_user_blk_class_init, }; +/* + * this is the same as vmstate_virtio_blk + * we use it to allow virtio-blk <-> vhost-user-virtio-blk migration + */ +static const VMStateDescription vmstate_vhost_user_virtio_blk = { +.name = "virtio-blk", +.minimum_version_id = 2, +.version_id = 2, +.fields = (VMStateField[]) { +VMSTATE_VIRTIO_DEVICE, +VMSTATE_END_OF_LIST() +}, +}; + +static void vhost_user_virtio_blk_save(VirtIODevice *vdev, QEMUFile *f) +{ +/* + * put a zero byte in the stream to be compatible with virtio-blk + */ +qemu_put_sbyte(f, 0); +} + +static int vhost_user_virtio_blk_load(VirtIODevice *vdev, QEMUFile *f, + int version_id) +{ +if (qemu_get_sbyte(f)) { +/* + * on virtio-blk -> vhost-user-virtio-blk migration we don't expect + * to get any inflight requests in the migration stream because + * we can't load them yet.
+ * TODO: consider putting those inflight requests to inflight region + */ +error_report("%s: can't load in-flight requests", + TYPE_VHOST_USER_VIRTIO_BLK); +return -EINVAL; +} + +return 0; +} + +static void vhost_user_virtio_blk_class_init(ObjectClass *klass, void *data) +{ +DeviceClass *dc = DEVICE_CLASS(klass); +VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + +/* override vmstate of vhost_user_blk */ +dc->vmsd = &vmstate_vhost_user_virtio_blk; + +/* adding callbacks to be compatible with virtio-blk migration stream */ +vdc->save = vhost_user_virtio_blk_save; +vdc->load = vhost_user_virtio_blk_load; +} + +static const TypeInfo vhost_user_virtio_blk_info = { +.name = TYPE_VHOST_USER_VIRTIO_BLK, +.parent = TYPE_VHOST_USER_BLK, +.instance_size = sizeof(VHostUserBlk), +/* instance_init is the same as in parent type */ +.class_init = vhost_user_virtio_blk_class_init, +}; + static void virtio_register_types(void) { type_register_static(&vhost_user_blk_info); +type_register_static(&vhost_user_virtio_blk_info); } type_init(virtio_register_types) diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h index 7c91f15040eb..d81f18d22596 100644 --- a/include/hw/virtio/vhost-user-blk.h +++ b/include/hw/virtio/vhost-user-blk.h @@ -23,6 +23,8 @@ #include "qom/object.h" #define TYPE_VHOST_USER_BLK "vhost-user-blk" +#define TYPE_VHOST_USER_VIRTIO_BLK "vhost-user-virtio-blk" + OBJECT_DECLARE_SIMPLE_TYPE(VHostUserBlk, VHOST_USER_BLK) #define VHOST_USER_BLK_AUTO_NUM_QUEUES UINT16_MAX -- 2.25.1
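To make the intended migration scenario concrete, a rough sketch of the two ends could look like this; the image path, socket path and ids are made up, and the new device is assumed to take the same options as vhost-user-blk (the PCI wrapper comes from the companion patch):

# source VM: plain virtio-blk on a local image
-drive file=/images/disk.qcow2,if=none,id=drive0
-device virtio-blk-pci,drive=drive0,id=blk0

# destination VM: the same disk served by a vhost-user backend
-chardev socket,id=vublk0,path=/run/vhost-user-blk.sock
-device vhost-user-virtio-blk-pci,chardev=vublk0,id=blk0

# then, on the source QMP monitor:
{"execute": "migrate", "arguments": {"uri": "tcp:destination:4444"}}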
[PATCH v0] vl: flush all task from rcu queue before exiting
Device destruction may overlap with qemu shutdown. In this case a management layer that requested a device unplug and is waiting for the DEVICE_DELETED event may never get it. This happens because device_finalize() may never be called on qemu shutdown for some devices using address_space_destroy(). The latter is called from the rcu thread. On qemu shutdown, not all rcu callbacks may be called because the rcu thread may not have enough time to converge before the qemu main thread exits. To resolve this issue the patch makes the rcu thread finish all its callbacks explicitly by calling a new rcu interface function right before the qemu main thread exits. Signed-off-by: Denis Plotnikov --- include/qemu/rcu.h | 1 + softmmu/runstate.c | 3 +++ util/rcu.c | 12 3 files changed, 16 insertions(+) diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h index 515d327cf11c..f7fbdc3781e5 100644 --- a/include/qemu/rcu.h +++ b/include/qemu/rcu.h @@ -134,6 +134,7 @@ struct rcu_head { extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func); extern void drain_call_rcu(void); +extern void flush_rcu(void); /* The operands of the minus operator must have the same type, * which must be the one that we specify in the cast. diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 10d9b7365aa7..28f319a97a2b 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -822,5 +822,8 @@ void qemu_cleanup(void) monitor_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); + +/* finish all the tasks from rcu queue before exiting */ +flush_rcu(); /* TODO: unref root container, check all devices are ok */ } diff --git a/util/rcu.c b/util/rcu.c index 13ac0f75cb2a..f047f8ee8d16 100644 --- a/util/rcu.c +++ b/util/rcu.c @@ -348,6 +348,18 @@ void drain_call_rcu(void) } +/* + * This function drains rcu queue until there are no tasks to do left + * and aims to the cases when one needs to ensure that no work hang + * in rcu thread before proceeding, e.g. on qemu shutdown. + */ +void flush_rcu(void) +{ +while (qatomic_read(&rcu_call_count) > 0) { +drain_call_rcu(); +} +} + void rcu_register_thread(void) { assert(rcu_reader.ctr == 0); -- 2.25.1
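A minimal sketch of the pattern this interface is meant to cover; the device state type and its free function are made up for illustration, only call_rcu1() and flush_rcu() are the real interfaces touched by the patch:

typedef struct MyDevState {
    struct rcu_head rcu;   /* embedded callback head for call_rcu1() */
    void *resources;
} MyDevState;

static void mydev_free(struct rcu_head *head)
{
    MyDevState *s = container_of(head, MyDevState, rcu);
    /* the final step of destruction, after which DEVICE_DELETED can be sent */
    g_free(s->resources);
    g_free(s);
}

static void mydev_unrealize(MyDevState *s)
{
    /* defer freeing until all RCU readers are done with s */
    call_rcu1(&s->rcu, mydev_free);
}

/* qemu_cleanup() now ends with flush_rcu(), which keeps draining the queue
 * until mydev_free() and every other pending callback has run, so the
 * management layer reliably sees the corresponding events before exit. */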
Re: [PATCH v0] vl: flush all task from rcu queue before exiting
On 02.11.2021 16:39, Denis Plotnikov wrote: The device destruction may superimpose over qemu shutdown. In this case some management layer, requested a device unplug and waiting for DEVICE_DELETED event, may never get this event. This happens because device_finalize() may never be called on qemu shutdown for some devices using address_space_destroy(). The later is called from the rcu thread. On qemu shutdown, not all rcu callbacks may be called because the rcu thread may not have enough time to converge before qemu main thread exit. To resolve this issue this patch makes rcu thread to finish all its callbacks explicitly by calling a new rcu intreface function right before qemu main thread exit. Signed-off-by: Denis Plotnikov --- include/qemu/rcu.h | 1 + softmmu/runstate.c | 3 +++ util/rcu.c | 12 3 files changed, 16 insertions(+) diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h index 515d327cf11c..f7fbdc3781e5 100644 --- a/include/qemu/rcu.h +++ b/include/qemu/rcu.h @@ -134,6 +134,7 @@ struct rcu_head { extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func); extern void drain_call_rcu(void); +extern void flush_rcu(void); /* The operands of the minus operator must have the same type, * which must be the one that we specify in the cast. diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 10d9b7365aa7..28f319a97a2b 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -822,5 +822,8 @@ void qemu_cleanup(void) actually, flush_rcu() should be here before monitor_cleanup to send DEVICE_DELETED monitor_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); + +/* finish all the tasks from rcu queue before exiting */ +flush_rcu(); /* TODO: unref root container, check all devices are ok */ } diff --git a/util/rcu.c b/util/rcu.c index 13ac0f75cb2a..f047f8ee8d16 100644 --- a/util/rcu.c +++ b/util/rcu.c @@ -348,6 +348,18 @@ void drain_call_rcu(void) } +/* + * This function drains rcu queue until there are no tasks to do left + * and aims to the cases when one needs to ensure that no work hang + * in rcu thread before proceeding, e.g. on qemu shutdown. + */ +void flush_rcu(void) +{ +while (qatomic_read(&rcu_call_count) > 0) { +drain_call_rcu(); +} +} + void rcu_register_thread(void) { assert(rcu_reader.ctr == 0);
[Ping][PATCH v0] vl: flush all task from rcu queue before exiting
Ping ping! On 02.11.2021 16:39, Denis Plotnikov wrote: The device destruction may superimpose over qemu shutdown. In this case some management layer, requested a device unplug and waiting for DEVICE_DELETED event, may never get this event. This happens because device_finalize() may never be called on qemu shutdown for some devices using address_space_destroy(). The later is called from the rcu thread. On qemu shutdown, not all rcu callbacks may be called because the rcu thread may not have enough time to converge before qemu main thread exit. To resolve this issue this patch makes rcu thread to finish all its callbacks explicitly by calling a new rcu intreface function right before qemu main thread exit. Signed-off-by: Denis Plotnikov --- include/qemu/rcu.h | 1 + softmmu/runstate.c | 3 +++ util/rcu.c | 12 3 files changed, 16 insertions(+) diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h index 515d327cf11c..f7fbdc3781e5 100644 --- a/include/qemu/rcu.h +++ b/include/qemu/rcu.h @@ -134,6 +134,7 @@ struct rcu_head { extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func); extern void drain_call_rcu(void); +extern void flush_rcu(void); /* The operands of the minus operator must have the same type, * which must be the one that we specify in the cast. diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 10d9b7365aa7..28f319a97a2b 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -822,5 +822,8 @@ void qemu_cleanup(void) monitor_cleanup(); qemu_chr_cleanup(); user_creatable_cleanup(); + +/* finish all the tasks from rcu queue before exiting */ +flush_rcu(); /* TODO: unref root container, check all devices are ok */ } diff --git a/util/rcu.c b/util/rcu.c index 13ac0f75cb2a..f047f8ee8d16 100644 --- a/util/rcu.c +++ b/util/rcu.c @@ -348,6 +348,18 @@ void drain_call_rcu(void) } +/* + * This function drains rcu queue until there are no tasks to do left + * and aims to the cases when one needs to ensure that no work hang + * in rcu thread before proceeding, e.g. on qemu shutdown. + */ +void flush_rcu(void) +{ +while (qatomic_read(&rcu_call_count) > 0) { +drain_call_rcu(); +} +} + void rcu_register_thread(void) { assert(rcu_reader.ctr == 0);
Re: [Ping][PATCH v0] vl: flush all task from rcu queue before exiting
On 09.11.2021 20:46, Paolo Bonzini wrote: On 11/9/21 08:23, Denis Plotnikov wrote: Ping ping! Looks good, but can you explain why it's okay to call it before qemu_chr_cleanup() and user_creatable_cleanup()? I think a better solution to the ordering problem would be: qemu_chr_cleanup(); user_creatable_cleanup(); flush_rcu(); monitor_cleanup(); I agree, this looks better with something like this: diff --git a/chardev/char-fe.c b/chardev/char-fe.c index 7789f7be9c..f0c3ea5447 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -195,6 +195,7 @@ bool qemu_chr_fe_init(CharBackend *b, int tag = 0; if (s) { + object_ref(OBJECT(s)); if (CHARDEV_IS_MUX(s)) { MuxChardev *d = MUX_CHARDEV(s); @@ -241,6 +242,7 @@ void qemu_chr_fe_deinit(CharBackend *b, bool del) } else { object_unref(obj); } + object_unref(obj); } b->chr = NULL; } to keep the chardev live between qemu_chr_cleanup() and monitor_cleanup(). but frankly speaking I don't understand why we have to do ref/unref in char-fe interface functions, instead of just ref/uref-ing monitor's char device directly like this: diff --git a/monitor/monitor.c b/monitor/monitor.c index 21c7a68758f5..3692a8e15268 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -611,6 +611,7 @@ void monitor_data_destroy(Monitor *mon) { g_free(mon->mon_cpu_path); qemu_chr_fe_deinit(&mon->chr, false); + object_unref(OBJECT(&mon->chr)); if (monitor_is_qmp(mon)) { monitor_data_destroy_qmp(container_of(mon, MonitorQMP, common)); } else { @@ -737,6 +738,7 @@ int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp) error_propagate(errp, local_err); return -1; } + object_ref(OBJECT(chr)); return 0; } May be this shows the intentions better? Denis Paolo
[PATCH v4] qapi/qmp: Add timestamps to qmp command responses
Add "start" & "end" time values to QMP command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This helps to look for problems poactively from the management layer side. The management layer would be able to detect problem cases by calculating QMP command execution time: 1. execution_time_from_mgmt_perspective - execution_time_of_qmp_command > some_threshold This detects problems with management layer or internal qemu QMP command dispatching 2. current_qmp_command_execution_time > avg_qmp_command_execution_time This detects that a certain QMP command starts to execute longer than usual In both these cases more thorough investigation of the root cases should be done by using some qemu tracepoints depending on particular QMP command under investigation or by other means. The timestamps help to avoid excessive log output when qemu tracepoints are used to address similar cases. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The response of the QMP command contains the start & end time of the QMP command processing. Also, "start" & "end" timestaps are added to qemu guest agent responses as qemu-ga shares the same code for request dispatching. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé --- v3->v4 - rewrite commit message [Markus] - use new fileds description in doc [Markus] - change type to int64_t [Markus] - simplify tests [Markus] v2->v3: - fix typo "timestaps -> timestamps" [Marc-André] v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description docs/interop/qmp-spec.txt | 28 ++-- qapi/qmp-dispatch.c| 18 ++ tests/qtest/qmp-test.c | 32 tests/unit/test-qga.c | 29 + tests/unit/test-qmp-cmds.c | 4 ++-- 5 files changed, 107 insertions(+), 4 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..0dd8e716c02f0 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. 
+ It is a json-object with the number of seconds and microseconds + since the Unix epoch +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a json-object with the number of seconds and microseconds + since the Unix epoch 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -184,6 +198,16 @@ The format of an error response is: not attempt to parse this message. - The "id&
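As a worked example of what a client gains from these fields, the execution time of the query-status call shown above can be recovered with simple arithmetic; a standalone sketch in plain C, with the values taken from that example response:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    /* "start" and "end" values from the example query-status response */
    int64_t start_us = INT64_C(1650367305) * 1000000 + 831012;
    int64_t end_us   = INT64_C(1650367305) * 1000000 + 831032;

    /* prints "20 us": the server-side execution time of the command */
    printf("%" PRId64 " us\n", end_us - start_us);
    return 0;
}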
[patch v0] qapi/qmp: Add timestamps to qmp command responses.
Add "start" & "end" timestamps to qmp command responses. It's disabled by default, but can be enabled with 'timestamp=on' monitor's parameter, e.g.: -chardev socket,id=mon1,path=/tmp/qmp.socket,server=on,wait=off -mon chardev=mon1,mode=control,timestamp=on Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The responce of the qmp command contains the start & end time of the qmp command processing. These times may be helpful for the management layer in understanding of the actual timeline of a qmp command processing. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov --- include/monitor/monitor.h | 2 +- include/qapi/qmp/dispatch.h | 2 +- monitor/monitor-internal.h | 1 + monitor/monitor.c | 9 - monitor/qmp.c | 5 +++-- qapi/control.json | 3 +++ qapi/qmp-dispatch.c | 28 +++- qga/main.c | 2 +- stubs/monitor-core.c| 2 +- tests/unit/test-qmp-cmds.c | 6 +++--- 10 files changed, 49 insertions(+), 11 deletions(-) diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index a4b40e8391db4..2a18e9ee34bc2 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -19,7 +19,7 @@ bool monitor_cur_is_qmp(void); void monitor_init_globals(void); void monitor_init_globals_core(void); -void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp); +void monitor_init_qmp(Chardev *chr, bool pretty, bool timestamp, Error **errp); void monitor_init_hmp(Chardev *chr, bool use_readline, Error **errp); int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp); int monitor_init_opts(QemuOpts *opts, Error **errp); diff --git a/include/qapi/qmp/dispatch.h b/include/qapi/qmp/dispatch.h index 1e4240fd0dbc0..d07f5764271be 100644 --- a/include/qapi/qmp/dispatch.h +++ b/include/qapi/qmp/dispatch.h @@ -56,7 +56,7 @@ const char *qmp_command_name(const QmpCommand *cmd); bool qmp_has_success_response(const QmpCommand *cmd); QDict *qmp_error_response(Error *err); QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request, -bool allow_oob, Monitor *cur_mon); +bool allow_oob, bool timestamp, Monitor *cur_mon); bool qmp_is_oob(const QDict *dict); typedef void (*qmp_cmd_callback_fn)(const QmpCommand *cmd, void *opaque); diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h index caa2e90ef22a4..69425a7bc8152 100644 --- a/monitor/monitor-internal.h +++ b/monitor/monitor-internal.h @@ -136,6 +136,7 @@ typedef struct { Monitor common; JSONMessageParser parser; bool pretty; +bool timestamp; /* * When a client connects, we're in capabilities negotiation mode. * @commands is &qmp_cap_negotiation_commands then. 
When command diff --git a/monitor/monitor.c b/monitor/monitor.c index 86949024f643a..85a0b6498dbc1 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -726,7 +726,7 @@ int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp) switch (opts->mode) { case MONITOR_MODE_CONTROL: -monitor_init_qmp(chr, opts->pretty, &local_err); +monitor_init_qmp(chr, opts->pretty, opts->timestamp, &local_err); break; case MONITOR_MODE_READLINE: if (!allow_hmp) { @@ -737,6 +737,10 @@ int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp) error_setg(errp, "'pretty' is not compatible with HMP monitors"); return -1; } +if (opts->timestamp) { +error_setg(errp, "'timestamp' is not compatible with HMP monitors"); +return -1; +} monitor_init_hmp(chr, true, &local_err); break; default: @@ -782,6 +786,9 @@ QemuOptsList qemu_mon_opts = { },{ .name = "pretty", .type = QEMU_OPT_BOOL, +},{ +.name = "timestamp", +.type = QEMU_OPT_BOOL, }, { /* end of list */ } }, diff --git a/monitor/qmp.c b/monitor/qmp.c index 092c527b6fc9c..fd487fee9f850 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -142,7 +142,7 @@ static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) QDict *error; rsp = qmp_dispatch(mon->commands, req, qmp_oob_enabled(mon), - &mon->common); + mon->timestamp, &mon->common); if (mon->commands == &qmp_cap_negotiation_commands) {
Re: [patch v0] qapi/qmp: Add timestamps to qmp command responses.
On 27.09.2022 09:04, Markus Armbruster wrote: Daniel P. Berrangé writes: On Mon, Sep 26, 2022 at 12:59:40PM +0300, Denis Plotnikov wrote: Add "start" & "end" timestamps to qmp command responses. It's disabled by default, but can be enabled with 'timestamp=on' monitor's parameter, e.g.: -chardev socket,id=mon1,path=/tmp/qmp.socket,server=on,wait=off -mon chardev=mon1,mode=control,timestamp=on I'm not convinced a cmdline flag is the right approach here. I think it ought to be something defined by the QMP spec. The QMP spec is docs/interop/qmp-spec.txt. The feature needs to be defined there regardless of how we control it. ok, thanks for pointing that out The "QMP" greeting should report "timestamp" capabilities. The 'qmp_capabilities' command can be used to turn on this capability for all commands henceforth. Yes, this is how optional QMP protocol features should be controlled. Bonus: control is per connection, not just globally. As an optional extra, the 'execute' command could gain a parameter to allow this to be requested for only an individual command. Needs a use case. Alternatively we could say the overhead of adding the timestamps is small enough that we just add this unconditionally for everything hence, with no opt-in/opt-out. Yes, because the extension is backwards compatible. Maybe it's worth sending the timestamps always in the response if that doesn't contradict anything and doesn't bring any unnecessary data overhead. On the other hand, turning it on via qmp capabilities seems to be a more flexible solution. Aside: qmp-spec.txt could be clearer on what that means. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The response of the qmp command contains the start & end time of the qmp command processing. Seconds and microseconds since when? The update to qmp-spec.txt should tell. Why split the time into seconds and microseconds? If you use microseconds since the Unix epoch (1970-01-01 UTC), 64 bit unsigned will result in a year 586524 problem: $ date --date "@`echo '2^64/1000000' | bc`" Wed Jan 19 09:01:49 CET 586524 Even a mere 53 bits will last until 2255. This is just for convenience; maybe it's too much and a timestamp in msec is enough. These times may be helpful for the management layer in understanding the actual timeline of qmp command processing. Can you explain the problem scenario in more detail. Yes, please, because: The mgmt app already knows when it sends the QMP command and knows when it gets the QMP reply. This covers the time the QMP command was queued before processing (might be large if QMP is blocked on another slow command), the processing time, and the time any reply was queued before sending (ought to be small). So IIUC, the value these fields add is that they let the mgmt app extract only the command processing time, eliminating any variance due to queueing before/after. So the scenario is the following: we need a means to understand from the management layer perspective what the timeline of the command execution is. This is needed for problem resolving when a qmp command executes for too long from the management layer point of view. Specifically, the management layer sees the execution time as "management_layer_internal_routine_time" + "qemu_dispatching_time" + "qemu_qmp_command_execution_time".
The suggested qmp command timestamps give "qemu_command_execution_time". The management layer calculates "management_layer_internal_routine_time" internally. Using those two things we can calculate "qemu_dispatching_time" and decide where the potential delays come from. This gives us a direction for further problem investigation. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov
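To make the capability-based alternative discussed above concrete, the negotiation could look roughly like this; the "timestamp" capability name is purely hypothetical and not an implemented interface ("->" is client to server, "<-" is server to client, response values reuse the example above):

<- { "QMP": { "version": { ... }, "capabilities": ["oob", "timestamp"] } }
-> { "execute": "qmp_capabilities", "arguments": { "enable": ["timestamp"] } }
<- { "return": {} }
-> { "execute": "query-status" }
<- { "return": { "status": "running", "singlestep": false, "running": true },
     "start": { "seconds": 1650367305, "microseconds": 831012 },
     "end": { "seconds": 1650367305, "microseconds": 831032 } }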
[PATCH v1] qapi/qmp: Add timestamps to qmp command responses
Add "start" & "end" time values to qmp command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This is particulary useful for the management layer logging for later problems resolving. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The responce of the qmp command contains the start & end time of the qmp command processing. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov --- v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description docs/interop/qmp-spec.txt | 20 ++-- qapi/qmp-dispatch.c | 18 ++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..d1cca8bc447ce 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -169,13 +171,21 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the command has been + stated to be processed. It is a fixed json-object with time in + seconds and microseconds relative to the Unix Epoch (1 Jan 1970) +- The "end" member contains the exact time of when the command has been + finished to be processed. It is a fixed json-object with time in + seconds and microseconds relative to the Unix Epoch (1 Jan 1970) 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -184,6 +194,12 @@ The format of an error response is: not attempt to parse this message. - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the command has been + stated to be processed. It is a fixed json-object with time in + seconds and microseconds relative to the Unix Epoch (1 Jan 1970) +- The "end" member contains the exact time of when the command has been + finished to be processed. 
It is a fixed json-object with time in + seconds and microseconds relative to the Unix Epoch (1 Jan 1970) NOTE: Some errors can occur before the Server is able to read the "id" member, in these cases the "id" member will not be part of the error response, even diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 0990873ec8ec1..fce87416f2128 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -130,6 +130,22 @@ static void do_qmp_dispatch_bh(void *opaque) aio_co_wake(data->co); } +static void add_timestamps(QDict *qdict, uint64_t start_ms, uint64_t end_ms) +{ +QDict *start_dict, *end_dict; + +start_dict = qdict_new(); +qdict_put_int(start_dict, "seconds", start_ms / G_USEC_PER_SEC); +qdict_put_int(start_dict, "microseconds", start_ms % G_USEC_PER_SEC); + +end_dict = qdict_new(); +qdict_put_int(end_dict, "seconds", end_ms / G_USEC_PER_SEC); +qdict_put_int(end_dict, "microseconds", end_ms % G_USEC_PER_SEC); + +qdict_put_obj(qdict, "start", QOBJECT(start_dict)); +qdict_put_obj(qdict, "end", QOBJECT(end_dict)); +} + /* * Runs outside of coroutine context for OOB commands, but in coroutine * context for everything else. @@ -146,6 +162,7 @@ QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request, QObject *id; QObject *ret = NULL; QDict *rsp = NULL; +uint64_t ts_start = g_get_real_time(); dict = qobject_to(QDict, request); if (!dict) { @@ -270,5 +287,6 @@ out: qdict_put_obj(rsp, "id", qobject_ref(id)); } +add_timestamps(rsp, ts_start, g_get_real_time()); return rsp; } -- 2.25.1
[PATCH v2] qapi/qmp: Add timestamps to qmp command responses
Add "start" & "end" time values to qmp command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This is particulary useful for the management layer logging for later problems resolving. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The responce of the qmp command contains the start & end time of the qmp command processing. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov --- v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys docs/interop/qmp-spec.txt | 28 ++-- qapi/qmp-dispatch.c| 18 ++ tests/qtest/qmp-test.c | 34 ++ tests/unit/test-qga.c | 31 +++ tests/unit/test-qmp-cmds.c | 4 ++-- 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..2e0b7de0c4dc7 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -184,6 +198,16 @@ The format of an error response is: not attempt to parse this message. - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. 
+ It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) NOTE: Some errors can occur before the Server is able to read the "id" member, in these cases the "id" member will not be part of the error response, even diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 0990873ec8ec1..fce87416f2128 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -130,6 +130,22 @@ static void do_qmp_disp
[PATCH v3] qapi/qmp: Add timestamps to qmp command responses
Add "start" & "end" time values to qmp command responses. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals. This is particulary useful for the management layer logging for later problems resolving. Example of result: ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket (QEMU) query-status {"end": {"seconds": 1650367305, "microseconds": 831032}, "start": {"seconds": 1650367305, "microseconds": 831012}, "return": {"status": "running", "singlestep": false, "running": true}} The responce of the qmp command contains the start & end time of the qmp command processing. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé --- v0->v1: - remove interface to control "start" and "end" time values: return timestamps unconditionally - add description to qmp specification - leave the same timestamp format in "seconds", "microseconds" to be consistent with events timestamp - fix patch description v1->v2: - rephrase doc descriptions [Daniel] - add tests for qmp timestamps to qmp test and qga test [Daniel] - adjust asserts in test-qmp-cmds according to the new number of returning keys v2->v3: - fix typo "timestaps -> timestamps" [Marc-André] docs/interop/qmp-spec.txt | 28 ++-- qapi/qmp-dispatch.c| 18 ++ tests/qtest/qmp-test.c | 34 ++ tests/unit/test-qga.c | 31 +++ tests/unit/test-qmp-cmds.c | 4 ++-- 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt index b0e8351d5b261..2e0b7de0c4dc7 100644 --- a/docs/interop/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt @@ -158,7 +158,9 @@ responses that have an unknown "id" field. The format of a success response is: -{ "return": json-value, "id": json-value } +{ "return": json-value, "id": json-value, + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -169,13 +171,25 @@ The format of a success response is: command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) 2.4.2 error --- The format of an error response is: -{ "error": { "class": json-string, "desc": json-string }, "id": json-value } +{ "error": { "class": json-string, "desc": json-string }, "id": json-value + "start": {"seconds": json-value, "microseconds": json-value}, + "end": {"seconds": json-value, "microseconds": json-value} } Where, @@ -184,6 +198,16 @@ The format of an error response is: not attempt to parse this message. - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. 
This excludes any time the + command request spent queued, after reading it off the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) +- The "end" member contains the exact time of when the server + finished executing the command. This excludes any time the + command response spent queued, waiting to be sent on the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) NOTE: Some errors can occur before the Server is able to read the "id" member, in these cases the "id" member will not be part of the error response, even diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 0990873ec8ec1..fce8
Re: [PATCH v3] qapi/qmp: Add timestamps to qmp command responses
On 13.10.2022 18:00, Markus Armbruster wrote: Denis Plotnikov writes: Add "start" & "end" time values to qmp command responses. Please spell it QMP. More of the same below. ok Can you tell me about a problem you cracked (or could have cracked) with the help of this? We have a management layer which interacts with qemu via qmp. When it issues a qmp command we measure the execution time it takes to perform a certain qmp command. Some of those commands seem to execute longer than expected. In that case there is a question of what part of the command execution takes the majority of the time. Is it a flaw in the management layer, in qemu qmp command scheduling, or in the qmp command execution itself? The timestamps being added help to exclude the qmp command execution time from the question. Also, the timestamps help to know the exact time when the command started and ended and to put that information into system logs properly. "return": {"status": "running", "singlestep": false, "running": true}} The responce of the qmp command contains the start & end time of response ok the qmp command processing. Suggested-by: Andrey Ryabinin Signed-off-by: Denis Plotnikov Reviewed-by: Daniel P. Berrangé Please spell out that this affects both QMP and qemu-ga. ok command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client +- The "start" member contains the exact time of when the server + started executing the command. This excludes any time the + command request spent queued, after reading it off the wire. + It is a fixed json-object with time in seconds and microseconds + relative to the Unix Epoch (1 Jan 1970) What's a "fixed json-object"? Hmm, I guess you're copying from the description of event member "timestamp". That's right Let's go with "a json-object with the number of seconds and microseconds since the Unix epoch" everywhere. ok Make this int64_t, because that's what g_get_real_time() returns. Same for add_timestamps() parameters. ok, will fix the type everywhere +qobject_unref(resp); I'd be tempted to fold this into existing tests. Do you want me to put timestamp checking into an existing testcase?
Thanks, Denis + qtest_quit(qts); } diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c index b4e0a145737d1..18ec9bac3650e 100644 --- a/tests/unit/test-qga.c +++ b/tests/unit/test-qga.c @@ -217,6 +217,36 @@ static void test_qga_ping(gconstpointer fix) qmp_assert_no_error(ret); } +static void test_qga_timestamps(gconstpointer fix) +{ +QDict *start, *end; +uint64_t start_s, start_us, end_s, end_us, start_ts, end_ts; +const TestFixture *fixture = fix; +g_autoptr(QDict) ret = NULL; + +ret = qmp_fd(fixture->fd, "{'execute': 'guest-ping'}"); +g_assert_nonnull(ret); +qmp_assert_no_error(ret); + +start = qdict_get_qdict(ret, "start"); +g_assert(start); +end = qdict_get_qdict(ret, "end"); +g_assert(end); + +start_s = qdict_get_try_int(start, "seconds", 0); +g_assert(start_s); +start_us = qdict_get_try_int(start, "microseconds", 0); + +end_s = qdict_get_try_int(end, "seconds", 0); +g_assert(end_s); +end_us = qdict_get_try_int(end, "microseconds", 0); + +start_ts = (start_s * G_USEC_PER_SEC) + start_us; +end_ts = (end_s * G_USEC_PER_SEC) + end_us; + +g_assert(end_ts > start_ts); +} + static void test_qga_id(gconstpointer fix) { const TestFixture *fixture = fix; @@ -948,6 +978,7 @@ int main(int argc, char **argv) g_test_add_data_func("/qga/sync-delimited", &fix, test_qga_sync_delimited); g_test_add_data_func("/qga/sync", &fix, test_qga_sync); g_test_add_data_func("/qga/ping", &fix, test_qga_ping); +g_test_add_data_func("/qga/timestamps", &fix, test_qga_timestamps); g_test_add_data_func("/qga/info", &fix, test_qga_info); g_test_add_data_func("/qga/network-get-interfaces", &fix, test_qga_network_get_interfaces); diff --git a/tests/unit/test-qmp-cmds.c b/tests/unit/test-qmp-cmds.c index 6085c099950b5..54d63bb8e346f 100644 --- a/tests/unit/test-qmp-cmds.c +++ b/tests/unit/test-qmp-cmds.c @@ -154,7 +154,7 @@ static QObject *do_qmp_dispatch(bool allow_oob, const char *template, ...) g_assert(resp); ret = qdict_get(resp, "return"); g_assert(ret); -g_assert(qdict_size(resp) == 1); +g_assert(qdict_size(resp) == 3); qobject_ref(ret); qobject_unref(resp); @@ -181,7 +181,7 @@ static void do_qmp_dispatch_error(bool allow_oob, ErrorClass cls, ==, QapiErrorClass_str(cls)); g_assert(qdict_get_try_str(error, "desc")); g_assert(qdict_size(error) == 2); -g_assert(qdict_size(resp) == 1); +g_assert(qdict_size(resp) == 3); qobject_unref(resp); qobject_unref(req);
Re: [PATCH v3] qapi/qmp: Add timestamps to qmp command responses
On 14.10.2022 16:19, Daniel P. Berrangé wrote: On Fri, Oct 14, 2022 at 02:57:06PM +0200, Markus Armbruster wrote: Daniel P. Berrangé writes: On Fri, Oct 14, 2022 at 11:31:13AM +0200, Markus Armbruster wrote: Daniel P. Berrangé writes: On Thu, Oct 13, 2022 at 05:00:26PM +0200, Markus Armbruster wrote: Denis Plotnikov writes: Add "start" & "end" time values to qmp command responses. Please spell it QMP. More of the same below. These time values are added to let the qemu management layer get the exact command execution time without any other time variance which might be brought by other parts of the management layer or qemu internals. This is particularly useful for management layer logging for later problem resolving. I'm still having difficulties seeing the value add over existing tracepoints and logging. Can you tell me about a problem you cracked (or could have cracked) with the help of this? Consider your QMP client is logging all commands and replies in its own logfile (libvirt can do this). Having these start/end timestamps included means the QMP client log is self contained. A QMP client can include client-side timestamps in its log. What value is being added by server-side timestamps? According to the commit message, it's for getting "the exact command execution time without any other time variance which might be brought by other parts of management layer or qemu internals." Why is that useful? In particular, why is excluding network and QEMU queueing delays (inbound and outbound) useful? Let's say some command normally runs in ~100ms, but occasionally runs in 2 secs, and you want to understand why. A first step is understanding whether a given command itself is slow at executing, or whether its execution has merely been delayed because some other aspect of QEMU has delayed its execution. If the server timestamps show it was very fast, then that indicates delayed processing. Thus instead of debugging the slow command, I can think about what scenarios would be responsible for the delay. Perhaps a previous QMP command was very slow, or maybe there is simply a large volume of QMP commands backlogged, or some part of QEMU got blocked. Another case would be a command that is normally fast, and sometimes is slower, but still relatively fast. The network and queueing side might be a significant enough proportion of the total time to obscure the slowdown. If you can eliminate the non-execution time, you can see the performance trends over time to spot the subtle slowdowns and detect abnormal behaviour before it becomes too terrible. This is troubleshooting. Asking for better troubleshooting tools is fair. However, the proposed timestamps provide much more limited insight than existing tracepoints. For instance, enabling tracepoints is absolutely great and lets you get a hell of a lot more information, *provided* you are in a position to actually use tracepoints. This is, unfortunately, frequently not the case when supporting real world production deployments. Exactly!!! Thanks for pointing that out! Bug reports from customers typically include little more than a log file they got from the mgmt client at the time the problem happened. The problem experienced may no longer exist, so asking them to run a tracepoint script is not possible. They may also be reluctant to actually run tracepoint scripts on a production system, or simply lack the ability to do so at all, due to constraints of the deployment environment.
Logs from libvirt are something that is collected by default for many mgmt apps, or can be turned on by the user with minimal risk of disruption. Overall, there's a compelling desire to be proactive in collecting information ahead of time that might be useful in diagnosing future bug reports. This is the main reason. When you encounter a problem, one of the first questions is "Was there something similar in the past?". Another question is how often it happens. With the timestamps, answering these questions becomes easier. Another thing is that with the qmp command timestamps you can build a monitoring system which will report the cases when execution_time_from_mgmt_perspective - execution_time_qmp_command > some_threshold, which in turn proactively tells you about potential problems. And then you'll start using the qmp tracepoints (and other means) to figure out the real reason for the execution time variance. Thanks, Denis So it isn't an 'either / or' decision of QMP reply logs vs use of tracepoints, both are beneficial, with their own pros/cons. With regards, Daniel
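A minimal sketch of the management-side check described above; all names and the threshold are illustrative, and all times are in microseconds:

#include <stdbool.h>
#include <stdint.h>

/* mgmt_elapsed_us is measured by the management layer around the whole
 * request/response round trip; cmd_start_us and cmd_end_us come from the
 * "start" and "end" members of the QMP response. */
static bool dispatch_delay_suspicious(int64_t mgmt_elapsed_us,
                                      int64_t cmd_start_us,
                                      int64_t cmd_end_us,
                                      int64_t threshold_us)
{
    int64_t qemu_exec_us = cmd_end_us - cmd_start_us;
    int64_t other_us = mgmt_elapsed_us - qemu_exec_us;

    /* a large remainder points at queueing/dispatching in qemu or at the
     * management layer itself rather than at the command handler */
    return other_us > threshold_us;
}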
Re: [PATCH v2] virtio: add VIRTQUEUE_ERROR QAPI event
Reviewed-by: Denis Plotnikov On 9/12/23 20:57, Vladimir Sementsov-Ogievskiy wrote: For now we only log the vhost device error, when virtqueue is actually stopped. Let's add a QAPI event, which makes possible: - collect statistics of such errors - make immediate actions: take core dumps or do some other debugging - inform the user through a management API or UI, so that (s)he can react somehow, e.g. reset the device driver in the guest or even build up some automation to do so Note that basically every inconsistency discovered during virtqueue processing results in a silent virtqueue stop. The guest then just sees the requests getting stuck somewhere in the device for no visible reason. This event provides a means to inform the management layer of this situation in a timely fashion. The event could be reused for some other virtqueue problems (not only for vhost devices) in future. For this it gets a generic name and structure. We keep original VHOST_OPS_DEBUG(), to keep original debug output as is here, it's not the only call to VHOST_OPS_DEBUG in the file. Signed-off-by: Vladimir Sementsov-Ogievskiy --- v2: - improve commit message (just stole wording by Roman, hope he don't mind:) - add event throttling hw/virtio/vhost.c | 12 +--- monitor/monitor.c | 10 ++ qapi/qdev.json| 25 + 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index e2f6ffb446..162899feee 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -15,6 +15,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" +#include "qapi/qapi-events-qdev.h" #include "hw/virtio/vhost.h" #include "qemu/atomic.h" #include "qemu/range.h" @@ -1332,11 +1333,16 @@ static void vhost_virtqueue_error_notifier(EventNotifier *n) struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, error_notifier); struct vhost_dev *dev = vq->dev; -int index = vq - dev->vqs; if (event_notifier_test_and_clear(n) && dev->vdev) { -VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", -dev->vq_index + index); +int ind = vq - dev->vqs + dev->vq_index; +DeviceState *ds = &dev->vdev->parent_obj; + +VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", ind); +qapi_event_send_virtqueue_error(ds->id, ds->canonical_path, ind, +VIRTQUEUE_ERROR_VHOST_VRING_ERR, +"vhost reported failure through vring " +"error fd"); } } diff --git a/monitor/monitor.c b/monitor/monitor.c index 941f87815a..cb1ee31156 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -313,6 +313,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = { [QAPI_EVENT_BALLOON_CHANGE]= { 1000 * SCALE_MS }, [QAPI_EVENT_QUORUM_REPORT_BAD] = { 1000 * SCALE_MS }, [QAPI_EVENT_QUORUM_FAILURE]= { 1000 * SCALE_MS }, +[QAPI_EVENT_VIRTQUEUE_ERROR] = { 1000 * SCALE_MS }, [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS }, }; @@ -497,6 +498,10 @@ static unsigned int qapi_event_throttle_hash(const void *key) hash += g_str_hash(qdict_get_str(evstate->data, "qom-path")); } +if (evstate->event == QAPI_EVENT_VIRTQUEUE_ERROR) { +hash += g_str_hash(qdict_get_str(evstate->data, "device")); +} + return hash; } @@ -524,6 +529,11 @@ static gboolean qapi_event_throttle_equal(const void *a, const void *b) qdict_get_str(evb->data, "qom-path")); } +if (eva->event == QAPI_EVENT_VIRTQUEUE_ERROR) { +return !strcmp(qdict_get_str(eva->data, "device"), + qdict_get_str(evb->data, "device")); +} + return TRUE; } diff --git a/qapi/qdev.json b/qapi/qdev.json index 6bc5a733b8..199e21cae7 
100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -161,3 +161,28 @@ ## { 'event': 'DEVICE_UNPLUG_GUEST_ERROR', 'data': { '*device': 'str', 'path': 'str' } } + +## +# @VirtqueueError: +# +# Since: 8.2 +## +{ 'enum': 'VirtqueueError', + 'data': [ 'vhost-vring-err' ] } + +## +# @VIRTQUEUE_ERROR: +# +# Emitted when a device virtqueue fails in runtime. +# +# @device: the device's ID if it has one +# @path: the device's QOM path +# @virtqueue: virtqueue index +# @error: error identifier +# @description: human readable description +# +# Since: 8.2 +## +{ 'event': 'VIRTQUEUE_ERROR', + 'data': { '*device': 'str', 'path': 'str', 'virtqueue': 'int', +'error': 'VirtqueueError', 'description': 'str'} }
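For illustration, an event emitted under the schema above might look like this on the wire; the device id, QOM path and timestamp values are made up, the description string is the one used in the code:

{"event": "VIRTQUEUE_ERROR",
 "data": {"device": "vublk0",
          "path": "/machine/peripheral/vublk0/virtio-backend",
          "virtqueue": 2,
          "error": "vhost-vring-err",
          "description": "vhost reported failure through vring error fd"},
 "timestamp": {"seconds": 1694539200, "microseconds": 123456}}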
[Qemu-devel] [PATCH] kvmclock: update system_time_msr address forcibly
Do an update of the system_time_msr address every time before reading the value of tsc_timestamp from the guest's kvmclock page. It should be done in a forcible manner because there is a situation when system_time_msr has been set by kvm but qemu isn't aware of it. This leads to updates of kvmclock_offset without respect to the guest's kvmclock values. The situation appears when an L2 linux guest runs over an L1 linux guest and the action inducing the system_time_msr update is TPR access reporting. Some L1 linux guests turn off processing of TPR accesses, and when L0 gets an L2 exit induced by a TPR MSR access it doesn't enter L1 and processes it by itself. Thus, L1 kvm doesn't know about that TPR access happening and doesn't exit to qemu, which in turn doesn't set the system_time_msr address. This patch fixes this by making sure qemu knows the correct address every time it is needed. Signed-off-by: Denis Plotnikov --- hw/i386/kvm/clock.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index e713162..035196a 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -48,11 +48,38 @@ struct pvclock_vcpu_time_info { uint8_tpad[2]; } __attribute__((__packed__)); /* 32 bytes */ +static void update_all_system_time_msr(void) +{ +CPUState *cpu; +CPUX86State *env; +struct { +struct kvm_msrs info; +struct kvm_msr_entry entries[1]; +} msr_data; +int ret; + +msr_data.info.nmsrs = 1; +msr_data.entries[0].index = MSR_KVM_SYSTEM_TIME; + +CPU_FOREACH(cpu) { +ret = kvm_vcpu_ioctl(cpu, KVM_GET_MSRS, &msr_data); + +if (ret < 0) { +fprintf(stderr, "KVM_GET_MSRS failed: %s\n", strerror(ret)); +abort(); +} + +assert(ret == 1); +env = cpu->env_ptr; +env->system_time_msr = msr_data.entries[0].data; +} +} + static uint64_t kvmclock_current_nsec(KVMClockState *s) { CPUState *cpu = first_cpu; CPUX86State *env = cpu->env_ptr; -hwaddr kvmclock_struct_pa = env->system_time_msr & ~1ULL; +hwaddr kvmclock_struct_pa; uint64_t migration_tsc = env->tsc; struct pvclock_vcpu_time_info time; uint64_t delta; @@ -60,6 +87,9 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) uint64_t nsec_hi; uint64_t nsec; +update_all_system_time_msr(); +kvmclock_struct_pa = env->system_time_msr & ~1ULL; + if (!(env->system_time_msr & 1ULL)) { /* KVM clock not active */ return 0; -- 2.7.4
Re: [Qemu-devel] [PATCH] kvmclock: update system_time_msr address forcibly
On 24.05.2017 17:09, Denis V. Lunev wrote: On 05/24/2017 05:07 PM, Denis Plotnikov wrote: Do an update of system_time_msr address every time before reading the value of tsc_timestamp from guest's kvmclock page. It should be done in a forcible manner because there is a situation when system_time_msr has been set by kvm but qemu doesn't aware of it. This leads to updates of kvmclock_offset without respect of guest's kvmclock values. The situation appears when L2 linux guest runs over L1 linux guest and the action inducing system_time_msr update is tpr access reporting. Some L1 linux guests turn off processing TPR access and when L0 gets an L2 exit induced by TPR MSR access it doesn't enter L1 and processed it by itself. Thus, L1 kvm doesn't know about that TPR access happening and doesn't exit to qemu which in turn doesn't set system_time_msr address. This patch fixes this by making sure it knows the correct address every time it is needed. Signed-off-by: Denis Plotnikov --- hw/i386/kvm/clock.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index e713162..035196a 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -48,11 +48,38 @@ struct pvclock_vcpu_time_info { uint8_tpad[2]; } __attribute__((__packed__)); /* 32 bytes */ +static void update_all_system_time_msr(void) +{ +CPUState *cpu; +CPUX86State *env; +struct { +struct kvm_msrs info; +struct kvm_msr_entry entries[1]; +} msr_data; +int ret; + +msr_data.info.nmsrs = 1; +msr_data.entries[0].index = MSR_KVM_SYSTEM_TIME; + +CPU_FOREACH(cpu) { +ret = kvm_vcpu_ioctl(cpu, KVM_GET_MSRS, &msr_data); + +if (ret < 0) { +fprintf(stderr, "KVM_GET_MSRS failed: %s\n", strerror(ret)); +abort(); +} + +assert(ret == 1); +env = cpu->env_ptr; +env->system_time_msr = msr_data.entries[0].data; +} +} + static uint64_t kvmclock_current_nsec(KVMClockState *s) { CPUState *cpu = first_cpu; CPUX86State *env = cpu->env_ptr; -hwaddr kvmclock_struct_pa = env->system_time_msr & ~1ULL; +hwaddr kvmclock_struct_pa; uint64_t migration_tsc = env->tsc; struct pvclock_vcpu_time_info time; uint64_t delta; @@ -60,6 +87,9 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) uint64_t nsec_hi; uint64_t nsec; +update_all_system_time_msr(); +kvmclock_struct_pa = env->system_time_msr & ~1ULL; + should we do this once/per guest boot? practically - yes. I can barely imagine that the pv_clock page address may be changed after being set once. But we don't know the exact moment when the guest is going to write it. And not to be dependent of any other event I decided to check it every time before using since it won't make any performance issues because this invocation happens on vm state changes only. Den if (!(env->system_time_msr & 1ULL)) { /* KVM clock not active */ return 0; -- Best, Denis
[Qemu-devel] [PATCH v2] kvmclock: update system_time_msr address forcibly
Update the system_time_msr address every time before reading the value of tsc_timestamp from the guest's kvmclock page. There are no other code paths which ensure that qemu has an up-to-date value of system_time_msr, so force this update whenever the guest's tsc_timestamp is read. The bug affects nested setups which turn off TPR access interception for L2 guests, where that access, being intercepted by L0, doesn't show up in L1. Linux bootstrap initializes kvmclock before the APIC, which causes a TPR access. That's why the effect of the bug is not revealed on L1 guests that keep TPR interception turned on for L2. This patch fixes the problem by making sure qemu knows the correct system_time_msr address every time it is needed. Signed-off-by: Denis Plotnikov --- hw/i386/kvm/clock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 0f75dd3..875d85f 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -61,6 +61,8 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) uint64_t nsec_hi; uint64_t nsec; +cpu_synchronize_state(cpu); + if (!(env->system_time_msr & 1ULL)) { /* KVM clock not active */ return 0; -- 2.7.4
[Qemu-devel] [PATCH v3] kvmclock: update system_time_msr address forcibly
Update the system_time_msr address every time before reading the value of tsc_timestamp from the guest's kvmclock page. There are no other code paths which ensure that qemu has an up-to-date value of system_time_msr, so force this update whenever the guest's tsc_timestamp is read. The bug affects nested setups which turn off TPR access interception for L2 guests, where that access, being intercepted by L0, doesn't show up in L1. Linux bootstrap initializes kvmclock before the APIC, which causes a TPR access. That's why the effect of the bug is not revealed on L1 guests that keep TPR interception turned on for L2. This patch fixes the problem by making sure qemu knows the correct system_time_msr address every time it is needed. Signed-off-by: Denis Plotnikov --- hw/i386/kvm/clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 13eca37..363d1b5 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -19,6 +19,7 @@ #include "qemu/host-utils.h" #include "sysemu/sysemu.h" #include "sysemu/kvm.h" +#include "sysemu/hw_accel.h" #include "kvm_i386.h" #include "hw/sysbus.h" #include "hw/kvm/clock.h" @@ -69,6 +70,8 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) uint64_t nsec_hi; uint64_t nsec; +cpu_synchronize_state(cpu); + if (!(env->system_time_msr & 1ULL)) { /* KVM clock not active */ return 0; -- 2.7.4
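Both the v1 and the v2/v3 approaches consume env->system_time_msr the same way: bit 0 is the enable flag and the remaining bits are the guest-physical address of the pvclock structure, which is why the code checks system_time_msr & 1ULL and masks with ~1ULL. A tiny illustration with made-up values (the address is not from any real guest):

#include <stdint.h>
#include <stdio.h>

/*
 * The guest enables kvmclock by writing the guest-physical address of its
 * pvclock_vcpu_time_info page, with bit 0 set, to MSR_KVM_SYSTEM_TIME.
 */
int main(void)
{
    uint64_t pvclock_gpa = 0x12340000;        /* illustrative guest address */
    uint64_t msr_val = pvclock_gpa | 1ULL;    /* bit 0 = clock enabled */

    int enabled = msr_val & 1ULL;             /* the check in kvmclock_current_nsec() */
    uint64_t struct_pa = msr_val & ~1ULL;     /* kvmclock_struct_pa */

    printf("enabled=%d, pvclock struct at 0x%llx\n",
           enabled, (unsigned long long)struct_pa);
    return 0;
}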
[Qemu-devel] [PATCH] i386: turn off l3-cache property by default
Commit 14c985cffa "target-i386: present virtual L3 cache info for vcpus" introduced the l3-cache property and enabled it by default, exposing an L3 cache to the guest. The motivation behind it was that in the Linux scheduler, when waking up a task on a sibling CPU, the task was put onto the target CPU's runqueue directly, without sending a reschedule IPI. The reduction in the IPI count led to a performance gain. However, this isn't the whole story. Once the task is on the target CPU's runqueue, it may have to preempt the current task on that CPU, be it the idle task putting the CPU to sleep or just another running task. For that a reschedule IPI will have to be issued, too. Only when the other CPU has been running a normal task for too little time do the fairness constraints prevent the preemption, and thus the IPI. This boils down to the improvement being only achievable in workloads with many actively switching tasks. We had no access to the (proprietary?) SAP HANA benchmark the commit referred to, but the pattern is also reproduced with "perf bench sched messaging -g 1" on a 1-socket, 8-core vCPU topology; we see indeed:
l3-cache    #res IPI /s    #time / 1 loops
off         560K           1.8 sec
on          40K            0.9 sec
Now there's a downside: with L3 cache the Linux scheduler is more eager to wake up tasks on sibling CPUs, resulting in unnecessary cross-vCPU interactions and therefore excessive halts and IPIs. E.g. "perf bench sched pipe -i 10" gives
l3-cache    #res IPI /s    #HLT /s    #time /10 loops
off         200 (no K)     230        0.2 sec
on          400K           330K       0.5 sec
In a more realistic test, we observe a 15% degradation in VM density (measured as the number of VMs, each running Drupal CMS serving 2 http requests per second to its main page, with 95%-percentile response latency under 100 ms) with l3-cache=on. We think that the mostly-idle scenario is more common in cloud and personal usage and should be optimized for by default; users of highly loaded VMs should be able to tune them up themselves. So switch l3-cache off by default, and add a compat clause for the range of machine types where it was on. Signed-off-by: Denis Plotnikov Reviewed-by: Roman Kagan --- include/hw/i386/pc.h | 7 ++- target/i386/cpu.c| 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 087d184..1d2dcae 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -375,7 +375,12 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); .driver = TYPE_X86_CPU,\ .property = "x-hv-max-vps",\ .value= "0x40",\ -}, +},\ +{\ +.driver = TYPE_X86_CPU,\ +.property = "l3-cache",\ +.value= "on",\ +},\ #define PC_COMPAT_2_9 \ HW_COMPAT_2_9 \ diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 1edcf29..95a51bd 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4154,7 +4154,7 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor_id), DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true), DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false), -DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, true), +DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, false), DEFINE_PROP_BOOL("kvm-no-smi-migration", X86CPU, kvm_no_smi_migration, false), DEFINE_PROP_BOOL("vmware-cpuid-freq", X86CPU, vmware_cpuid_freq, true), -- 2.7.4
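Whether the guest actually sees the virtual L3 cache can be checked from inside the guest by enumerating CPUID leaf 4 (deterministic cache parameters). The following sketch is illustrative only; it assumes an x86 guest and a recent GCC/Clang providing __get_cpuid_count() in <cpuid.h>. With l3-cache=off no level-3 entry should be reported.

#include <cpuid.h>
#include <stdio.h>

/* Enumerate CPUID leaf 4 subleaves and print each advertised cache level. */
int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    for (unsigned int i = 0; ; i++) {
        if (!__get_cpuid_count(4, i, &eax, &ebx, &ecx, &edx)) {
            break;                      /* leaf 4 not supported */
        }
        unsigned int type = eax & 0x1f; /* 0 means no more cache levels */
        if (type == 0) {
            break;
        }
        unsigned int level = (eax >> 5) & 0x7;
        unsigned int line  = (ebx & 0xfff) + 1;
        unsigned int parts = ((ebx >> 12) & 0x3ff) + 1;
        unsigned int ways  = ((ebx >> 22) & 0x3ff) + 1;
        unsigned int sets  = ecx + 1;
        printf("L%u %s cache: %u KiB\n", level,
               type == 1 ? "data" : type == 2 ? "instruction" : "unified",
               ways * parts * line * sets / 1024);
    }
    return 0;
}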
[Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
At the time, the "drained section" doesn't protect Block Driver State from the requests appearing in the vCPU threads. This could lead to the data loss because of request coming to an unexpected BDS. For example, when a request comes to ide controller from the guest, the controller creates a request coroutine and executes the coroutine in the vCPU thread. If another thread(iothread) has entered the "drained section" on a BDS with bdrv_drained_begin, which protects BDS' AioContext from external requests, and released the AioContext because of finishing some coroutine by the moment of the request appearing at the ide controller, the controller acquires the AioContext and executes its request without any respect to the entered "drained section" producing any kinds of data inconsistency. The patch prevents this case by putting requests from external threads to the queue on AioContext while the context is protected for external requests and executes those requests later on the external requests protection removing. Also, the patch marks requests generated in a vCPU thread as external ones to make use of the request postponing. How to reproduce: 1. start vm with an ide disk and a linux guest 2. in the guest run: dd if=... of=... bs=4K count=10 oflag=direct 3. (qemu) drive_mirror "disk-name" 4. wait until block job can receive block_job_complete 5. (qemu) block_job_complete "disk-name" 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race) Signed-off-by: Denis Plotnikov --- block/block-backend.c | 31 +++ block/io.c| 3 ++- dma-helpers.c | 4 ++-- hw/block/nvme.c | 8 hw/block/xen_disk.c | 8 hw/ide/core.c | 6 -- hw/scsi/scsi-disk.c | 10 ++ include/block/aio.h | 37 - include/block/block.h | 8 +++- util/async.c | 2 ++ 10 files changed, 90 insertions(+), 27 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 60d37a0c3d..10f7dd357d 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1148,6 +1148,8 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, return 0; } +static void coroutine_fn blk_postpone_request(BlockBackend *blk); + int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) @@ -1157,6 +1159,10 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, trace_blk_co_preadv(blk, bs, offset, bytes, flags); +if ((flags & BDRV_REQ_EXTERNAL)) { +blk_postpone_request(blk); +} + ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; @@ -1184,6 +1190,10 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, trace_blk_co_pwritev(blk, bs, offset, bytes, flags); +if ((flags & BDRV_REQ_EXTERNAL)) { +blk_postpone_request(blk); +} + ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; @@ -1304,6 +1314,27 @@ static void blk_dec_in_flight(BlockBackend *blk) aio_wait_kick(); } +static void coroutine_fn blk_postpone_request(BlockBackend *blk) +{ +AioContext *ctx; + +assert(qemu_in_coroutine()); +ctx = qemu_coroutine_get_aio_context(qemu_coroutine_self()); + +/* + * Put the request to the postponed queue if + * external requests is not allowed currenly + * The request is continued when the context + * leaves the bdrv "drained" section allowing + * external requests + */ +if (aio_external_disabled(ctx)) { +blk_dec_in_flight(blk); +qemu_co_queue_wait(&ctx->postponed_reqs, NULL); +blk_inc_in_flight(blk); +} +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; 
diff --git a/block/io.c b/block/io.c index bd9d688f8b..019da464a2 100644 --- a/block/io.c +++ b/block/io.c @@ -1318,7 +1318,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, * potential fallback support, if we ever implement any read flags * to pass through to drivers. For now, there aren't any * passthrough flags. */ -assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); +assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ | + BDRV_REQ_EXTERNAL))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { diff --git a/dma-helpers.c b/dma-helpers.c index 2d7e02d35e..53706031c5 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -235,7 +235,7 @@ BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov, voi
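The mechanism the patch builds around BDRV_REQ_EXTERNAL and ctx->postponed_reqs boils down to "queue requests while quiesced, flush them when the drained section ends". The following is a minimal single-threaded sketch of that bookkeeping in plain C, with made-up names and without coroutines or locking, just to make the control flow explicit; it is not the QEMU implementation.

#include <stdio.h>
#include <stdlib.h>

/* A deferred request: just a callback plus its argument. */
typedef struct Req {
    void (*fn)(void *opaque);
    void *opaque;
    struct Req *next;
} Req;

typedef struct {
    int quiesce_counter;   /* >0 means "drained": external requests must wait */
    Req *head, **tail;
} Backend;

static void backend_init(Backend *b)
{
    b->quiesce_counter = 0;
    b->head = NULL;
    b->tail = &b->head;
}

static void drained_begin(Backend *b) { b->quiesce_counter++; }

static void drained_end(Backend *b)
{
    if (--b->quiesce_counter == 0) {
        /* Leaving the drained section: run everything that was postponed. */
        while (b->head) {
            Req *r = b->head;
            b->head = r->next;
            r->fn(r->opaque);
            free(r);
        }
        b->tail = &b->head;
    }
}

/* Submit a request: run it now, or queue it if the backend is drained. */
static void submit(Backend *b, void (*fn)(void *), void *opaque)
{
    if (b->quiesce_counter > 0) {
        Req *r = malloc(sizeof(*r));
        r->fn = fn;
        r->opaque = opaque;
        r->next = NULL;
        *b->tail = r;
        b->tail = &r->next;
        return;
    }
    fn(opaque);
}

static void do_write(void *opaque) { printf("write %s executed\n", (char *)opaque); }

int main(void)
{
    Backend b;
    backend_init(&b);

    drained_begin(&b);
    submit(&b, do_write, "A");     /* postponed: we are inside the drained section */
    printf("drained section active, nothing written yet\n");
    drained_end(&b);               /* now "write A executed" is printed */

    submit(&b, do_write, "B");     /* executed immediately */
    return 0;
}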
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
On 07.12.2018 15:26, Kevin Wolf wrote: > Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben: >> At the time, the "drained section" doesn't protect Block Driver State >> from the requests appearing in the vCPU threads. >> This could lead to the data loss because of request coming to >> an unexpected BDS. >> >> For example, when a request comes to ide controller from the guest, >> the controller creates a request coroutine and executes the coroutine >> in the vCPU thread. If another thread(iothread) has entered the >> "drained section" on a BDS with bdrv_drained_begin, which protects >> BDS' AioContext from external requests, and released the AioContext >> because of finishing some coroutine by the moment of the request >> appearing at the ide controller, the controller acquires the AioContext >> and executes its request without any respect to the entered >> "drained section" producing any kinds of data inconsistency. >> >> The patch prevents this case by putting requests from external threads to >> the queue on AioContext while the context is protected for external requests >> and executes those requests later on the external requests protection >> removing. In general, I agree with the comments and going to make changes in the patches accordingly. Also, I'd like to ask a question below >> >> Also, the patch marks requests generated in a vCPU thread as external ones >> to make use of the request postponing. >> >> How to reproduce: >> 1. start vm with an ide disk and a linux guest >> 2. in the guest run: dd if=... of=... bs=4K count=10 oflag=direct >> 3. (qemu) drive_mirror "disk-name" >> 4. wait until block job can receive block_job_complete >> 5. (qemu) block_job_complete "disk-name" >> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race) >> >> Signed-off-by: Denis Plotnikov > > This is getting closer, but I'd like to see two more major changes: > >> diff --git a/include/block/aio.h b/include/block/aio.h >> index 0ca25dfec6..8512bda44e 100644 >> --- a/include/block/aio.h >> +++ b/include/block/aio.h >> @@ -19,6 +19,7 @@ >> #include "qemu/event_notifier.h" >> #include "qemu/thread.h" >> #include "qemu/timer.h" >> +#include "qemu/coroutine.h" >> >> typedef struct BlockAIOCB BlockAIOCB; >> typedef void BlockCompletionFunc(void *opaque, int ret); >> @@ -130,6 +131,11 @@ struct AioContext { >> QEMUTimerListGroup tlg; >> >> int external_disable_cnt; >> +/* Queue to store the requests coming when the context is disabled for >> + * external requests. >> + * Don't use a separate lock for protection relying the context lock >> + */ >> +CoQueue postponed_reqs; > > Why involve the AioContext at all? This could all be kept at the > BlockBackend level without extending the layering violation that > aio_disable_external() is. > > BlockBackends get notified when their root node is drained, so hooking > things up there should be as easy, if not even easier than in > AioContext. > >> /* Number of AioHandlers without .io_poll() */ >> int poll_disable_cnt; >> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx, >>*/ >> int64_t aio_compute_timeout(AioContext *ctx); >> >> +/** >> + * aio_co_enter: >> + * @ctx: the context to run the coroutine >> + * @co: the coroutine to run >> + * >> + * Enter a coroutine in the specified AioContext. 
>> + */ >> +void aio_co_enter(AioContext *ctx, struct Coroutine *co); >> + >> /** >>* aio_disable_external: >>* @ctx: the aio context >> @@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx); >>*/ >> static inline void aio_disable_external(AioContext *ctx) >> { >> +aio_context_acquire(ctx); >> atomic_inc(&ctx->external_disable_cnt); >> +aio_context_release(ctx); >> } > > This acquire/release pair looks rather useless? I'm not sure that I understand everything correctly... but can a thread (context) try to disable external in another context? > >> +static void run_postponed_co(void *opaque) >> +{ >> +AioContext *ctx = (AioContext *) opaque; >> + >> +qemu_co_queue_restart_all(&ctx->postponed_reqs); >> +} >> /** >>* aio_enable_external: >>* @ctx: the aio context >> @@ -504,12 +527,17 @@ static inline voi
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
On 07.12.2018 15:26, Kevin Wolf wrote: > Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben: >> At the time, the "drained section" doesn't protect Block Driver State >> from the requests appearing in the vCPU threads. >> This could lead to the data loss because of request coming to >> an unexpected BDS. >> >> For example, when a request comes to ide controller from the guest, >> the controller creates a request coroutine and executes the coroutine >> in the vCPU thread. If another thread(iothread) has entered the >> "drained section" on a BDS with bdrv_drained_begin, which protects >> BDS' AioContext from external requests, and released the AioContext >> because of finishing some coroutine by the moment of the request >> appearing at the ide controller, the controller acquires the AioContext >> and executes its request without any respect to the entered >> "drained section" producing any kinds of data inconsistency. >> >> The patch prevents this case by putting requests from external threads to >> the queue on AioContext while the context is protected for external requests >> and executes those requests later on the external requests protection >> removing. >> >> Also, the patch marks requests generated in a vCPU thread as external ones >> to make use of the request postponing. >> >> How to reproduce: >> 1. start vm with an ide disk and a linux guest >> 2. in the guest run: dd if=... of=... bs=4K count=10 oflag=direct >> 3. (qemu) drive_mirror "disk-name" >> 4. wait until block job can receive block_job_complete >> 5. (qemu) block_job_complete "disk-name" >> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race) >> >> Signed-off-by: Denis Plotnikov > > This is getting closer, but I'd like to see two more major changes: > >> diff --git a/include/block/aio.h b/include/block/aio.h >> index 0ca25dfec6..8512bda44e 100644 >> --- a/include/block/aio.h >> +++ b/include/block/aio.h >> @@ -19,6 +19,7 @@ >> #include "qemu/event_notifier.h" >> #include "qemu/thread.h" >> #include "qemu/timer.h" >> +#include "qemu/coroutine.h" >> >> typedef struct BlockAIOCB BlockAIOCB; >> typedef void BlockCompletionFunc(void *opaque, int ret); >> @@ -130,6 +131,11 @@ struct AioContext { >> QEMUTimerListGroup tlg; >> >> int external_disable_cnt; >> +/* Queue to store the requests coming when the context is disabled for >> + * external requests. >> + * Don't use a separate lock for protection relying the context lock >> + */ >> +CoQueue postponed_reqs; > > Why involve the AioContext at all? This could all be kept at the > BlockBackend level without extending the layering violation that > aio_disable_external() is. > > BlockBackends get notified when their root node is drained, so hooking > things up there should be as easy, if not even easier than in > AioContext. Just want to make sure that I understood correctly what you meant by "BlockBackends get notified". Did you mean that bdrv_drain_end calls child's role callback blk_root_drained_end by calling bdrv_parent_drained_end? In case if it's so, it won't work if resume postponed requests in blk_root_drained_end since we can't know if external is disabled for the context because the counter showing that is decreased only after roles' drained callbacks are finished at bdrv_do_drained_end. Please correct me if I'm wrong. 
Looking at the patch again, I think that it might be useful to keep the requests in the structure that limits their execution and also protects the access (context acquire/release) although it's indeed the layering violation but at least we can store the parts related at the same place and later on move somewhere else alongside the request restrictor. Denis > >> /* Number of AioHandlers without .io_poll() */ >> int poll_disable_cnt; >> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx, >>*/ >> int64_t aio_compute_timeout(AioContext *ctx); >> >> +/** >> + * aio_co_enter: >> + * @ctx: the context to run the coroutine >> + * @co: the coroutine to run >> + * >> + * Enter a coroutine in the specified AioContext. >> + */ >> +void aio_co_enter(AioContext *ctx, struct Coroutine *co); >> + >> /** >>* aio_disable_external: >>* @ctx: the aio context >> @@ -491,9 +506,17 @@ int64_t aio_compute_
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
On 12.12.2018 15:24, Kevin Wolf wrote: > Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>> Why involve the AioContext at all? This could all be kept at the >>> BlockBackend level without extending the layering violation that >>> aio_disable_external() is. >>> >>> BlockBackends get notified when their root node is drained, so hooking >>> things up there should be as easy, if not even easier than in >>> AioContext. >> >> Just want to make sure that I understood correctly what you meant by >> "BlockBackends get notified". Did you mean that bdrv_drain_end calls >> child's role callback blk_root_drained_end by calling >> bdrv_parent_drained_end? > > Yes, blk_root_drained_begin/end calls are all you need. Specifically, > their adjustments to blk->quiesce_counter that are already there, and in > the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end() > we can resume the queued requests. Sounds it should be so, but it doesn't work that way and that's why: when doing mirror we may resume postponed coroutines too early when the underlying bs is protected from writing at and thus we encounter the assert on a write request execution at bdrv_co_write_req_prepare when resuming the postponed coroutines. The thing is that the bs is protected for writing before execution of bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls bdrv_replace_child_noperm which, in turn, calls child->role->drained_end where one of the callbacks is blk_root_drained_end which check if(--blk->quiesce_counter == 0) and runs the postponed requests (coroutines) if the coundition is true. In seems that if the external requests disabled on the context we can't rely on anything or should check where the underlying bs and its underlying nodes are ready to receive requests which sounds quite complicated. Please correct me if still don't understand something in that routine. Denis >> In case if it's so, it won't work if resume postponed requests in >> blk_root_drained_end since we can't know if external is disabled for the >> context because the counter showing that is decreased only after roles' >> drained callbacks are finished at bdrv_do_drained_end. >> Please correct me if I'm wrong. > > You don't need to know about the AioContext state, this is the whole > point. blk->quiesce_counter is what tells you whether to postpone > requests. > >> Looking at the patch again, I think that it might be useful to keep the >> requests in the structure that limits their execution and also protects >> the access (context acquire/release) although it's indeed the layering >> violation but at least we can store the parts related at the same place >> and later on move somewhere else alongside the request restrictor. > > You can keep everything you need in BlockBackend (and that's also where > your code is that really postpones request). > > Kevin > -- Best, Denis
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
On 13.12.2018 15:20, Kevin Wolf wrote: > Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben: >> On 12.12.2018 15:24, Kevin Wolf wrote: >>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>>>> Why involve the AioContext at all? This could all be kept at the >>>>> BlockBackend level without extending the layering violation that >>>>> aio_disable_external() is. >>>>> >>>>> BlockBackends get notified when their root node is drained, so hooking >>>>> things up there should be as easy, if not even easier than in >>>>> AioContext. >>>> >>>> Just want to make sure that I understood correctly what you meant by >>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls >>>> child's role callback blk_root_drained_end by calling >>>> bdrv_parent_drained_end? >>> >>> Yes, blk_root_drained_begin/end calls are all you need. Specifically, >>> their adjustments to blk->quiesce_counter that are already there, and in >>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end() >>> we can resume the queued requests. >> Sounds it should be so, but it doesn't work that way and that's why: >> when doing mirror we may resume postponed coroutines too early when the >> underlying bs is protected from writing at and thus we encounter the >> assert on a write request execution at bdrv_co_write_req_prepare when >> resuming the postponed coroutines. >> >> The thing is that the bs is protected for writing before execution of >> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls >> bdrv_replace_child_noperm which, in turn, calls child->role->drained_end >> where one of the callbacks is blk_root_drained_end which check >> if(--blk->quiesce_counter == 0) and runs the postponed requests >> (coroutines) if the coundition is true. > > Hm, so something is messed up with the drain sections in the mirror > driver. We have: > > bdrv_drained_begin(target_bs); > bdrv_replace_node(to_replace, target_bs, &local_err); > bdrv_drained_end(target_bs); > > Obviously, the intention was to keep the BlockBackend drained during > bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0 > inside bdrv_replace_node() when target_bs is drained? > > Looking at bdrv_replace_child_noperm(), it seems that the function has > a bug: Even if old_bs and new_bs are both drained, the quiesce_counter > for the parent reaches 0 for a moment because we call .drained_end for > the old child first and .drained_begin for the new one later. > > So it seems the fix would be to reverse the order and first call > .drained_begin for the new child and then .drained_end for the old > child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too. Yes, it's true, but it's not enough... In mirror_exit_common() we actively manipulate with block driver states. When we replaced a node in the snippet you showed we can't allow the postponed coroutines to run because the block tree isn't ready to receive the requests yet. To be ready, we need to insert a proper block driver state to the block backend which is done here blk_remove_bs(bjob->blk); blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << << bs_opaque->job = NULL; bdrv_drained_end(src); If the tree isn't ready and we resume the coroutines, we'll end up with the request landed in a wrong block driver state. So, we explicitly should stop all activities on all the driver states and its parents and allow the activities when everything is ready to go. 
Why explicitly, because the block driver states may belong to different block backends at the moment of the manipulation beginning. So, it seems we need to disable all their contexts until the manipulation ends. Please, correct me if I'm wrong. > >> In seems that if the external requests disabled on the context we can't >> rely on anything or should check where the underlying bs and its >> underlying nodes are ready to receive requests which sounds quite >> complicated. >> Please correct me if still don't understand something in that routine. > > I think the reason why reyling on aio_disable_external() works is simply > because src is also drained, which keeps external events in the > AioContext disabled despite the bug in draining the target node. > > The bug would become apparent even with aio_disable_external() if we > didn't drain src, or even if we just supported src and target being in > different AioContexts. Why don't we disable all those contexts involved until the end of the block device tree reconstruction? Thanks! Denis > > Kevin > -- Best, Denis
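The ordering problem Kevin describes in bdrv_replace_child_noperm() can be reduced to counter arithmetic: calling .drained_end for the old child before .drained_begin for the new one lets the parent's quiesce counter touch zero, which is exactly the point where queued requests would resume. A toy model of the two orderings (not QEMU code; names are illustrative):

#include <assert.h>
#include <stdio.h>

/* Parent-side counter, as adjusted by blk_root_drained_begin/end. */
static int quiesce_counter;
static int dropped_to_zero;

static void drained_begin(void) { quiesce_counter++; }

static void drained_end(void)
{
    if (--quiesce_counter == 0) {
        dropped_to_zero = 1;   /* this is where queued requests would resume */
    }
}

/* Swap a drained old child for a drained new child, in the given order. */
static int swap_children(int end_old_first)
{
    quiesce_counter = 1;       /* parent currently drained through the old child */
    dropped_to_zero = 0;

    if (end_old_first) {       /* current order in bdrv_replace_child_noperm() */
        drained_end();         /* counter: 1 -> 0, requests resume too early */
        drained_begin();       /* counter: 0 -> 1 */
    } else {                   /* proposed fix: begin for the new child first */
        drained_begin();       /* counter: 1 -> 2 */
        drained_end();         /* counter: 2 -> 1, never reaches zero */
    }
    return dropped_to_zero;
}

int main(void)
{
    printf("old-first order resumes early: %s\n", swap_children(1) ? "yes" : "no");
    printf("new-first order resumes early: %s\n", swap_children(0) ? "yes" : "no");
    assert(swap_children(0) == 0);
    return 0;
}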
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
ping ping On 14.12.2018 14:54, Denis Plotnikov wrote: > > > On 13.12.2018 15:20, Kevin Wolf wrote: >> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben: >>> On 12.12.2018 15:24, Kevin Wolf wrote: >>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>>>>> Why involve the AioContext at all? This could all be kept at the >>>>>> BlockBackend level without extending the layering violation that >>>>>> aio_disable_external() is. >>>>>> >>>>>> BlockBackends get notified when their root node is drained, so >>>>>> hooking >>>>>> things up there should be as easy, if not even easier than in >>>>>> AioContext. >>>>> >>>>> Just want to make sure that I understood correctly what you meant by >>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls >>>>> child's role callback blk_root_drained_end by calling >>>>> bdrv_parent_drained_end? >>>> >>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically, >>>> their adjustments to blk->quiesce_counter that are already there, >>>> and in >>>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end() >>>> we can resume the queued requests. >>> Sounds it should be so, but it doesn't work that way and that's why: >>> when doing mirror we may resume postponed coroutines too early when the >>> underlying bs is protected from writing at and thus we encounter the >>> assert on a write request execution at bdrv_co_write_req_prepare when >>> resuming the postponed coroutines. >>> >>> The thing is that the bs is protected for writing before execution of >>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls >>> bdrv_replace_child_noperm which, in turn, calls child->role->drained_end >>> where one of the callbacks is blk_root_drained_end which check >>> if(--blk->quiesce_counter == 0) and runs the postponed requests >>> (coroutines) if the coundition is true. >> >> Hm, so something is messed up with the drain sections in the mirror >> driver. We have: >> >> bdrv_drained_begin(target_bs); >> bdrv_replace_node(to_replace, target_bs, &local_err); >> bdrv_drained_end(target_bs); >> >> Obviously, the intention was to keep the BlockBackend drained during >> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0 >> inside bdrv_replace_node() when target_bs is drained? >> >> Looking at bdrv_replace_child_noperm(), it seems that the function has >> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter >> for the parent reaches 0 for a moment because we call .drained_end for >> the old child first and .drained_begin for the new one later. >> >> So it seems the fix would be to reverse the order and first call >> .drained_begin for the new child and then .drained_end for the old >> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too. > Yes, it's true, but it's not enough... > In mirror_exit_common() we actively manipulate with block driver states. > When we replaced a node in the snippet you showed we can't allow the > postponed coroutines to run because the block tree isn't ready to > receive the requests yet. > To be ready, we need to insert a proper block driver state to the block > backend which is done here > > blk_remove_bs(bjob->blk); > blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); > blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << << > > bs_opaque->job = NULL; > > bdrv_drained_end(src); > > If the tree isn't ready and we resume the coroutines, we'll end up with > the request landed in a wrong block driver state. 
> > So, we explicitly should stop all activities on all the driver states > and its parents and allow the activities when everything is ready to go. > > Why explicitly, because the block driver states may belong to different > block backends at the moment of the manipulation beginning. > > So, it seems we need to disable all their contexts until the > manipulation ends. > > Please, correct me if I'm wrong. > >> >>> In seems that if the external requests disabled on the context we can't >>> rely on anything or should check where the underlying bs and its >>> underlying nodes are ready to receive requests which sounds quite >>> complicated. >>> Please correct me if still don't understand something in that routine. >> >> I think the reason why reyling on aio_disable_external() works is simply >> because src is also drained, which keeps external events in the >> AioContext disabled despite the bug in draining the target node. >> >> The bug would become apparent even with aio_disable_external() if we >> didn't drain src, or even if we just supported src and target being in >> different AioContexts. > > Why don't we disable all those contexts involved until the end of the > block device tree reconstruction? > > Thanks! > > Denis >> >> Kevin >> > -- Best, Denis
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
ping ping ping ping On 09.01.2019 11:18, Denis Plotnikov wrote: > ping ping!!! > > On 18.12.2018 11:53, Denis Plotnikov wrote: >> ping ping >> >> On 14.12.2018 14:54, Denis Plotnikov wrote: >>> >>> >>> On 13.12.2018 15:20, Kevin Wolf wrote: >>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben: >>>>> On 12.12.2018 15:24, Kevin Wolf wrote: >>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>>>>>>> Why involve the AioContext at all? This could all be kept at the >>>>>>>> BlockBackend level without extending the layering violation that >>>>>>>> aio_disable_external() is. >>>>>>>> >>>>>>>> BlockBackends get notified when their root node is drained, so >>>>>>>> hooking >>>>>>>> things up there should be as easy, if not even easier than in >>>>>>>> AioContext. >>>>>>> >>>>>>> Just want to make sure that I understood correctly what you meant by >>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls >>>>>>> child's role callback blk_root_drained_end by calling >>>>>>> bdrv_parent_drained_end? >>>>>> >>>>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically, >>>>>> their adjustments to blk->quiesce_counter that are already there, >>>>>> and in >>>>>> the 'if (--blk->quiesce_counter == 0)' block of >>>>>> blk_root_drained_end() >>>>>> we can resume the queued requests. >>>>> Sounds it should be so, but it doesn't work that way and that's why: >>>>> when doing mirror we may resume postponed coroutines too early when >>>>> the >>>>> underlying bs is protected from writing at and thus we encounter the >>>>> assert on a write request execution at bdrv_co_write_req_prepare when >>>>> resuming the postponed coroutines. >>>>> >>>>> The thing is that the bs is protected for writing before execution of >>>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls >>>>> bdrv_replace_child_noperm which, in turn, calls >>>>> child->role->drained_end >>>>> where one of the callbacks is blk_root_drained_end which check >>>>> if(--blk->quiesce_counter == 0) and runs the postponed requests >>>>> (coroutines) if the coundition is true. >>>> >>>> Hm, so something is messed up with the drain sections in the mirror >>>> driver. We have: >>>> >>>> bdrv_drained_begin(target_bs); >>>> bdrv_replace_node(to_replace, target_bs, &local_err); >>>> bdrv_drained_end(target_bs); >>>> >>>> Obviously, the intention was to keep the BlockBackend drained during >>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0 >>>> inside bdrv_replace_node() when target_bs is drained? >>>> >>>> Looking at bdrv_replace_child_noperm(), it seems that the function has >>>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter >>>> for the parent reaches 0 for a moment because we call .drained_end for >>>> the old child first and .drained_begin for the new one later. >>>> >>>> So it seems the fix would be to reverse the order and first call >>>> .drained_begin for the new child and then .drained_end for the old >>>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, >>>> too. >>> Yes, it's true, but it's not enough... >>> In mirror_exit_common() we actively manipulate with block driver states. >>> When we replaced a node in the snippet you showed we can't allow the >>> postponed coroutines to run because the block tree isn't ready to >>> receive the requests yet. 
>>> To be ready, we need to insert a proper block driver state to the >>> block backend which is done here >>> >>> blk_remove_bs(bjob->blk); >>> blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); >>> blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << << >>> >>> bs_opaque->job = NULL; >>> >>> bdr
[Qemu-devel] PING: [PATCH] blk: postpone request execution on a context protected with "drained section"
Kevin, could you please take a look at my last comments? Thanks! Denis On 15.01.2019 10:22, Denis Plotnikov wrote: > ping ping ping ping > > On 09.01.2019 11:18, Denis Plotnikov wrote: >> ping ping!!! >> >> On 18.12.2018 11:53, Denis Plotnikov wrote: >>> ping ping >>> >>> On 14.12.2018 14:54, Denis Plotnikov wrote: >>>> >>>> >>>> On 13.12.2018 15:20, Kevin Wolf wrote: >>>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben: >>>>>> On 12.12.2018 15:24, Kevin Wolf wrote: >>>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>>>>>>>> Why involve the AioContext at all? This could all be kept at the >>>>>>>>> BlockBackend level without extending the layering violation that >>>>>>>>> aio_disable_external() is. >>>>>>>>> >>>>>>>>> BlockBackends get notified when their root node is drained, so >>>>>>>>> hooking >>>>>>>>> things up there should be as easy, if not even easier than in >>>>>>>>> AioContext. >>>>>>>> >>>>>>>> Just want to make sure that I understood correctly what you >>>>>>>> meant by >>>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end >>>>>>>> calls >>>>>>>> child's role callback blk_root_drained_end by calling >>>>>>>> bdrv_parent_drained_end? >>>>>>> >>>>>>> Yes, blk_root_drained_begin/end calls are all you need. >>>>>>> Specifically, >>>>>>> their adjustments to blk->quiesce_counter that are already there, >>>>>>> and in >>>>>>> the 'if (--blk->quiesce_counter == 0)' block of >>>>>>> blk_root_drained_end() >>>>>>> we can resume the queued requests. >>>>>> Sounds it should be so, but it doesn't work that way and that's why: >>>>>> when doing mirror we may resume postponed coroutines too early >>>>>> when the >>>>>> underlying bs is protected from writing at and thus we encounter the >>>>>> assert on a write request execution at bdrv_co_write_req_prepare when >>>>>> resuming the postponed coroutines. >>>>>> >>>>>> The thing is that the bs is protected for writing before execution of >>>>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls >>>>>> bdrv_replace_child_noperm which, in turn, calls >>>>>> child->role->drained_end >>>>>> where one of the callbacks is blk_root_drained_end which check >>>>>> if(--blk->quiesce_counter == 0) and runs the postponed requests >>>>>> (coroutines) if the coundition is true. >>>>> >>>>> Hm, so something is messed up with the drain sections in the mirror >>>>> driver. We have: >>>>> >>>>> bdrv_drained_begin(target_bs); >>>>> bdrv_replace_node(to_replace, target_bs, &local_err); >>>>> bdrv_drained_end(target_bs); >>>>> >>>>> Obviously, the intention was to keep the BlockBackend drained during >>>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0 >>>>> inside bdrv_replace_node() when target_bs is drained? >>>>> >>>>> Looking at bdrv_replace_child_noperm(), it seems that the function has >>>>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter >>>>> for the parent reaches 0 for a moment because we call .drained_end for >>>>> the old child first and .drained_begin for the new one later. >>>>> >>>>> So it seems the fix would be to reverse the order and first call >>>>> .drained_begin for the new child and then .drained_end for the old >>>>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, >>>>> too. >>>> Yes, it's true, but it's not enough... >>>> In mirror_exit_common() we actively manipulate with block driver >>>> states. 
>>>> When we replaced a node in the snippet you showed we can't allow the >>>> postponed coroutines to run because the block tree isn
Re: [Qemu-devel] PING: [PATCH] blk: postpone request execution on a context protected with "drained section"
On 17.01.2019 17:23, Kevin Wolf wrote: > Am 17.01.2019 um 13:57 hat Denis Plotnikov geschrieben: >> Kevin, >> >> could you please take a look at my last comments? > > I read it, and what it told me is essentially that I need to work on it > myself to fully understand the problem and possible acceptable solutions > because you can't seem to find one yourself. I will, but I can't > guarantee when I can find the time for it. > > Kevin ok. Thanks! Denis > >> On 15.01.2019 10:22, Denis Plotnikov wrote: >>> ping ping ping ping >>> >>> On 09.01.2019 11:18, Denis Plotnikov wrote: >>>> ping ping!!! >>>> >>>> On 18.12.2018 11:53, Denis Plotnikov wrote: >>>>> ping ping >>>>> >>>>> On 14.12.2018 14:54, Denis Plotnikov wrote: >>>>>> >>>>>> >>>>>> On 13.12.2018 15:20, Kevin Wolf wrote: >>>>>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben: >>>>>>>> On 12.12.2018 15:24, Kevin Wolf wrote: >>>>>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>>>>>>>>>> Why involve the AioContext at all? This could all be kept at the >>>>>>>>>>> BlockBackend level without extending the layering violation that >>>>>>>>>>> aio_disable_external() is. >>>>>>>>>>> >>>>>>>>>>> BlockBackends get notified when their root node is drained, so >>>>>>>>>>> hooking >>>>>>>>>>> things up there should be as easy, if not even easier than in >>>>>>>>>>> AioContext. >>>>>>>>>> >>>>>>>>>> Just want to make sure that I understood correctly what you >>>>>>>>>> meant by >>>>>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end >>>>>>>>>> calls >>>>>>>>>> child's role callback blk_root_drained_end by calling >>>>>>>>>> bdrv_parent_drained_end? >>>>>>>>> >>>>>>>>> Yes, blk_root_drained_begin/end calls are all you need. >>>>>>>>> Specifically, >>>>>>>>> their adjustments to blk->quiesce_counter that are already there, >>>>>>>>> and in >>>>>>>>> the 'if (--blk->quiesce_counter == 0)' block of >>>>>>>>> blk_root_drained_end() >>>>>>>>> we can resume the queued requests. >>>>>>>> Sounds it should be so, but it doesn't work that way and that's why: >>>>>>>> when doing mirror we may resume postponed coroutines too early >>>>>>>> when the >>>>>>>> underlying bs is protected from writing at and thus we encounter the >>>>>>>> assert on a write request execution at bdrv_co_write_req_prepare when >>>>>>>> resuming the postponed coroutines. >>>>>>>> >>>>>>>> The thing is that the bs is protected for writing before execution of >>>>>>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls >>>>>>>> bdrv_replace_child_noperm which, in turn, calls >>>>>>>> child->role->drained_end >>>>>>>> where one of the callbacks is blk_root_drained_end which check >>>>>>>> if(--blk->quiesce_counter == 0) and runs the postponed requests >>>>>>>> (coroutines) if the coundition is true. >>>>>>> >>>>>>> Hm, so something is messed up with the drain sections in the mirror >>>>>>> driver. We have: >>>>>>> >>>>>>> bdrv_drained_begin(target_bs); >>>>>>> bdrv_replace_node(to_replace, target_bs, &local_err); >>>>>>> bdrv_drained_end(target_bs); >>>>>>> >>>>>>> Obviously, the intention was to keep the BlockBackend drained during >>>>>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0 >>>>>>> inside bdrv_replace_node() when target_bs is drained? >>>>>>> >>>>>>> Looking at bdrv_replace
Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"
ping ping!!! On 18.12.2018 11:53, Denis Plotnikov wrote: > ping ping > > On 14.12.2018 14:54, Denis Plotnikov wrote: >> >> >> On 13.12.2018 15:20, Kevin Wolf wrote: >>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben: >>>> On 12.12.2018 15:24, Kevin Wolf wrote: >>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben: >>>>>>> Why involve the AioContext at all? This could all be kept at the >>>>>>> BlockBackend level without extending the layering violation that >>>>>>> aio_disable_external() is. >>>>>>> >>>>>>> BlockBackends get notified when their root node is drained, so >>>>>>> hooking >>>>>>> things up there should be as easy, if not even easier than in >>>>>>> AioContext. >>>>>> >>>>>> Just want to make sure that I understood correctly what you meant by >>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls >>>>>> child's role callback blk_root_drained_end by calling >>>>>> bdrv_parent_drained_end? >>>>> >>>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically, >>>>> their adjustments to blk->quiesce_counter that are already there, >>>>> and in >>>>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end() >>>>> we can resume the queued requests. >>>> Sounds it should be so, but it doesn't work that way and that's why: >>>> when doing mirror we may resume postponed coroutines too early when the >>>> underlying bs is protected from writing at and thus we encounter the >>>> assert on a write request execution at bdrv_co_write_req_prepare when >>>> resuming the postponed coroutines. >>>> >>>> The thing is that the bs is protected for writing before execution of >>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls >>>> bdrv_replace_child_noperm which, in turn, calls >>>> child->role->drained_end >>>> where one of the callbacks is blk_root_drained_end which check >>>> if(--blk->quiesce_counter == 0) and runs the postponed requests >>>> (coroutines) if the coundition is true. >>> >>> Hm, so something is messed up with the drain sections in the mirror >>> driver. We have: >>> >>> bdrv_drained_begin(target_bs); >>> bdrv_replace_node(to_replace, target_bs, &local_err); >>> bdrv_drained_end(target_bs); >>> >>> Obviously, the intention was to keep the BlockBackend drained during >>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0 >>> inside bdrv_replace_node() when target_bs is drained? >>> >>> Looking at bdrv_replace_child_noperm(), it seems that the function has >>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter >>> for the parent reaches 0 for a moment because we call .drained_end for >>> the old child first and .drained_begin for the new one later. >>> >>> So it seems the fix would be to reverse the order and first call >>> .drained_begin for the new child and then .drained_end for the old >>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too. >> Yes, it's true, but it's not enough... >> In mirror_exit_common() we actively manipulate with block driver states. >> When we replaced a node in the snippet you showed we can't allow the >> postponed coroutines to run because the block tree isn't ready to >> receive the requests yet. 
>> To be ready, we need to insert a proper block driver state to the >> block backend which is done here >> >> blk_remove_bs(bjob->blk); >> blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort); >> blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << << >> >> bs_opaque->job = NULL; >> >> bdrv_drained_end(src); >> >> If the tree isn't ready and we resume the coroutines, we'll end up >> with the request landed in a wrong block driver state. >> >> So, we explicitly should stop all activities on all the driver states >> and its parents and allow the activities when everything is ready to go. >> >> Why explicitly, because the block driver states may belong to >> diffe
Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions
PING! PING! On 14.08.2018 10:08, Denis Plotnikov wrote: On 13.08.2018 19:30, Kevin Wolf wrote: Am 13.08.2018 um 10:32 hat Denis Plotnikov geschrieben: Ping ping! On 16.07.2018 21:59, John Snow wrote: On 07/16/2018 11:01 AM, Denis Plotnikov wrote: Ping! I never saw a reply to Stefan's question on July 2nd, did you reply off-list? --js Yes, I did. I talked to Stefan why the patch set appeared. The rest of us still don't know the answer. I had the same question. Kevin Yes, that's my fault. I should have post it earlier. I reviewed the problem once again and come up with the following explanation. Indeed, if the global lock has been taken by the main thread the vCPU threads won't be able to execute mmio ide. But, if the main thread will release the lock then nothing will prevent vCPU threads form execution what they want, e.g writing to the block device. In case of running the mirroring it is possible. Let's take a look at the following snippet of mirror_run. This is a part the mirroring completion part. bdrv_drained_begin(bs); cnt = bdrv_get_dirty_count(s->dirty_bitmap); >>>>>> if (cnt > 0 || mirror_flush(s) < 0) { bdrv_drained_end(bs); continue; } (X) >>>> assert(QLIST_EMPTY(&bs->tracked_requests)); mirror_flush here can yield the current coroutine so nothing more can be executed. We could end up with the situation when the main loop have to revolve to poll for another timer/bh to process. While revolving it releases the global lock. If the global lock is waited for by a vCPU (any other) thread, the waiting thread will get the lock and make what it intends. This is something that I can observe: mirror_flush yields coroutine, the main thread revolves and locks because a vCPU was waiting for the lock. Now the vCPU thread owns the lock and the main thread waits for the lock releasing. The vCPU thread does cmd_write_dma and releases the lock. Then, the main thread gets the lock and continues to run eventually proceeding with the coroutine yeiled. If the vCPU requests aren't completed by the moment we will assert at (X). If the vCPU requests are completed we won't even notice that we had some writes while in the drained section. Denis On 29.06.2018 15:40, Denis Plotnikov wrote: There are cases when a request to a block driver state shouldn't have appeared producing dangerous race conditions. This misbehaviour is usually happens with storage devices emulated without eventfd for guest to host notifications like IDE. The issue arises when the context is in the "drained" section and doesn't expect the request to come, but request comes from the device not using iothread and which context is processed by the main loop. The main loop apart of the iothread event loop isn't blocked by the "drained" section. The request coming and processing while in "drained" section can spoil the block driver state consistency. This behavior can be observed in the following KVM-based case: 1. Setup a VM with an IDE disk. 2. Inside a VM start a disk writing load for the IDE device e.g: dd if= of= bs=X count=Y oflag=direct 3. On the host create a mirroring block job for the IDE device e.g: drive_mirror 4. On the host finish the block job e.g: block_job_complete Having done the 4th action, you could get an assert: assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run. On my setup, the assert is 1/3 reproducible. The patch series introduces the mechanism to postpone the requests until the BDS leaves "drained" section for the devices not using iothreads. 
Also, it modifies the asynchronous block backend infrastructure to use that mechanism to resolve the assertion failure for IDE devices. Denis Plotnikov (2): async: add infrastructure for postponed actions block: postpone the coroutine executing if the BDS's is drained block/block-backend.c | 58 ++- include/block/aio.h | 63 +++ util/async.c | 33 +++ 3 files changed, 142 insertions(+), 12 deletions(-) -- Best, Denis -- Best, Denis
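The interleaving Denis describes above (mirror_flush() yielding, the main loop dropping the global lock while polling, a vCPU thread submitting a DMA write) can be mimicked with plain pthreads: a "main loop" thread briefly releases a big lock, a "vCPU" thread sneaks a write in, and the invariant checked after re-acquiring the lock no longer holds. This is only a loose analogy, not QEMU code; the names and the 100 ms sleep are made up for the demo (compile with -lpthread).

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Toy model: a "BQL" plus a counter of writes issued by the "vCPU" thread. */
static pthread_mutex_t bql = PTHREAD_MUTEX_INITIALIZER;
static int writes_during_drain;

static void *vcpu_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&bql);        /* gets the lock while main "yields" */
    writes_during_drain++;           /* e.g. an IDE DMA write submitted */
    pthread_mutex_unlock(&bql);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_mutex_lock(&bql);        /* main loop holds the lock, "drained" */
    pthread_create(&t, NULL, vcpu_thread, NULL);

    /*
     * mirror_flush() yields here; the main loop goes back to polling and
     * temporarily releases the lock, letting the vCPU thread run.
     */
    pthread_mutex_unlock(&bql);
    usleep(100 * 1000);              /* give the vCPU thread a chance to run */
    pthread_mutex_lock(&bql);        /* main loop re-acquires and continues */

    /* Back in the "drained section": the invariant no longer holds. */
    printf("writes submitted while drained: %d\n", writes_during_drain);

    pthread_mutex_unlock(&bql);
    pthread_join(t, NULL);
    return 0;
}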
Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions
On 27.08.2018 19:05, John Snow wrote: On 08/27/2018 03:05 AM, Denis Plotnikov wrote: PING! PING! Sorry, Kevin and Stefan are both on PTO right now, I think. I can't promise I have the time to look soon, but you at least deserve an answer for the radio silence the last week. --js Thanks for the response! I'll be waiting for some comments! Denis On 14.08.2018 10:08, Denis Plotnikov wrote: On 13.08.2018 19:30, Kevin Wolf wrote: Am 13.08.2018 um 10:32 hat Denis Plotnikov geschrieben: Ping ping! On 16.07.2018 21:59, John Snow wrote: On 07/16/2018 11:01 AM, Denis Plotnikov wrote: Ping! I never saw a reply to Stefan's question on July 2nd, did you reply off-list? --js Yes, I did. I talked to Stefan why the patch set appeared. The rest of us still don't know the answer. I had the same question. Kevin Yes, that's my fault. I should have post it earlier. I reviewed the problem once again and come up with the following explanation. Indeed, if the global lock has been taken by the main thread the vCPU threads won't be able to execute mmio ide. But, if the main thread will release the lock then nothing will prevent vCPU threads form execution what they want, e.g writing to the block device. In case of running the mirroring it is possible. Let's take a look at the following snippet of mirror_run. This is a part the mirroring completion part. bdrv_drained_begin(bs); cnt = bdrv_get_dirty_count(s->dirty_bitmap); >>>>>> if (cnt > 0 || mirror_flush(s) < 0) { bdrv_drained_end(bs); continue; } (X) >>>> assert(QLIST_EMPTY(&bs->tracked_requests)); mirror_flush here can yield the current coroutine so nothing more can be executed. We could end up with the situation when the main loop have to revolve to poll for another timer/bh to process. While revolving it releases the global lock. If the global lock is waited for by a vCPU (any other) thread, the waiting thread will get the lock and make what it intends. This is something that I can observe: mirror_flush yields coroutine, the main thread revolves and locks because a vCPU was waiting for the lock. Now the vCPU thread owns the lock and the main thread waits for the lock releasing. The vCPU thread does cmd_write_dma and releases the lock. Then, the main thread gets the lock and continues to run eventually proceeding with the coroutine yeiled. If the vCPU requests aren't completed by the moment we will assert at (X). If the vCPU requests are completed we won't even notice that we had some writes while in the drained section. Denis On 29.06.2018 15:40, Denis Plotnikov wrote: There are cases when a request to a block driver state shouldn't have appeared producing dangerous race conditions. This misbehaviour is usually happens with storage devices emulated without eventfd for guest to host notifications like IDE. The issue arises when the context is in the "drained" section and doesn't expect the request to come, but request comes from the device not using iothread and which context is processed by the main loop. The main loop apart of the iothread event loop isn't blocked by the "drained" section. The request coming and processing while in "drained" section can spoil the block driver state consistency. This behavior can be observed in the following KVM-based case: 1. Setup a VM with an IDE disk. 2. Inside a VM start a disk writing load for the IDE device e.g: dd if= of= bs=X count=Y oflag=direct 3. On the host create a mirroring block job for the IDE device e.g: drive_mirror 4. 
On the host finish the block job e.g: block_job_complete Having done the 4th action, you could get an assert: assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run. On my setup, the assert is 1/3 reproducible. The patch series introduces the mechanism to postpone the requests until the BDS leaves "drained" section for the devices not using iothreads. Also, it modifies the asynchronous block backend infrastructure to use that mechanism to release the assert bug for IDE devices. Denis Plotnikov (2): async: add infrastructure for postponed actions block: postpone the coroutine executing if the BDS's is drained block/block-backend.c | 58 ++- include/block/aio.h | 63 +++ util/async.c | 33 +++ 3 files changed, 142 insertions(+), 12 deletions(-) -- Best, Denis -- Best, Denis
Re: [Qemu-devel] [PATCH v1 00/17] Background snapshots
Hi Peter, I moved the code to the repository https://github.com/denis-plotnikov/qemu/tree/background-snapshot-kvm. The current version includes fixes with respect to your comments on version 1. I moved the KVM-related patches to the end of the branch (formerly the patch series). Since the KVM patches and the other parts that would need modification (vhost and others) become unnecessary with the upcoming userfaultfd, I would ask you to review the general framework, which is able to work with TCG. Thanks in advance! Denis On 20.07.2018 12:27, Peter Xu wrote: On Wed, Jul 18, 2018 at 06:41:43PM +0300, Denis Plotnikov wrote: The workflow to make a snapshot is the following: 1. Pause the vm 2. Make a snapshot of block devices using the scheme of your choice 3. Turn on background-snapshot migration capability 4. Start the migration using the destination (migration stream) of your choice. The migration will resume the vm execution by itself when it has the devices' states saved and is ready to start ram writing to the migration stream. 5. Listen to the migration finish event The bakground snapshot works with support of KVM patch: "x86: mmu: report failed memory access to the userspace" (not applied to the mainstream, it's in the kvm mailing list) Hello, Denis, Do you mind to push your tree to an online repository in case to make review easier? Thanks, -- Best, Denis
Re: [Qemu-devel] [PATCH v1 00/17] Background snapshots
Hi Peter, Thanks for the reply. Ok, I understand about tcg. So my only option is to wait for userfaultfd-wp. Do you know if anyone is currently working on this? And if so, then is there any estimations when the userfaultfd is ready? Denis On 05.09.2018 06:32, Peter Xu wrote: On Tue, Sep 04, 2018 at 04:00:31PM +0300, Denis Plotnikov wrote: Hi Peter Hi, Denis, I moved the code to the repository https://github.com/denis-plotnikov/qemu/tree/background-snapshot-kvm. the current version includes fixes with respect to your comments for version 1. I moved KVM related patches to the end of the branch (formerly patch series). Since, the KVM patches and the other parts to modify (vhost an others) are needless in favor of upcoming userfaltfd, I would ask you to review the general framework which is able to work with tcg. Thanks in advance! Thank you for pushing the tree. I might have made a mistake before that I thought this work is at least working for TCG, but I think I was wrong. The problem is (I'm trying to repeat Dave's question that you seems haven't yet answered): even for TCG there could be use cases where the process might access guest memory from the kernel space (e.g., vhost, or any system calls that with a guest memory buffer passed in). I'm afraid mprotect() and the whole signal-based mechanism cannot be able to address these page faults, then we'll encounter adhoc errors and we'll need to fix all these places up. Userfaultfd-wp should not have this problem. I think the general idea of the work is good, but I'm not sure whether we can merge the work if we don't settle these issues. Regards, -- Best, Denis
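For context, the userspace-only write-protection mechanism being discussed (and its limitation) looks roughly like this: mprotect() the RAM read-only and catch SIGSEGV on guest writes. That works for accesses made from userspace but not for writes the kernel performs on the process's behalf, where the syscall simply fails with EFAULT instead of faulting into the handler, which is the gap userfaultfd write-protection is meant to close. A minimal Linux sketch, with illustrative names and no error handling or thread safety:

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static uint8_t *ram;
static long page_size;
static volatile sig_atomic_t dirty_hits;

/* On a write fault: count the page as dirty and make it writable again. */
static void segv_handler(int sig, siginfo_t *si, void *ctx)
{
    (void)sig; (void)ctx;
    uintptr_t page = (uintptr_t)si->si_addr & ~(uintptr_t)(page_size - 1);
    dirty_hits++;                 /* real code would copy the page out here */
    mprotect((void *)page, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
    page_size = sysconf(_SC_PAGESIZE);
    size_t ram_size = 4 * page_size;
    ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = segv_handler;
    sa.sa_flags = SA_SIGINFO;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGSEGV, &sa, NULL);

    /* "Snapshot starts": write-protect the whole RAM region. */
    mprotect(ram, ram_size, PROT_READ);

    /* Userspace writes are intercepted one page at a time... */
    ram[0] = 1;
    ram[2 * page_size] = 2;

    /*
     * ...but a write done by the kernel on our behalf (e.g. read(2) into ram)
     * does not deliver a catchable SIGSEGV; the syscall fails with EFAULT.
     */
    printf("write faults intercepted: %d\n", (int)dirty_hits);
    munmap(ram, ram_size);
    return 0;
}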
Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions
PING PING! On 28.08.2018 13:23, Denis Plotnikov wrote: On 27.08.2018 19:05, John Snow wrote: On 08/27/2018 03:05 AM, Denis Plotnikov wrote: PING! PING! Sorry, Kevin and Stefan are both on PTO right now, I think. I can't promise I have the time to look soon, but you at least deserve an answer for the radio silence the last week. --js Thanks for the response! I'll be waiting for some comments! Denis On 14.08.2018 10:08, Denis Plotnikov wrote: On 13.08.2018 19:30, Kevin Wolf wrote: Am 13.08.2018 um 10:32 hat Denis Plotnikov geschrieben: Ping ping! On 16.07.2018 21:59, John Snow wrote: On 07/16/2018 11:01 AM, Denis Plotnikov wrote: Ping! I never saw a reply to Stefan's question on July 2nd, did you reply off-list? --js Yes, I did. I talked to Stefan about why the patch set appeared. The rest of us still don't know the answer. I had the same question. Kevin Yes, that's my fault. I should have posted it earlier. I reviewed the problem once again and came up with the following explanation. Indeed, if the global lock has been taken by the main thread, the vCPU threads won't be able to execute IDE MMIO. But if the main thread releases the lock, then nothing will prevent the vCPU threads from executing what they want, e.g. writing to the block device. This is possible while the mirroring is running. Let's take a look at the following snippet of mirror_run. This is part of the mirroring completion path. bdrv_drained_begin(bs); cnt = bdrv_get_dirty_count(s->dirty_bitmap); >>>>>> if (cnt > 0 || mirror_flush(s) < 0) { bdrv_drained_end(bs); continue; } (X) >>>> assert(QLIST_EMPTY(&bs->tracked_requests)); mirror_flush here can yield the current coroutine so nothing more can be executed. We could end up in a situation where the main loop has to revolve to poll for another timer/BH to process. While revolving it releases the global lock. If the global lock is waited for by a vCPU (or any other) thread, the waiting thread will get the lock and do what it intends. This is something that I can observe: mirror_flush yields the coroutine, the main thread revolves and blocks because a vCPU was waiting for the lock. Now the vCPU thread owns the lock and the main thread waits for the lock to be released. The vCPU thread does cmd_write_dma and releases the lock. Then, the main thread gets the lock and continues to run, eventually resuming the yielded coroutine. If the vCPU requests aren't completed by that moment, we will assert at (X). If the vCPU requests are completed, we won't even notice that we had some writes while in the drained section. Denis On 29.06.2018 15:40, Denis Plotnikov wrote: There are cases when a request to a block driver state shouldn't have appeared, producing dangerous race conditions. This misbehaviour usually happens with storage devices emulated without eventfd for guest-to-host notifications, like IDE. The issue arises when the context is in the "drained" section and doesn't expect a request to come, but a request comes from a device not using an iothread, whose context is processed by the main loop. The main loop, unlike the iothread event loop, isn't blocked by the "drained" section. A request coming in and being processed while in the "drained" section can spoil the block driver state consistency. This behavior can be observed in the following KVM-based case: 1. Setup a VM with an IDE disk. 2. Inside the VM start a disk writing load for the IDE device e.g: dd if= of= bs=X count=Y oflag=direct 3. On the host create a mirroring block job for the IDE device e.g: drive_mirror 4.
On the host finish the block job e.g: block_job_complete Having done the 4th action, you could get an assert: assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run. On my setup, the assert is 1/3 reproducible. The patch series introduces the mechanism to postpone the requests until the BDS leaves the "drained" section for the devices not using iothreads. Also, it modifies the asynchronous block backend infrastructure to use that mechanism to resolve the assert bug for IDE devices. Denis Plotnikov (2): async: add infrastructure for postponed actions block: postpone the coroutine executing if the BDS's is drained block/block-backend.c | 58 ++- include/block/aio.h | 63 +++ util/async.c | 33 +++ 3 files changed, 142 insertions(+), 12 deletions(-) -- Best, Denis
Re: [Qemu-devel] [PATCH v0 2/2] block: postpone the coroutine executing if the BDS's is drained
On 10.09.2018 15:41, Kevin Wolf wrote: Am 29.06.2018 um 14:40 hat Denis Plotnikov geschrieben: Fixes the problem of ide request appearing when the BDS is in the "drained section". Without the patch the request can come and be processed by the main event loop, as the ide requests are processed by the main event loop and the main event loop doesn't stop when its context is in the "drained section". The request execution is postponed until the end of "drained section". The patch doesn't modify ide specific code, as well as any other device code. Instead, it modifies the infrastructure of asynchronous Block Backend requests, in favor of postponing the requests arisen when in "drained section" to remove the possibility of request appearing for all the infrastructure clients. This approach doesn't make vCPU processing the request wait untill the end of request processing. Signed-off-by: Denis Plotnikov I generally agree with the idea that requests should be queued during a drained section. However, I think there are a few fundamental problems with the implementation in this series: 1) aio_disable_external() is already a layering violation and we'd like to get rid of it (by replacing it with a BlockDevOps callback from BlockBackend to the devices), so adding more functionality there feels like a step in the wrong direction. 2) Only blk_aio_* are fixed, while we also have synchronous public interfaces (blk_pread/pwrite) as well as coroutine-based ones (blk_co_*). They need to be postponed as well. Good point! Thanks! blk_co_preadv/pwritev() are the common point in the call chain for all of these variants, so this is where the fix needs to live. Using the common point might be a good idea, but in case aio requests we also have to mane completions which out of the scope of blk_co_p(read|write)v: static void blk_aio_write_entry(void *opaque) { ... rwco->ret = blk_co_pwritev(...); blk_aio_complete(acb); ... } This makes the difference. I would suggest adding waiting until "drained_end" is done on the synchronous read/write at blk_prw > 3) Within a drained section, you want requests from other users to be blocked, but not your own ones (essentially you want exclusive access). We don't have blk_drained_begin/end() yet, so this is not something to implement right now, but let's keep this requirement in mind and choose a design that allows this. There is an idea to distinguish the requests that should be done without respect to "drained section" by using a flag in BdrvRequestFlags. The requests with a flag set should be processed anyway. I believe the whole logic should be kept local to BlockBackend, and blk_root_drained_begin/end() should be the functions that start queuing requests or let queued requests resume. As we are already in coroutine context in blk_co_preadv/pwritev(), after checking that blk->quiesce_counter > 0, we can enter the coroutine object into a list and yield. blk_root_drained_end() calls aio_co_wake() for each of the queued coroutines. This should be all that we need to manage. In my understanding by using brdv_drained_begin/end we want to protect a certain BlockDriverState from external access but not the whole BlockBackend which may involve using a number of BlockDriverState-s. I though it because we could possibly change a backing file for some BlockDriverState. And for the time of changing we need to prevent external access to it but keep the io going. By using blk_root_drained_begin/end() we put to "drained section" all the BlockDriverState-s linked to that root. 
Does it have to be so? Denis Kevin -- Best, Denis
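For readers following the thread, here is a rough sketch of the approach Kevin outlines above. blk->quiesce_counter already exists in BlockBackend; the queued_requests CoQueue and the exact placement are assumptions made for illustration, not the actual patches.

/* Sketch only: queued_requests is an assumed CoQueue member. */

static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                      unsigned int bytes, QEMUIOVector *qiov,
                                      BdrvRequestFlags flags)
{
    /* Park this coroutine until the drained section ends. */
    while (blk->quiesce_counter) {
        qemu_co_queue_wait(&blk->queued_requests, NULL);
    }

    /* ... the existing request path continues here ... */
}

static void blk_root_drained_end(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;

    assert(blk->quiesce_counter);
    if (--blk->quiesce_counter == 0) {
        /* Wake every coroutine that queued itself while drained. */
        qemu_co_queue_restart_all(&blk->queued_requests);
    }
}

The interesting wrinkle, picked up later in the thread, is how such queued requests interact with the blk->in_flight accounting.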
Re: [Qemu-devel] [PATCH v0 2/2] block: postpone the coroutine executing if the BDS's is drained
On 12.09.2018 16:15, Kevin Wolf wrote: Am 12.09.2018 um 14:03 hat Denis Plotnikov geschrieben: On 10.09.2018 15:41, Kevin Wolf wrote: Am 29.06.2018 um 14:40 hat Denis Plotnikov geschrieben: Fixes the problem of ide request appearing when the BDS is in the "drained section". Without the patch the request can come and be processed by the main event loop, as the ide requests are processed by the main event loop and the main event loop doesn't stop when its context is in the "drained section". The request execution is postponed until the end of "drained section". The patch doesn't modify ide specific code, as well as any other device code. Instead, it modifies the infrastructure of asynchronous Block Backend requests, in favor of postponing the requests arisen when in "drained section" to remove the possibility of request appearing for all the infrastructure clients. This approach doesn't make vCPU processing the request wait untill the end of request processing. Signed-off-by: Denis Plotnikov I generally agree with the idea that requests should be queued during a drained section. However, I think there are a few fundamental problems with the implementation in this series: 1) aio_disable_external() is already a layering violation and we'd like to get rid of it (by replacing it with a BlockDevOps callback from BlockBackend to the devices), so adding more functionality there feels like a step in the wrong direction. 2) Only blk_aio_* are fixed, while we also have synchronous public interfaces (blk_pread/pwrite) as well as coroutine-based ones (blk_co_*). They need to be postponed as well. Good point! Thanks! blk_co_preadv/pwritev() are the common point in the call chain for all of these variants, so this is where the fix needs to live. Using the common point might be a good idea, but in case aio requests we also have to mane completions which out of the scope of blk_co_p(read|write)v: I don't understand what you mean here (possibly because I fail to understand the word "mane") and what completions have to do with mane = make queueing of requests. Just to clarify, we are talking about the following situation, right? bdrv_drain_all_begin() has returned, so all the old requests have already been drained and their completion callback has already been called. For any new requests that come in, we need to queue them until the drained section ends. In other words, they won't reach the point where they could possibly complete before .drained_end. Yes To make it clear: I'm trying to defend the idea that putting the postponing routine in blk_co_preadv/pwritev is not the best choice and that's why: If I understood your idea correctly, if we do the postponing inside blk_co_p(write|read)v we don't know whether we do synchronous or asynchronous request. We need to know this because if we postpone an async request then, later, on the postponed requests processing, we must to make "a completion" for that request stating that it's finally "done". Furthermore, for sync requests if we postpone them, we must block the clients issued them until the requests postponed have been processed on drained section leaving. This would ask an additional notification mechanism. Instead, we can just check whether we could proceed in blk_p(write|read) and if not (we're in drained) to wait there. We avoid the things above if we postponing in blk_aio_prwv and waiting in blk_prw without postponing. What do you think? static void blk_aio_write_entry(void *opaque) { ... rwco->ret = blk_co_pwritev(...); blk_aio_complete(acb); ... 
} This makes the difference. I would suggest adding waiting until "drained_end" is done on the synchronous read/write at blk_prw It is possible, but then the management becomes a bit more complicated because you have more than just a list of Coroutines that you need to wake up. One thing that could be problematic in blk_co_preadv/pwritev is that blk->in_flight would count even requests that are queued if we're not careful. Then a nested drain would deadlock because the BlockBackend would never say that it's quiesced. > 3) Within a drained section, you want requests from other users to be blocked, but not your own ones (essentially you want exclusive access). We don't have blk_drained_begin/end() yet, so this is not something to implement right now, but let's keep this requirement in mind and choose a design that allows this. There is an idea to distinguish the requests that should be done without respect to "drained section" by using a flag in BdrvRequestFlags. The requests with a flag set should be processed anyway. I don't think that would work because the accesses can be nested qui
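One conceivable way around the in_flight problem Kevin raises, sketched with assumed names (blk_wait_while_drained(), queued_requests, and in-flight helpers along the lines of blk_inc_in_flight()/blk_dec_in_flight()); this is not code from the series, only an illustration of the idea of not counting a queued request as in-flight.

/* Sketch only: the helper name and the in-flight helpers are assumed. */
static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
{
    if (blk->quiesce_counter) {
        /* A queued request must not be counted as in-flight, otherwise a
         * nested drain would wait for it and deadlock. */
        blk_dec_in_flight(blk);
        qemu_co_queue_wait(&blk->queued_requests, NULL);
        blk_inc_in_flight(blk);
    }
}

Called at the top of blk_co_preadv()/blk_co_pwritev(), something like this would let the BlockBackend still report itself as quiesced even with requests parked in the queue.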
Re: [Qemu-devel] [PATCH v0 0/7] Background snapshots
On 13.07.2018 08:20, Peter Xu wrote: On Fri, Jun 29, 2018 at 11:03:13AM +0300, Denis Plotnikov wrote: The patch set adds the ability to make external snapshots while VM is running. The workflow to make a snapshot is the following: 1. Pause the vm 2. Make a snapshot of block devices using the scheme of your choice 3. Turn on background-snapshot migration capability 4. Start the migration using the destination (migration stream) of your choice. The migration will resume the vm execution by itself when it has the devices' states saved and is ready to start ram writing to the migration stream. 5. Listen to the migration finish event The feature relies on KVM unapplied ability to report the faulting address. Please find the KVM patch snippet to make the patchset work below: +++ b/arch/x86/kvm/vmx.c @@ -,X +,XX @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) vcpu->arch.exit_qualification = exit_qualification; - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + r = kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +if (r == -EFAULT) { + unsigned long hva = kvm_vcpu_gfn_to_hva(vcpu, gpa >> PAGE_SHIFT); + + vcpu->run->exit_reason = KVM_EXIT_FAIL_MEM_ACCESS; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + vcpu->run->fail_mem_access.hva = hva | (gpa & (PAGE_SIZE-1)); + r = 0; + + } + return r; The patch to KVM can be sent if the patch set approved Hi, Denis, If the work will definitely require KVM to cooperate, AFAIU the thing we normally do is that we first propose the kernel counterpart on kvm list, then it'll be easier to review the QEMU counterpart (or, propose both kvm/qemu changes at the same time, always the QEMU changes can be RFC, as a reference to prove the kvm change is valid and useful). Not sure whether you should do this as well for this live snapshot work. Since we might have two backends in the future, my major question for that counterpart series would be whether we need to support both in the future (mprotect, and userfaultfd), and the differences between the two methods from kernel's point of view. I would vaguely guess that we can at least firstly have mprotect work then userfaultfd then we can automatically choose the backend when both are provided, but I guess that discussion might still better happen on the kvm list. Also I would also guess that in that work you'd better consider no-ept case as well for Intel, even for AMD. But not sure we can at least start a RFC with the simplest scenario and prove its validity. Regards, Hi, Peter, I think this is a good idea to go through the KVM path firstly. When the discussion come to some conclusion further steps may become more clear. I'll send the patch there shortly to start the discussion. Thanks! Best, Denis
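For reference, the basic mechanism an mprotect-based backend would rely on looks roughly like the standalone demo below. This is not the series' code; a real implementation would copy the faulting page for the snapshot inside the handler before unprotecting it, and mprotect() is not formally async-signal-safe even though this pattern is widely used in practice.

#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static long page_size;

static void segv_handler(int sig, siginfo_t *info, void *ctx)
{
    char *page = (char *)((uintptr_t)info->si_addr & ~((uintptr_t)page_size - 1));

    (void)sig;
    (void)ctx;
    /* A real backend would copy this page for the snapshot here,
     * before making it writable again and letting the guest retry. */
    mprotect(page, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
    struct sigaction sa = { .sa_sigaction = segv_handler,
                            .sa_flags = SA_SIGINFO };
    char *region;

    page_size = sysconf(_SC_PAGESIZE);
    sigaction(SIGSEGV, &sa, NULL);

    region = mmap(NULL, 4 * page_size, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    /* Write-protect the "guest RAM" when the snapshot starts. */
    mprotect(region, 4 * page_size, PROT_READ);

    /* The fault is caught, the page is unprotected, the write retries. */
    region[page_size] = 42;
    printf("write went through: %d\n", region[page_size]);
    return 0;
}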
Re: [Qemu-devel] [PATCH v0 0/2] Postponed actions
Ping! On 29.06.2018 15:40, Denis Plotnikov wrote: There are cases when a request to a block driver state shouldn't have appeared, producing dangerous race conditions. This misbehaviour usually happens with storage devices emulated without eventfd for guest-to-host notifications, like IDE. The issue arises when the context is in the "drained" section and doesn't expect a request to come, but a request comes from a device not using an iothread, whose context is processed by the main loop. The main loop, unlike the iothread event loop, isn't blocked by the "drained" section. A request coming in and being processed while in the "drained" section can spoil the block driver state consistency. This behavior can be observed in the following KVM-based case: 1. Setup a VM with an IDE disk. 2. Inside the VM start a disk writing load for the IDE device e.g: dd if= of= bs=X count=Y oflag=direct 3. On the host create a mirroring block job for the IDE device e.g: drive_mirror 4. On the host finish the block job e.g: block_job_complete Having done the 4th action, you could get an assert: assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run. On my setup, the assert is 1/3 reproducible. The patch series introduces the mechanism to postpone the requests until the BDS leaves the "drained" section for the devices not using iothreads. Also, it modifies the asynchronous block backend infrastructure to use that mechanism to resolve the assert bug for IDE devices. Denis Plotnikov (2): async: add infrastructure for postponed actions block: postpone the coroutine executing if the BDS's is drained block/block-backend.c | 58 ++- include/block/aio.h | 63 +++ util/async.c | 33 +++ 3 files changed, 142 insertions(+), 12 deletions(-) -- Best, Denis
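Purely as a reading aid, the "postponed actions" idea named in the series title could look roughly like the sketch below. The names (PostponedAction, aio_postpone(), aio_run_postponed(), the ctx->postponed_actions list) are invented here and do not match the series' actual API.

/* Illustrative only: none of the names below come from the patches. */
typedef struct PostponedAction {
    QSIMPLEQ_ENTRY(PostponedAction) next;
    QEMUBHFunc *cb;
    void *opaque;
} PostponedAction;

/* While the context is in a drained section, remember the action
 * instead of running it. */
void aio_postpone(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
{
    PostponedAction *act = g_new0(PostponedAction, 1);

    act->cb = cb;
    act->opaque = opaque;
    QSIMPLEQ_INSERT_TAIL(&ctx->postponed_actions, act, next);
}

/* When the drained section ends, run everything that was postponed. */
void aio_run_postponed(AioContext *ctx)
{
    while (!QSIMPLEQ_EMPTY(&ctx->postponed_actions)) {
        PostponedAction *act = QSIMPLEQ_FIRST(&ctx->postponed_actions);

        QSIMPLEQ_REMOVE_HEAD(&ctx->postponed_actions, next);
        act->cb(act->opaque);
        g_free(act);
    }
}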
Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions
On 16.07.2018 21:59, John Snow wrote: On 07/16/2018 11:01 AM, Denis Plotnikov wrote: Ping! I never saw a reply to Stefan's question on July 2nd, did you reply off-list? For some reason, there are no replies from Stefan on my server. I found it on the web and will respond to it shortly. Thanks! Denis --js On 29.06.2018 15:40, Denis Plotnikov wrote: There are cases when a request to a block driver state shouldn't have appeared, producing dangerous race conditions. This misbehaviour usually happens with storage devices emulated without eventfd for guest-to-host notifications, like IDE. The issue arises when the context is in the "drained" section and doesn't expect a request to come, but a request comes from a device not using an iothread, whose context is processed by the main loop. The main loop, unlike the iothread event loop, isn't blocked by the "drained" section. A request coming in and being processed while in the "drained" section can spoil the block driver state consistency. This behavior can be observed in the following KVM-based case: 1. Setup a VM with an IDE disk. 2. Inside the VM start a disk writing load for the IDE device e.g: dd if= of= bs=X count=Y oflag=direct 3. On the host create a mirroring block job for the IDE device e.g: drive_mirror 4. On the host finish the block job e.g: block_job_complete Having done the 4th action, you could get an assert: assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run. On my setup, the assert is 1/3 reproducible. The patch series introduces the mechanism to postpone the requests until the BDS leaves the "drained" section for the devices not using iothreads. Also, it modifies the asynchronous block backend infrastructure to use that mechanism to resolve the assert bug for IDE devices. Denis Plotnikov (2): async: add infrastructure for postponed actions block: postpone the coroutine executing if the BDS's is drained block/block-backend.c | 58 ++- include/block/aio.h | 63 +++ util/async.c | 33 +++ 3 files changed, 142 insertions(+), 12 deletions(-) -- Best, Denis
[Qemu-devel] [PATCH v1 09/17] background snapshot: extend RAM request for holding a page copy pointer
This pointer is going to be used to transfer memory. Once the memory page is copied, the content the snapshot is interested in is saved for writing and we can make the page writable again. Signed-off-by: Denis Plotnikov --- migration/ram.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index ce3dead932..dc7dfe0726 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -188,6 +188,7 @@ struct RAMSrcPageRequest { RAMBlock *rb; hwaddr offset; hwaddr len; +void *page_copy; QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; }; @@ -265,6 +266,8 @@ struct PageSearchStatus { unsigned long page; /* Set once we wrap around */ bool complete_round; +/* Pointer to the cached page */ +void *page_copy; }; typedef struct PageSearchStatus PageSearchStatus; -- 2.17.0
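A hedged sketch of how such a page_copy pointer might be used once a write-protected page is touched; ram_copy_page() and its exact behaviour are assumptions for illustration, not part of the series.

/* Sketch only: illustrates one possible use of page_copy. */
static int ram_copy_page(RAMBlock *rb, unsigned long page_nr,
                         void **page_copy)
{
    void *host = rb->host + ((ram_addr_t)page_nr << TARGET_PAGE_BITS);

    /* Keep the content the snapshot needs... */
    *page_copy = g_malloc(TARGET_PAGE_SIZE);
    memcpy(*page_copy, host, TARGET_PAGE_SIZE);

    /* ...then let the guest write to the original page again. */
    return mprotect(host, TARGET_PAGE_SIZE, PROT_READ | PROT_WRITE);
}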
[Qemu-devel] [PATCH v1 15/17] kvm: add vCPU failed memory access processing
This is done with support of the KVM patch returning the faulting address. Signed-off-by: Denis Plotnikov --- target/i386/kvm.c | 17 + 1 file changed, 17 insertions(+) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 3ac5302bc5..55b8860d1a 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -45,6 +45,8 @@ #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "migration/blocker.h" +#include "migration/savevm.h" +#include "migration/ram.h" #include "exec/memattrs.h" #include "trace.h" @@ -3130,6 +3132,18 @@ static bool host_supports_vmx(void) return ecx & CPUID_EXT_VMX; } +static int kvm_handle_fail_mem_access(CPUState *cpu) +{ +struct kvm_run *run = cpu->kvm_run; +int ret = ram_process_page_fault((void *)run->fail_mem_access.hva); + +if (ret >= 0) { +cpu_resume(cpu); +} + +return ret; +} + #define VMX_INVALID_GUEST_STATE 0x8021 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) @@ -3188,6 +3202,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ioapic_eoi_broadcast(run->eoi.vector); ret = 0; break; +case KVM_EXIT_FAIL_MEM_ACCESS: +ret = kvm_handle_fail_mem_access(cs); +break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; -- 2.17.0
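ram_process_page_fault() is defined in another patch of the series and is not shown here; purely as a reading aid, one plausible shape of it could be the following (the helper at the end is an assumption, not real code).

/* Reading aid only: this sketch is an assumption about the series. */
int ram_process_page_fault(void *address)
{
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(address, false, &offset);

    if (!rb) {
        return -1;
    }

    /* Copy the page for the snapshot, then drop the write protection so
     * the vCPU can be resumed and retry the faulting access. */
    return ram_save_page_copy_and_unprotect(rb, offset); /* assumed helper */
}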
[Qemu-devel] [PATCH v1 05/17] ram: extend the data structures for background snapshotting
Signed-off-by: Denis Plotnikov --- include/exec/ram_addr.h | 7 +++ 1 file changed, 7 insertions(+) diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 6cbc02aa0f..5b403d537d 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -36,6 +36,8 @@ struct RAMBlock { char idstr[256]; /* RCU-enabled, writes protected by the ramlist lock */ QLIST_ENTRY(RAMBlock) next; +/* blocks used for background snapshot */ +QLIST_ENTRY(RAMBlock) bgs_next; QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; int fd; size_t page_size; @@ -49,6 +51,11 @@ struct RAMBlock { unsigned long *unsentmap; /* bitmap of already received pages in postcopy */ unsigned long *receivedmap; +/* The following 2 are for background snapshot */ +/* Pages currently being copied */ +unsigned long *touched_map; +/* Pages has been copied already */ +unsigned long *copied_map; }; static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset) -- 2.17.0
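A hedged sketch of how the two bitmaps could cooperate; the function below is illustrative and not part of the series. copied_map records pages already saved, while touched_map lets exactly one caller claim a page for copying, and the return value says whether the caller should do the copy itself.

/* Sketch only: one possible use of touched_map and copied_map. */
static bool ram_page_needs_copy(RAMBlock *rb, unsigned long page_nr)
{
    if (test_bit(page_nr, rb->copied_map)) {
        /* Already saved for the snapshot. */
        return false;
    }

    /* The first caller to set the bit does the copy; others should wait
     * until the corresponding bit appears in copied_map. */
    return !test_and_set_bit(page_nr, rb->touched_map);
}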