[Qemu-devel] [PATCH] monitor: increase amount of data for monitor to read

2019-06-10 Thread Denis Plotnikov
Right now the QMP and HMP monitors read 1 byte at a time from the socket, which
is very inefficient. With 100+ VMs on the host this easily results in
a lot of unnecessary system calls and host CPU usage.

This patch changes the amount of data read at once to 4096 bytes, which matches
the buffer size at the channel level. Fortunately, the monitor protocol is
synchronous right now, so we should not face any side effects in practice.

Signed-off-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
 include/monitor/monitor.h | 2 +-
 monitor.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index c1b40a9cac..afa1ed34a4 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -14,7 +14,7 @@ extern __thread Monitor *cur_mon;
 #define MONITOR_USE_CONTROL   0x04
 #define MONITOR_USE_PRETTY0x08
 
-#define QMP_REQ_QUEUE_LEN_MAX 8
+#define QMP_REQ_QUEUE_LEN_MAX 4096
 
 bool monitor_cur_is_qmp(void);
 
diff --git a/monitor.c b/monitor.c
index 4807bbe811..a08e020b61 100644
--- a/monitor.c
+++ b/monitor.c
@@ -4097,7 +4097,7 @@ static int monitor_can_read(void *opaque)
 {
 Monitor *mon = opaque;
 
-return !atomic_mb_read(&mon->suspend_cnt);
+return !atomic_mb_read(&mon->suspend_cnt) ? 4096 : 0;
 }
 
 /*
-- 
2.17.0
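
The claim about unnecessary system calls is easy to check outside QEMU: draining
the same amount of data from a pipe with 1-byte reads takes as many read(2) calls
as there are bytes, while 4096-byte reads take a single call. A minimal standalone
sketch in plain POSIX C (an illustration only, not part of the patch):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* count how many read(2) calls it takes to drain fd at a given chunk size */
    static long drain(int fd, size_t chunk)
    {
        char buf[4096];
        long calls = 0;

        while (read(fd, buf, chunk) > 0) {
            calls++;
        }
        return calls;
    }

    static void run(size_t chunk)
    {
        char payload[4096];
        int fds[2];

        memset(payload, 'x', sizeof(payload));
        if (pipe(fds) < 0 || write(fds[1], payload, sizeof(payload)) < 0) {
            perror("pipe/write");
            return;
        }
        close(fds[1]);                       /* writer done -> reader sees EOF */
        printf("chunk=%4zu read() calls: %ld\n", chunk, drain(fds[0], chunk));
        close(fds[0]);
    }

    int main(void)
    {
        run(1);      /* one syscall per byte, as the monitor did before */
        run(4096);   /* one syscall for the whole buffer */
        return 0;
    }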




Re: [Qemu-devel] [PATCH v5] qemu-io: add pattern file for write command

2019-06-10 Thread Denis Plotnikov


On 31.05.2019 18:13, Eric Blake wrote:
> On 5/31/19 2:46 AM, Denis Plotnikov wrote:
>> The patch allows to provide a pattern file for write
>> command. There was no similar ability before.
>>
>> Signed-off-by: Denis Plotnikov 
>> ---
>> v5:
>>* file name initiated with null to make compilers happy
>>
> 
>> +static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len,
>> + char *file_name)
>> +{
>> +char *buf, *buf_pos;
>> +FILE *f = fopen(file_name, "r");
>> +int l;
>> +
>> +if (!f) {
>> +printf("'%s': %s\n", file_name, strerror(errno));
>> +return NULL;
>> +}
>> +
>> +if (qemuio_misalign) {
>> +len += MISALIGN_OFFSET;
>> +}
>> +buf = blk_blockalign(blk, len);
>> +memset(buf, 0, len);
>> +
>> +buf_pos = buf;
>> +
>> +while (len > 0) {
>> +l = fread(buf_pos, sizeof(char), len, f);
>> +
>> +if (feof(f)) {
>> +rewind(f);
>> +}
> 
> Why are we reading the file more than once? Once we've read it once,
> it's more efficient to switch to a loop that memcpy()s the prefix into
> the rest of the buffer, rather than to perform repeated I/O.
> 
Yes, it is. Will change it.

Denis


[Qemu-devel] [PATCH v6] qemu-io: add pattern file for write command

2019-06-10 Thread Denis Plotnikov
The patch allows providing a pattern file for the write
command. There was no similar ability before.

Signed-off-by: Denis Plotnikov 
---
v6:
  * the pattern file is read once to reduce io

v5:
  * file name initialized with NULL to make compilers happy

v4:
  * missing signed-off clause added

v3:
  * missing file closing added
  * exclusive flags processing changed
  * buffer void* converted to char* to fix pointer arithmetic
  * file reading error processing added
---
 qemu-io-cmds.c | 88 ++
 1 file changed, 82 insertions(+), 6 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 09750a23ce..e27203f747 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -343,6 +343,69 @@ static void *qemu_io_alloc(BlockBackend *blk, size_t len, int pattern)
 return buf;
 }
 
+static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len,
+ char *file_name)
+{
+char *buf, *buf_origin;
+FILE *f = fopen(file_name, "r");
+int l;
+
+if (!f) {
+printf("'%s': %s\n", file_name, strerror(errno));
+return NULL;
+}
+
+if (qemuio_misalign) {
+len += MISALIGN_OFFSET;
+}
+buf_origin = blk_blockalign(blk, len);
+memset(buf_origin, 0, len);
+
+buf = buf_origin;
+
+l = fread(buf, sizeof(char), len, f);
+
+if (ferror(f)) {
+printf("'%s': %s\n", file_name, strerror(errno));
+goto error;
+}
+
+if (l == 0) {
+printf("'%s' is empty\n", file_name);
+goto error;
+}
+
+if (l < len) {
+char *file_buf = g_malloc(sizeof(char) * l);
+memcpy(file_buf, buf, l);
+len -= l;
+buf += l;
+
+while (len > 0) {
+size_t len_to_copy = len > l ? l : len;
+
+memcpy(buf, file_buf, len_to_copy);
+
+len -= len_to_copy;
+buf += len_to_copy;
+}
+qemu_vfree(file_buf);
+}
+
+if (qemuio_misalign) {
+buf_origin += MISALIGN_OFFSET;
+}
+
+goto out;
+
+error:
+qemu_vfree(buf);
+buf_origin = NULL;
+out:
+fclose(f);
+return buf_origin;
+}
+
 static void qemu_io_free(void *p)
 {
 if (qemuio_misalign) {
@@ -965,7 +1028,7 @@ static const cmdinfo_t write_cmd = {
 .perm   = BLK_PERM_WRITE,
 .argmin = 2,
 .argmax = -1,
-.args   = "[-bcCfnquz] [-P pattern] off len",
+.args   = "[-bcCfnquz] [-P pattern | -s source_file] off len",
 .oneline= "writes a number of bytes at a specified offset",
 .help   = write_help,
 };
@@ -974,7 +1037,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 {
 struct timeval t1, t2;
 bool Cflag = false, qflag = false, bflag = false;
-bool Pflag = false, zflag = false, cflag = false;
+bool Pflag = false, zflag = false, cflag = false, sflag = false;
 int flags = 0;
 int c, cnt, ret;
 char *buf = NULL;
@@ -983,8 +1046,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 /* Some compilers get confused and warn if this is not initialized.  */
 int64_t total = 0;
 int pattern = 0xcd;
+char *file_name = NULL;
 
-while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) {
+while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) {
 switch (c) {
 case 'b':
 bflag = true;
@@ -1020,6 +1084,10 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 case 'z':
 zflag = true;
 break;
+case 's':
+sflag = true;
+file_name = g_strdup(optarg);
+break;
 default:
 qemuio_command_usage(&write_cmd);
 return -EINVAL;
@@ -1051,8 +1119,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 return -EINVAL;
 }
 
-if (zflag && Pflag) {
-printf("-z and -P cannot be specified at the same time\n");
+if ((int)zflag + (int)Pflag + (int)sflag > 1) {
+printf("Only one of -z, -P, and -s"
+   "can be specified at the same time\n");
 return -EINVAL;
 }
 
@@ -1088,7 +1157,14 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 }
 
 if (!zflag) {
-buf = qemu_io_alloc(blk, count, pattern);
+if (sflag) {
+buf = qemu_io_alloc_from_file(blk, count, file_name);
+if (!buf) {
+return -EINVAL;
+}
+} else {
+buf = qemu_io_alloc(blk, count, pattern);
+}
 }
 
 gettimeofday(&t1, NULL);
-- 
2.17.0
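
The core of the v6 change is the tiling step: read the pattern file once, then
replicate that prefix across the rest of the buffer with memcpy() instead of
re-reading the file, as Eric suggested. A minimal standalone sketch of that
pattern, using plain calloc() instead of blk_blockalign() and ignoring the
misalignment offset (both simplifications are assumptions for illustration only):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* fill a buffer of length len by tiling the prefix read once from a file */
    static char *tile_from_file(const char *file_name, size_t len)
    {
        FILE *f = fopen(file_name, "r");
        char *buf;
        size_t l, filled, chunk;

        if (!f) {
            perror(file_name);
            return NULL;
        }

        buf = calloc(1, len);
        if (!buf) {
            fclose(f);
            return NULL;
        }

        l = fread(buf, 1, len, f);           /* single read of the pattern file */
        fclose(f);
        if (l == 0) {                        /* empty file or read error */
            free(buf);
            return NULL;
        }

        /* replicate the l-byte prefix until the buffer is full */
        for (filled = l; filled < len; filled += chunk) {
            chunk = len - filled > l ? l : len - filled;
            memcpy(buf + filled, buf, chunk);
        }
        return buf;
    }

With the patch applied, the new path is exercised by an invocation along the
lines of: qemu-io -c "write -s pattern.bin 0 1M" test.img (the file and image
names here are just placeholders).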




Re: [Qemu-devel] [PATCH 0/3] migration: add zstd compression

2019-03-11 Thread Denis Plotnikov
ping ping ping!

On 04.03.2019 18:10, Denis Plotnikov wrote:
> ping!
> 
> On 26.02.2019 16:15, Denis Plotnikov wrote:
>> The zstd data compression algorithm shows better data compression performance.
>> It might be useful to employ the algorithm in VM migration to reduce CPU
>> usage.
>> A user will be able to choose between those algorithms, therefore a
>> compress-type
>> migration parameter is added.
>>
>> Here are some results of performance comparison zstd vs gzip:
>>
>> host: i7-4790 8xCPU @ 3.60GHz, 16G RAM
>> migration to the same host
>> VM: 2xVCPU, 8G RAM total
>> 5G RAM used, memory populated with PostgreSQL data
>> produced by pgbench performance benchmark
>>
>>
>> Threads: 1 compress – 1 decompress
>>
>> zstd provides a slightly lower compression ratio with almost the same
>> CPU usage but copes with RAM compression roughly 2 times faster
>>
>> compression type          zlib        |  zstd
>> ----------------------------------------------------
>> compression level         1     5     |  1     5
>> compression ratio         6.92  7.05  |  6.69  6.89
>> cpu idle, %               82    83    |  86    80
>> time, sec                 49    71    |  26    31
>> time diff to zlib, sec                |  -25   -41
>>
>>
>> Threads: 8 compress – 2 decompress
>>
>> zstd provides the same migration time with less cpu consumption
>>
>> compression type         none  |  gzip(zlib)          |  zstd
>> ---------------------------------------------------------------------------
>> compression level        -     |  1      5      9     |  1      5      15
>> compression ratio        -     |  6.94   6.99   7.14  |  6.64   6.89   6.93
>> time, sec                154   |  22     23     27    |  23     23     25
>> cpu idle, %              99    |  45     30     12    |  70     52     23
>> cpu idle diff to zlib          |                      |  -25%   -22%   -11%
>>
>>
>> Denis Plotnikov (3):
>> migration: rework compression code for adding more data compressors
>> hmp: add compress-type parameter to migration parameters
>> migration: add zstd compression
>>
>>configure |  26 
>>hmp.c |   8 ++
>>migration/migration.c |  45 ++-
>>migration/migration.h |   1 +
>>migration/qemu-file.c |  39 ++
>>migration/qemu-file.h |  18 ++-
>>migration/ram.c   | 291 ++
>>qapi/migration.json   |  26 +++-
>>8 files changed, 369 insertions(+), 85 deletions(-)
>>
> 

-- 
Best,
Denis
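
The series itself is not shown here, but the one-shot libzstd API it would build
on is small. A minimal sketch of a compress/decompress round trip for one
RAM-page-sized buffer (assuming only the standard <zstd.h> one-shot functions and
linking with -lzstd; this is an illustration, not code from the series):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <zstd.h>

    int main(void)
    {
        char src[4096];                      /* stand-in for a guest RAM page */
        size_t bound, csize, dsize;
        char *dst, *back;

        memset(src, 'a', sizeof(src));

        bound = ZSTD_compressBound(sizeof(src));
        dst = malloc(bound);
        back = malloc(sizeof(src));
        if (!dst || !back) {
            return 1;
        }

        /* level 1 roughly corresponds to the "compression level 1" column above */
        csize = ZSTD_compress(dst, bound, src, sizeof(src), 1);
        if (ZSTD_isError(csize)) {
            fprintf(stderr, "compress: %s\n", ZSTD_getErrorName(csize));
            return 1;
        }

        dsize = ZSTD_decompress(back, sizeof(src), dst, csize);
        if (ZSTD_isError(dsize) || dsize != sizeof(src) ||
            memcmp(src, back, sizeof(src)) != 0) {
            fprintf(stderr, "round trip failed\n");
            return 1;
        }

        printf("4096 -> %zu bytes\n", csize);
        free(dst);
        free(back);
        return 0;
    }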


[PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839

2019-10-18 Thread Denis Plotnikov
From: "Denis V. Lunev" 

Linux guests submit IO requests no longer than PAGE_SIZE * the max_seg
value reported by the SCSI controller. Thus a typical sequential read of
1 MB results in the following IO pattern from the guest:
  8,16   115754 2.766095122  2071  D   R 2095104 + 1008 [dd]
  8,16   115755 2.766108785  2071  D   R 2096112 + 1008 [dd]
  8,16   115756 2.766113486  2071  D   R 2097120 + 32 [dd]
  8,16   115757 2.767668961 0  C   R 2095104 + 1008 [0]
  8,16   115758 2.768534315 0  C   R 2096112 + 1008 [0]
  8,16   115759 2.768539782 0  C   R 2097120 + 32 [0]
The IO was generated by
  dd if=/dev/sda of=/dev/null bs=1024 iflag=direct

This effectively means that on rotational disks we will observe 3 IOPS
for every 2 MB processed, which definitely hurts both
guest and host IO performance.

The cure is relatively simple - we should report the lengthy scatter-gather
ability of the SCSI controller. Fortunately the situation here is very
good: the VirtIO transport layer can accommodate 1024 items in one request
while we are using only 128, and this has been the case since almost the
very beginning. 2 items are dedicated to request metadata, thus we
should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg.

The following pattern is observed after the patch:
  8,16   1 9921 2.662721340  2063  D   R 2095104 + 1024 [dd]
  8,16   1 9922 2.662737585  2063  D   R 2096128 + 1024 [dd]
  8,16   1 9923 2.665188167 0  C   R 2095104 + 1024 [0]
  8,16   1 9924 2.665198777 0  C   R 2096128 + 1024 [0]
which is much better.

The dark side of this patch is that we are tweaking a guest-visible
parameter, though this should be relatively safe as the above transport
layer support has been present in QEMU/host Linux for a very long time.
The patch adds a configurable property for VirtIO SCSI with a new default
and a hardcoded value for VirtIO block, which does not provide a good
configuration framework.

Unfortunately the commit can not be applied as is. For the real cure we
need the guest to be fixed to accommodate that queue length, which is done
only in the latest 4.14 kernel. Thus we are going to expose the property
and tweak it at the machine type level.

The problem with the old kernels is that they have a
max_segments <= virtqueue_size restriction which causes the guest
to crash in case of violation.
To fix the case described above on the old kernels we can increase
virtqueue_size to 256 and max_segments to 254. The pitfall here is
that seabios only allows virtqueue_size-s up to 128; however, the seabios
patch extending that value to 256 is pending.
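
The 1008-sector chunks in the first trace follow directly from the old
126-segment limit; a small back-of-the-envelope check (assuming 4 KiB pages
and 512-byte sectors):

    /* why a 1 MB read used to split into 1008 + 1008 + 32 sectors */
    #include <stdio.h>

    int main(void)
    {
        unsigned seg_max_old = 128 - 2;      /* what QEMU used to report */
        unsigned bytes_per_seg = 4096;       /* PAGE_SIZE */
        unsigned sector = 512;

        unsigned max_req = seg_max_old * bytes_per_seg / sector;   /* 1008 sectors */
        unsigned total = 1024 * 1024 / sector;                     /* 2048 sectors */

        printf("max request: %u sectors\n", max_req);
        printf("1 MB read:   %u + %u + %u sectors\n",
               max_req, max_req, total - 2 * max_req);             /* 1008+1008+32 */
        return 0;
    }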

CC: "Michael S. Tsirkin" 
CC: Stefan Hajnoczi 
CC: Kevin Wolf 
CC: Max Reitz 
CC: Gerd Hoffmann 
Signed-off-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
 hw/block/virtio-blk.c   | 3 ++-
 hw/scsi/vhost-scsi.c| 2 ++
 hw/scsi/virtio-scsi.c   | 4 +++-
 include/hw/virtio/virtio-blk.h  | 1 +
 include/hw/virtio/virtio-scsi.h | 1 +
 5 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 06e57a4d39..b2eaeeaf67 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -903,7 +903,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 blk_get_geometry(s->blk, &capacity);
 memset(&blkcfg, 0, sizeof(blkcfg));
 virtio_stq_p(vdev, &blkcfg.capacity, capacity);
-virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
+virtio_stl_p(vdev, &blkcfg.seg_max, s->conf.max_segments);
 virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
 virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
 virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
@@ -1240,6 +1240,7 @@ static Property virtio_blk_properties[] = {
conf.max_discard_sectors, BDRV_REQUEST_MAX_SECTORS),
 DEFINE_PROP_UINT32("max-write-zeroes-sectors", VirtIOBlock,
conf.max_write_zeroes_sectors, 
BDRV_REQUEST_MAX_SECTORS),
+DEFINE_PROP_UINT32("max_segments", VirtIOBlock, conf.max_segments, 126),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 61e2e57da9..fa3b377807 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -242,6 +242,8 @@ static Property vhost_scsi_properties[] = {
 DEFINE_PROP_BIT64("t10_pi", VHostSCSICommon, host_features,
  VIRTIO_SCSI_F_T10_PI,
  false),
+DEFINE_PROP_UINT32("max_segments", VirtIOSCSICommon, conf.max_segments,
+   126),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 839f120256..8b070ddeed 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -650,7 +650,

[PATCH] blockdev: modify blockdev-change-medium to change non-removable device

2019-10-18 Thread Denis Plotnikov
The modification is useful to work around exclusive file access restrictions,
e.g. to implement VM migration with a shared disk stored on storage with
an exclusive file opening model: a destination VM is started waiting for
incoming migration with a fake image drive, and later, in the last migration
phase, the fake image file is replaced with the real one.

Signed-off-by: Denis Plotnikov 
---
 blockdev.c   | 69 +++-
 hmp.c|  2 ++
 qapi/block-core.json |  7 +++--
 qmp.c|  3 +-
 4 files changed, 57 insertions(+), 24 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index d358169995..23f3465cfc 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2609,6 +2609,8 @@ void qmp_blockdev_change_medium(bool has_device, const char *device,
 bool has_format, const char *format,
 bool has_read_only,
 BlockdevChangeReadOnlyMode read_only,
+bool has_medium_name,
+const char *medium_name,
 Error **errp)
 {
 BlockBackend *blk;
@@ -2667,29 +2669,56 @@ void qmp_blockdev_change_medium(bool has_device, const char *device,
 goto fail;
 }
 
-rc = do_open_tray(has_device ? device : NULL,
-  has_id ? id : NULL,
-  false, &err);
-if (rc && rc != -ENOSYS) {
-error_propagate(errp, err);
-goto fail;
-}
-error_free(err);
-err = NULL;
+if (blk_dev_has_removable_media(blk)) {
+rc = do_open_tray(has_device ? device : NULL,
+  has_id ? id : NULL,
+  false, &err);
+if (rc && rc != -ENOSYS) {
+error_propagate(errp, err);
+goto fail;
+}
+error_free(err);
+err = NULL;
 
-blockdev_remove_medium(has_device, device, has_id, id, &err);
-if (err) {
-error_propagate(errp, err);
-goto fail;
-}
+blockdev_remove_medium(has_device, device, has_id, id, &err);
+if (err) {
+error_propagate(errp, err);
+goto fail;
+}
 
-qmp_blockdev_insert_anon_medium(blk, medium_bs, &err);
-if (err) {
-error_propagate(errp, err);
-goto fail;
-}
+qmp_blockdev_insert_anon_medium(blk, medium_bs, &err);
+if (err) {
+error_propagate(errp, err);
+goto fail;
+}
+
+qmp_blockdev_close_tray(has_device, device, has_id, id, errp);
+} else {
+if (!medium_name) {
+error_setg(errp, "A medium name should be given");
+goto fail;
+}
 
-qmp_blockdev_close_tray(has_device, device, has_id, id, errp);
+if (runstate_is_running()) {
+error_setg(errp, "Can't set a medium for non-removable device "
+"in a running VM");
+goto fail;
+}
+
+if (strlen(blk_name(blk))) {
+error_setg(errp, "The device already has a medium");
+goto fail;
+}
+
+if (blk_insert_bs(blk, medium_bs, &err) < 0) {
+error_propagate(errp, err);
+goto fail;
+}
+
+if (!monitor_add_blk(blk, medium_name, &err)) {
+error_propagate(errp, err);
+}
+}
 
 fail:
 /* If the medium has been inserted, the device has its own reference, so
diff --git a/hmp.c b/hmp.c
index 8eec768088..fc7bac5b4b 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1948,6 +1948,7 @@ void hmp_change(Monitor *mon, const QDict *qdict)
 const char *target = qdict_get_str(qdict, "target");
 const char *arg = qdict_get_try_str(qdict, "arg");
 const char *read_only = qdict_get_try_str(qdict, "read-only-mode");
+const char *target_name = qdict_get_try_str(qdict, "target-name");
 BlockdevChangeReadOnlyMode read_only_mode = 0;
 Error *err = NULL;
 
@@ -1982,6 +1983,7 @@ void hmp_change(Monitor *mon, const QDict *qdict)
 
 qmp_blockdev_change_medium(true, device, false, NULL, target,
!!arg, arg, !!read_only, read_only_mode,
+   !!target_name, target_name,
&err);
 }
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 7ccbfff9d0..f493a7c737 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4769,6 +4769,8 @@
 # @read-only-mode:  change the read-only mode of the device; defaults
 #   to 'retain'
 #
+# @medium-name: drive-name when changing the media in non-removable devices
+#   ignored when changing media in removable devices
 # Since: 2.5
 #
 # Examples:
@@ -4807,9 +4809,8 @@
 '*id'

Re: [PATCH] blockdev: modify blockdev-change-medium to change non-removable device

2019-10-20 Thread Denis Plotnikov

On 18.10.2019 18:02, Max Reitz wrote:
> On 18.10.19 14:09, Denis Plotnikov wrote:
>> The modification is useful to workaround exclusive file access restrictions,
>> e.g. to implement VM migration with shared disk stored on a storage with
>> the exclusive file opening model: a destination VM is started waiting for
>> incomming migration with a fake image drive, and later, on the last migration
>> phase, the fake image file is replaced with the real one.
>>
>> Signed-off-by: Denis Plotnikov 
> Isn’t this what we would want to use reopen for?
>
> Max

Could you please explain what "use reopen" means here?

Denis

>


Re: [PATCH] blockdev: modify blockdev-change-medium to change non-removable device

2019-10-22 Thread Denis Plotnikov

On 22.10.2019 14:05, Max Reitz wrote:
> On 21.10.19 08:50, Denis Plotnikov wrote:
>> On 18.10.2019 18:02, Max Reitz wrote:
>>> On 18.10.19 14:09, Denis Plotnikov wrote:
>>>> The modification is useful to workaround exclusive file access 
>>>> restrictions,
>>>> e.g. to implement VM migration with shared disk stored on a storage with
>>>> the exclusive file opening model: a destination VM is started waiting for
>>>> incomming migration with a fake image drive, and later, on the last 
>>>> migration
>>>> phase, the fake image file is replaced with the real one.
>>>>
>>>> Signed-off-by: Denis Plotnikov 
>>> Isn’t this what we would want to use reopen for?
>>>
>>> Max
>> Could you please explain what is "use reopen"?
> I was thinking of using (x-)blockdev-reopen to change the file that is
> used by the format node (e.g. from a null-co node to a real file); or to
> change the filename of the protocol node.
>
> Kevin has pointed out (on IRC) that this will not allow you to change
> the node that is directly attached to the device.  While I don’t know
> whether that’s really necessary in this case, if it were indeed
> necessary, I’d prefer a method to change a guest device’s @drive option
> because that seems more natural to me.
>
> In contrast, the approach taken in this patch seems not quite right to
> me, because it overloads the whole blockdev-change-medium command with a
> completely new and different implementation based on whether there’s a
> removable medium or not.  If the implementation is so different (and the
> interface is, too, because in one path you must give @medium whereas the
> other doesn’t evaluate it at all), it should be a new command.
>
> I don’t know whether we need a new command at all, though.  On the node
> level, we have (x-)blockdev-reopen.  So assuming we need something to
> change the link between the guest device and the block layer, I wonder
> whether there isn’t something similar; specifically, I’d prefer
> something to simply change the device’s @drive option.
>
> Kevin has pointed out (on IRC again) that there is indeed one such
> command, and that’s qom-set.  Unfortunately, this is what happens if you
> try to use it for @drive:
>
> {"error": {"class": "GenericError", "desc": "Attempt to set property
> 'drive' on anonymous device (type 'virtio-blk-device') after it was
> realized"}}
>
> However, Kevin has claimed it would be technically possible to make an
> exception for @drive.  Maybe this is worth investigating?

Is there any guess how complex it might be? If it's quite
complex, maybe it's worth making a separate command?

>
>
> (As for blockdev-change-medium, as I’ve said, I don’t really think this
> fits there.  Furthermore, blockdev-change-medium is kind of a legacy
> command because I think every command but blockdev-add that does a
> bdrv_open() kind of is a legacy command.
Out of curiosity, could you please explain why it was decided to be that way?
> So if anything, it should be a
> new command that then takes a node-name.
> But OTOH, it would be a bit strange to add a separate command for
> something that in theory should be covered by qom-set @drive.)
>
> Max
>


Re: [PATCH] blockdev: modify blockdev-change-medium to change non-removable device

2019-10-22 Thread Denis Plotnikov

On 22.10.2019 16:18, Max Reitz wrote:
> On 22.10.19 14:53, Denis Plotnikov wrote:
>> On 22.10.2019 14:05, Max Reitz wrote:
>>> On 21.10.19 08:50, Denis Plotnikov wrote:
>>>> On 18.10.2019 18:02, Max Reitz wrote:
>>>>> On 18.10.19 14:09, Denis Plotnikov wrote:
>>>>>> The modification is useful to workaround exclusive file access 
>>>>>> restrictions,
>>>>>> e.g. to implement VM migration with shared disk stored on a storage with
>>>>>> the exclusive file opening model: a destination VM is started waiting for
>>>>>> incomming migration with a fake image drive, and later, on the last 
>>>>>> migration
>>>>>> phase, the fake image file is replaced with the real one.
>>>>>>
>>>>>> Signed-off-by: Denis Plotnikov 
>>>>> Isn’t this what we would want to use reopen for?
>>>>>
>>>>> Max
>>>> Could you please explain what is "use reopen"?
>>> I was thinking of using (x-)blockdev-reopen to change the file that is
>>> used by the format node (e.g. from a null-co node to a real file); or to
>>> change the filename of the protocol node.
>>>
>>> Kevin has pointed out (on IRC) that this will not allow you to change
>>> the node that is directly attached to the device.  While I don’t know
>>> whether that’s really necessary in this case, if it were indeed
>>> necessary, I’d prefer a method to change a guest device’s @drive option
>>> because that seems more natural to me.
>>>
>>> In contrast, the approach taken in this patch seems not quite right to
>>> me, because it overloads the whole blockdev-change-medium command with a
>>> completely new and different implementation based on whether there’s a
>>> removable medium or not.  If the implementation is so different (and the
>>> interface is, too, because in one path you must give @medium whereas the
>>> other doesn’t evaluate it at all), it should be a new command.
>>>
>>> I don’t know whether we need a new command at all, though.  On the node
>>> level, we have (x-)blockdev-reopen.  So assuming we need something to
>>> change the link between the guest device and the block layer, I wonder
>>> whether there isn’t something similar; specifically, I’d prefer
>>> something to simply change the device’s @drive option.
>>>
>>> Kevin has pointed out (on IRC again) that there is indeed one such
>>> command, and that’s qom-set.  Unfortunately, this is what happens if you
>>> try to use it for @drive:
>>>
>>> {"error": {"class": "GenericError", "desc": "Attempt to set property
>>> 'drive' on anonymous device (type 'virtio-blk-device') after it was
>>> realized"}}
>>>
>>> However, Kevin has claimed it would be technically possible to make an
>>> exception for @drive.  Maybe this is worth investigating?
>> Is there any guess how complex it might be? In the case if it's quite
>> complex may be it's worth to make the separate command?
> I can translate the chat log for you:
>
>  In theory that’s called qom-set
>  However, I believe it doesn’t support qdev properties
>  Hm, but that could be changed specifically for the drive property
>  qdev keeps confusing me.  Drive isn’t supposed to call
> qdev_prop_set_after_realize(), but the error message’s still there.
> Where is that hidden call...?
>  Ah, set_pointer() does
>  Yes, then it should be possible to make that work rather locally
>
> And that took him about 10 minutes.
>
> So I suppose it would be to check in set_drive() and
> set_drive_iothread() whether the device is already realized, and if so,
> divert it to some other function that does the runtime change?
ok, that might be a good starting point for me. Thanks.
>
> (No idea how the qdev maintainers think about doing that in set_drive()
> and set_drive_iothread(), though)
>
>>> (As for blockdev-change-medium, as I’ve said, I don’t really think this
>>> fits there.  Furthermore, blockdev-change-medium is kind of a legacy
>>> command because I think every command but blockdev-add that does a
>>> bdrv_open() kind of is a legacy command.
>> Out of curiosity, could you please explain why it's decided to be so?
> Because we have blockdev-add, which supports all block device options
> there are and so on.  blockdev-change-medium (which is basically just a
> more rigid “change”) only gets filename, which isn’t as expressive.
>
> We generally want users to add new nodes with blockdev-add and let all
> other commands only take node-names.
>
> (There’s also the fact that historically we’ve used filenames to
> identify BlockDriverStates, but that doesn’t work so well.  Thus I think
> we should get away from using filenames as much as we can so people
> don’t use them for identification again.)
>
> Max

Thanks for the explanation, Max!

Denis

>


Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839

2019-10-23 Thread Denis Plotnikov


On 21.10.2019 16:24, Stefan Hajnoczi wrote:
> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote:
>> From: "Denis V. Lunev" 
>>
>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg
>> field reported by SCSI controler. Thus typical sequential read with
>> 1 MB size results in the following pattern of the IO from the guest:
>>8,16   115754 2.766095122  2071  D   R 2095104 + 1008 [dd]
>>8,16   115755 2.766108785  2071  D   R 2096112 + 1008 [dd]
>>8,16   115756 2.766113486  2071  D   R 2097120 + 32 [dd]
>>8,16   115757 2.767668961 0  C   R 2095104 + 1008 [0]
>>8,16   115758 2.768534315 0  C   R 2096112 + 1008 [0]
>>8,16   115759 2.768539782 0  C   R 2097120 + 32 [0]
>> The IO was generated by
>>dd if=/dev/sda of=/dev/null bs=1024 iflag=direct
>>
>> This effectively means that on rotational disks we will observe 3 IOPS
>> for each 2 MBs processed. This definitely negatively affects both
>> guest and host IO performance.
>>
>> The cure is relatively simple - we should report lengthy scatter-gather
>> ability of the SCSI controller. Fortunately the situation here is very
>> good. VirtIO transport layer can accomodate 1024 items in one request
>> while we are using only 128. This situation is present since almost
>> very beginning. 2 items are dedicated for request metadata thus we
>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg.
>>
>> The following pattern is observed after the patch:
>>8,16   1 9921 2.662721340  2063  D   R 2095104 + 1024 [dd]
>>8,16   1 9922 2.662737585  2063  D   R 2096128 + 1024 [dd]
>>8,16   1 9923 2.665188167 0  C   R 2095104 + 1024 [0]
>>8,16   1 9924 2.665198777 0  C   R 2096128 + 1024 [0]
>> which is much better.
>>
>> The dark side of this patch is that we are tweaking guest visible
>> parameter, though this should be relatively safe as above transport
>> layer support is present in QEMU/host Linux for a very long time.
>> The patch adds configurable property for VirtIO SCSI with a new default
>> and hardcode option for VirtBlock which does not provide good
>> configurable framework.
>>
>> Unfortunately the commit can not be applied as is. For the real cure we
>> need guest to be fixed to accomodate that queue length, which is done
>> only in the latest 4.14 kernel. Thus we are going to expose the property
>> and tweak it on machine type level.
>>
>> The problem with the old kernels is that they have
>> max_segments <= virtqueue_size restriction which cause the guest
>> crashing in the case of violation.
>> To fix the case described above in the old kernels we can increase
>> virtqueue_size to 256 and max_segments to 254. The pitfall here is
>> that seabios allows the virtqueue_size-s < 128, however, the seabios
>> patch extending that value to 256 is pending.
> If I understand correctly you are relying on Indirect Descriptor support
> in the guest driver in order to exceed the Virtqueue Descriptor Table
> size.
>
> Unfortunately the "max_segments <= virtqueue_size restriction" is
> required by the VIRTIO 1.1 specification:
>
>2.6.5.3.1 Driver Requirements: Indirect Descriptors
>
>A driver MUST NOT create a descriptor chain longer than the Queue
>Size of the device.
>
> So this idea seems to be in violation of the specification?
>
> There is a bug in hw/block/virtio-blk.c:virtio_blk_update_config() and
> hw/scsi/virtio-scsi.c:virtio_scsi_get_config():
>
>virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
>
> This number should be the minimum of blk_get_max_iov() and
> virtio_queue_get_num(), minus 2 for the header and footer.

Stefan,

It seems VirtIOSCSI doesn't have a direct link to a blk, unlike
VirtIOBlock->blk, and the link to a blk comes with each SCSI request. I
suspect the idea here is that a single virtio-scsi controller can serve several
blk-s. If my assumption is correct, then we can't call blk_get_max_iov()
at the virtio-scsi configuration stage, and we should not take max_iov into
account but limit max_segments with virtio_queue_get_num() - 2 only.

Is that so, or are there any other details to take into account?

Thanks!

Denis

>
> I looked at the Linux SCSI driver code and it seems each HBA has a
> single max_segments number - it does not vary on a per-device basis.
> This could be a problem if two host block device with different
> max_segments are exposed to the guest through the same virtio-scsi
> controller.  Another bug? :(
>
> Anyway, if you want ~1024 descriptors you should set Queue Size to 1024.
> I don't see a spec-compliant way of doing it otherwise.  Hopefully I
> have overlooked something and there is a nice way to solve this.
>
> Stefan
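
For reference, the clamp Stefan suggests boils down to something like the sketch
below (assuming blk_get_max_iov() and virtio_queue_get_num() return the backend
iovec limit and the virtqueue size respectively; this is only an illustration of
the rule, not actual QEMU code):

    /* seg_max suggestion: min(backend iovec limit, virtqueue size) minus the
     * 2 descriptors used for request header and status */
    static unsigned int compute_seg_max(unsigned int max_iov, unsigned int vq_size)
    {
        unsigned int limit = max_iov < vq_size ? max_iov : vq_size;
        return limit > 2 ? limit - 2 : 1;
    }

    /* e.g. compute_seg_max(1024, 128) == 126, the value currently hardcoded */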



Re: [PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-13 Thread Denis Plotnikov




On 12.02.2020 18:43, Stefan Hajnoczi wrote:

On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote:

The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
  hw/block/virtio-blk.c | 4 ++--
  hw/core/machine.c | 2 ++
  hw/scsi/virtio-scsi.c | 4 ++--
  3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 09f46ed85f..6df3a7a6df 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
uint8_t *config)
  memset(&blkcfg, 0, sizeof(blkcfg));
  virtio_stq_p(vdev, &blkcfg.capacity, capacity);
  virtio_stl_p(vdev, &blkcfg.seg_max,
- s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
+ s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2);

This value must not change on older machine types.

Yes, that's true, but ..

So does this patch
need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine
types get 126 instead of 254?
If we set seg-max-adjust "on" in older machine types, setups using
them with queue_size set to, for example, 1024 will also set
seg_max to 1024 - 2, which isn't the expected behavior: older machine types
didn't change seg_max in that case and stuck with 128 - 2.

So should we, instead, leave the default of 128 - 2 for seg_max?

Denis



  virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
  virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
  virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
@@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = {
  DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
  true),
  DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
-DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
  DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, 
true),
  DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
   IOThread *),
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 2501b540ec..3427d6cf4c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,8 @@
  #include "hw/mem/nvdimm.h"
  
  GlobalProperty hw_compat_4_2[] = {

+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
  { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
  { "virtio-blk-device", "seg-max-adjust", "off"},
  { "virtio-scsi-device", "seg_max_adjust", "off"},
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 3b61563609..b38f50a429 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -660,7 +660,7 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
  
  virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues);

  virtio_stl_p(vdev, &scsiconf->seg_max,
- s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 
2);
+ s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 256 - 
2);
  virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors);
  virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun);
  virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent));
@@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, 
Error **errp)
  static Property virtio_scsi_properties[] = {
  DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 
1),
  DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
- parent_obj.conf.virtqueue_size, 128),
+ parent_obj.conf.virtqueue_size, 256),
  DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
parent_obj.conf.seg_max_adjust, true),
  DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors,
--
2.17.0







Re: [PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-13 Thread Denis Plotnikov




On 13.02.2020 12:08, Stefan Hajnoczi wrote:

On Thu, Feb 13, 2020 at 11:08:35AM +0300, Denis Plotnikov wrote:

On 12.02.2020 18:43, Stefan Hajnoczi wrote:

On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote:

The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
   hw/block/virtio-blk.c | 4 ++--
   hw/core/machine.c | 2 ++
   hw/scsi/virtio-scsi.c | 4 ++--
   3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 09f46ed85f..6df3a7a6df 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
uint8_t *config)
   memset(&blkcfg, 0, sizeof(blkcfg));
   virtio_stq_p(vdev, &blkcfg.capacity, capacity);
   virtio_stl_p(vdev, &blkcfg.seg_max,
- s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
+ s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2);

This value must not change on older machine types.

Yes, that's true, but ..

So does this patch
need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine
types get 126 instead of 254?

If we set seg-max-adjust "on" in older machine types, setups using them
with queue_size set to, for example, 1024 will also set seg_max to 1024
- 2, which isn't the expected behavior: older machine types didn't change seg_max in
that case and stuck with 128 - 2.
So should we, instead, leave the default of 128 - 2 for seg_max?

Argh!  Good point :-).

How about a seg_max_default property that is initialized to 254 for
modern machines and 126 to old machines?

Hmm, but wouldn't we achieve the same with more code changes?
254 is there because the queue size is 256. We are going to leave 128 - 2 for
older machine types
just to avoid breaking anything. All other seg_max adjustment is provided
by seg_max_adjust, which is "on" by default in modern machine types.


to summarize:

modern mt defaults:
seg_max_adjust = on
queue_size = 256

=> default seg_max = 254
=> changing queue-size will change seg_max = queue_size - 2

old mt defaults:
seg_max_adjust = off
queue_size = 128

=> default seg_max = 126
=> changing queue-size won't change seg_max, it's always = 126 like it 
was before


Denis
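
The summary above corresponds to a simple rule; a compact sketch of how seg_max
ends up being computed from those defaults (logic only, mirroring the expression
shown in the patches, not a copy of the QEMU code):

    /* seg_max as a function of the machine-type defaults summarized above */
    static unsigned int seg_max_for(int seg_max_adjust, unsigned int queue_size)
    {
        /* 2 descriptors are always reserved for request metadata */
        return seg_max_adjust ? queue_size - 2 : 128 - 2;
    }

    /* modern machine type: seg_max_for(1, 256) == 254
     * old machine type:    seg_max_for(0, 128) == 126 */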


Stefan





Re: [PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-13 Thread Denis Plotnikov




On 13.02.2020 14:45, Stefan Hajnoczi wrote:

On Thu, Feb 13, 2020 at 12:28:25PM +0300, Denis Plotnikov wrote:


On 13.02.2020 12:08, Stefan Hajnoczi wrote:

On Thu, Feb 13, 2020 at 11:08:35AM +0300, Denis Plotnikov wrote:

On 12.02.2020 18:43, Stefan Hajnoczi wrote:

On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote:

The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
hw/block/virtio-blk.c | 4 ++--
hw/core/machine.c | 2 ++
hw/scsi/virtio-scsi.c | 4 ++--
3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 09f46ed85f..6df3a7a6df 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
uint8_t *config)
memset(&blkcfg, 0, sizeof(blkcfg));
virtio_stq_p(vdev, &blkcfg.capacity, capacity);
virtio_stl_p(vdev, &blkcfg.seg_max,
- s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
+ s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2);

This value must not change on older machine types.

Yes, that's true, but ..

So does this patch
need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine
types get 126 instead of 254?

If we set seg-max-adjust "on" in older machine types, setups using them
with queue_size set to, for example, 1024 will also set seg_max to 1024
- 2, which isn't the expected behavior: older machine types didn't change seg_max in
that case and stuck with 128 - 2.
So should we, instead, leave the default of 128 - 2 for seg_max?

Argh!  Good point :-).

How about a seg_max_default property that is initialized to 254 for
modern machines and 126 to old machines?

Hmm, but wouldn't we achieve the same with more code changes?
254 is there because the queue size is 256. We are going to leave 128 - 2 for older machine
types
just to avoid breaking anything. All other seg_max adjustment is provided by
seg_max_adjust, which is "on" by default in modern machine types.

to summarize:

modern mt defaults:
seg_max_adjust = on
queue_size = 256

=> default seg_max = 254
=> changing queue-size will change seg_max = queue_size - 2

old mt defaults:
seg_max_adjust = off
queue_size = 128

=> default seg_max = 126
=> changing queue-size won't change seg_max, it's always = 126 like it was
before

You're right!  The only strange case is a modern machine type with
seg_max_adjust=off, where queue_size will be 256 but seg_max will be
126.  But no user would want to disable seg_max_adjust, so it's okay.

I agree with you that the line of code can remain unchanged:

   /*
* Only old machine types use seg_max_adjust=off and there the default
* value of queue_size is 128.
*/
   virtio_stl_p(vdev, &blkcfg.seg_max,
s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);

Stefan

OK, I'll resend the patch shortly.
Thanks!

Denis



[PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-13 Thread Denis Plotnikov
v2:
  * seg_max default value change removed

---
The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
 hw/block/virtio-blk.c | 2 +-
 hw/core/machine.c | 2 ++
 hw/scsi/virtio-scsi.c | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 09f46ed85f..142863a3b2 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = {
 DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
-DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
 DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 2501b540ec..3427d6cf4c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,8 @@
 #include "hw/mem/nvdimm.h"
 
 GlobalProperty hw_compat_4_2[] = {
+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
 { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
 { "virtio-blk-device", "seg-max-adjust", "off"},
 { "virtio-scsi-device", "seg_max_adjust", "off"},
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 3b61563609..472bbd233b 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp)
 static Property virtio_scsi_properties[] = {
 DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 
1),
 DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
- parent_obj.conf.virtqueue_size, 128),
+ parent_obj.conf.virtqueue_size, 256),
 DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
   parent_obj.conf.seg_max_adjust, true),
 DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors,
-- 
2.17.0




[PATCH v3] virtio: increase virtqueue size for virtio-scsi and virtio-blk

2020-02-13 Thread Denis Plotnikov
The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 

---

v3:
  * typos fixed

v2:
  * seg_max default value changing removed
---
 hw/block/virtio-blk.c | 2 +-
 hw/core/machine.c | 2 ++
 hw/scsi/virtio-scsi.c | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 09f46ed85f..142863a3b2 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = {
 DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
-DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
 DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 2501b540ec..3427d6cf4c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,8 @@
 #include "hw/mem/nvdimm.h"
 
 GlobalProperty hw_compat_4_2[] = {
+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
 { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
 { "virtio-blk-device", "seg-max-adjust", "off"},
 { "virtio-scsi-device", "seg_max_adjust", "off"},
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 3b61563609..472bbd233b 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp)
 static Property virtio_scsi_properties[] = {
 DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 
1),
 DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
- parent_obj.conf.virtqueue_size, 128),
+ parent_obj.conf.virtqueue_size, 256),
 DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
   parent_obj.conf.seg_max_adjust, true),
 DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors,
-- 
2.17.0




Re: [PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-18 Thread Denis Plotnikov




On 18.02.2020 16:53, Stefan Hajnoczi wrote:

On Thu, Feb 13, 2020 at 05:59:27PM +0300, Denis Plotnikov wrote:

v1:
   * seg_max default value changing removed

---
The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
  hw/block/virtio-blk.c | 2 +-
  hw/core/machine.c | 2 ++
  hw/scsi/virtio-scsi.c | 2 +-
  3 files changed, 4 insertions(+), 2 deletions(-)

I fixed up the "virtuqueue" typo in the commit message and the
mis-formatted commit description (git-am(1) stops including lines after
the first "---").
Actually, I sent the corrected version v3 of the patch last week. But it 
seems it got lost among that gigantic patch flow in the mailing list :)

Thanks for applying!

Denis


Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan





Re: [PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-18 Thread Denis Plotnikov




On 18.02.2020 16:59, Denis Plotnikov wrote:



On 18.02.2020 16:53, Stefan Hajnoczi wrote:

On Thu, Feb 13, 2020 at 05:59:27PM +0300, Denis Plotnikov wrote:

v1:
   * seg_max default value changing removed

---
The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max depends on the virtqueue size, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
  hw/block/virtio-blk.c | 2 +-
  hw/core/machine.c | 2 ++
  hw/scsi/virtio-scsi.c | 2 +-
  3 files changed, 4 insertions(+), 2 deletions(-)

I fixed up the "virtuqueue" typo in the commit message and the
mis-formatted commit description (git-am(1) stops including lines after
the first "---").
Actually, I sent the corrected version v3 of the patch last week. But 
it seems it got lost among that gigantic patch flow in the mailing 
list :)

Thanks for applying!

Denis


Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan
I'm going to send the test checking the virtqueue-sizes for machine 
types a little bit later.


Denis




Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839

2019-12-05 Thread Denis Plotnikov
Ping!

On 25.11.2019 12:16, Denis Plotnikov wrote:
>
>
> On 06.11.2019 15:03, Michael S. Tsirkin wrote:
>> On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote:
>>> On 10/24/19 12:28 AM, Michael S. Tsirkin wrote:
>>>> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote:
>>>>> From: "Denis V. Lunev" 
>>>>>
>>>>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg
>>>>> field reported by SCSI controler. Thus typical sequential read with
>>>>> 1 MB size results in the following pattern of the IO from the guest:
>>>>>    8,16   1    15754 2.766095122  2071  D   R 2095104 + 1008 [dd]
>>>>>    8,16   1    15755 2.766108785  2071  D   R 2096112 + 1008 [dd]
>>>>>    8,16   1    15756 2.766113486  2071  D   R 2097120 + 32 [dd]
>>>>>    8,16   1    15757 2.767668961 0  C   R 2095104 + 1008 [0]
>>>>>    8,16   1    15758 2.768534315 0  C   R 2096112 + 1008 [0]
>>>>>    8,16   1    15759 2.768539782 0  C   R 2097120 + 32 [0]
>>>>> The IO was generated by
>>>>>    dd if=/dev/sda of=/dev/null bs=1024 iflag=direct
>>>>>
>>>>> This effectively means that on rotational disks we will observe 3 
>>>>> IOPS
>>>>> for each 2 MBs processed. This definitely negatively affects both
>>>>> guest and host IO performance.
>>>>>
>>>>> The cure is relatively simple - we should report lengthy 
>>>>> scatter-gather
>>>>> ability of the SCSI controller. Fortunately the situation here is 
>>>>> very
>>>>> good. VirtIO transport layer can accomodate 1024 items in one request
>>>>> while we are using only 128. This situation is present since almost
>>>>> very beginning. 2 items are dedicated for request metadata thus we
>>>>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg.
>>>>>
>>>>> The following pattern is observed after the patch:
>>>>>    8,16   1 9921 2.662721340  2063  D   R 2095104 + 1024 [dd]
>>>>>    8,16   1 9922 2.662737585  2063  D   R 2096128 + 1024 [dd]
>>>>>    8,16   1 9923 2.665188167 0  C   R 2095104 + 1024 [0]
>>>>>    8,16   1 9924 2.665198777 0  C   R 2096128 + 1024 [0]
>>>>> which is much better.
>>>>>
>>>>> The dark side of this patch is that we are tweaking guest visible
>>>>> parameter, though this should be relatively safe as above transport
>>>>> layer support is present in QEMU/host Linux for a very long time.
>>>>> The patch adds configurable property for VirtIO SCSI with a new 
>>>>> default
>>>>> and hardcode option for VirtBlock which does not provide good
>>>>> configurable framework.
>>>>>
>>>>> Unfortunately the commit can not be applied as is. For the real 
>>>>> cure we
>>>>> need guest to be fixed to accomodate that queue length, which is done
>>>>> only in the latest 4.14 kernel. Thus we are going to expose the 
>>>>> property
>>>>> and tweak it on machine type level.
>>>>>
>>>>> The problem with the old kernels is that they have
>>>>> max_segments <= virtqueue_size restriction which cause the guest
>>>>> crashing in the case of violation.
>>>> This isn't just in the guests: virtio spec also seems to imply this,
>>>> or at least be vague on this point.
>>>>
>>>> So I think it'll need a feature bit.
>>>> Doing that in a safe way will also allow being compatible with old 
>>>> guests.
>>>>
>>>> The only downside is it's a bit more work as we need to
>>>> spec this out and add guest support.
>>>>
>>>>> To fix the case described above in the old kernels we can increase
>>>>> virtqueue_size to 256 and max_segments to 254. The pitfall here is
>>>>> that seabios allows the virtqueue_size-s < 128, however, the seabios
>>>>> patch extending that value to 256 is pending.
>>>> And the fix here is just to limit large vq size to virtio 1.0.
>>>> In that mode it's fine I think:
>>>>
>>>>
>>>>     /* check if the queue is available */
>>>>     if (vp->use_modern) {
>>>>     num 

[PING]Re: [PATCH v0 2/2] block: allow to set 'drive' property on a realized block device

2019-12-12 Thread Denis Plotnikov


On 18.11.2019 13:50, Denis Plotnikov wrote:
>
>
> On 10.11.2019 22:08, Denis Plotnikov wrote:
>>
>> On 10.11.2019 22:03, Denis Plotnikov wrote:
>>> This allows changing (replacing) the file on a block device and is
>>> useful
>>> to work around exclusive file access restrictions, e.g. to implement VM
>>> migration with a shared disk stored on some storage with an exclusive
>>> file opening model: a destination VM is started waiting for incoming
>>> migration with a fake image drive, and later, in the last migration
>>> phase, the fake image file is replaced with the real one.
>>>
>>> Signed-off-by: Denis Plotnikov 
>>> ---
>>>   hw/core/qdev-properties-system.c | 89 
>>> +++-
>>>   1 file changed, 77 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/hw/core/qdev-properties-system.c 
>>> b/hw/core/qdev-properties-system.c
>>> index c534590dcd..aaab1370a4 100644
>>> --- a/hw/core/qdev-properties-system.c
>>> +++ b/hw/core/qdev-properties-system.c
>>> @@ -79,8 +79,55 @@ static void set_pointer(Object *obj, Visitor *v, 
>>> Property *prop,
>>>     /* --- drive --- */
>>>   -static void do_parse_drive(DeviceState *dev, const char *str, 
>>> void **ptr,
>>> -   const char *propname, bool iothread, 
>>> Error **errp)
>>> +static void do_parse_drive_realized(DeviceState *dev, const char *str,
>>> +    void **ptr, const char *propname,
>>> +    bool iothread, Error **errp)
>>> +{
>>> +    BlockBackend *blk = *ptr;
>>> +    BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL);
>>> +    int ret;
>>> +    bool blk_created = false;
>>> +
>>> +    if (!bs) {
>>> +    error_setg(errp, "Can't find blockdev '%s'", str);
>>> +    return;
>>> +    }
>>> +
>>> +    if (!blk) {
>>> +    AioContext *ctx = iothread ? bdrv_get_aio_context(bs) :
>>> + qemu_get_aio_context();
>>> +    blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL);
>>> +    blk_created = true;
>>
>> Actually, I have concerns about the situation where blk == NULL.
>>
>> Is there any case where scsi-hd (or another device) doesn't have a blk
>> assigned and that's legal?
>>
>>> +    } else {
>>> +    if (blk_bs(blk)) {
>>> +    blk_remove_bs(blk);
>>> +    }
>>> +    }
>>> +
>>> +    ret = blk_insert_bs(blk, bs, errp);
>>> +
>>> +    if (!ret && blk_created) {
>>> +    if (blk_attach_dev(blk, dev) < 0) {
>>> +    /*
>>> + * Shouldn't be any errors here since we just created
>>> + * the new blk because the device doesn't have any.
>>> + * Leave the message here in case blk_attach_dev is 
>>> changed
>>> + */
>>> + error_setg(errp, "Can't attach drive '%s' to device 
>>> '%s'",
>>> +    str, object_get_typename(OBJECT(dev)));
>>> +    } else {
>>> +    *ptr = blk;
>>> +    }
>>> +    }
> Another problem here, is that the "size" of the device dev may not 
> match after setting a drive.
> So, we should update it after the drive setting.
> It was found, that it could be done by calling 
> BlockDevOps.bdrv_parent_cb_resize.
>
> But I have some concerns about doing it so. In the case of virtio scsi 
> disk we have the following callstack
>
>     bdrv_parent_cb_resize calls() ->
>     scsi_device_report_change(dev, SENSE_CODE(CAPACITY_CHANGED)) ->
>             virtio_scsi_change ->
>     virtio_scsi_push_event(s, dev, 
> VIRTIO_SCSI_T_PARAM_CHANGE,
>                             sense.asc | 
> (sense.ascq << 8));
>
>
> virtio_scsi_change  pushes the event to the guest to make the guest 
> ask for size refreshing.
> If I'm not mistaken, here we can get a race condition when some 
> another request is processed with an unchanged
> size and then the size changing request is processed.
>
> I didn't find a better way to update device size so any comments are 
> welcome.
>
> Thanks!
>
> Denis
>>> +
>>> +    if (blk_created) {
>>> +    blk_unref(blk);
>&

[PING] [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839

2019-12-13 Thread Denis Plotnikov


On 05.12.2019 10:59, Denis Plotnikov wrote:
> Ping!
>
> On 25.11.2019 12:16, Denis Plotnikov wrote:
>>
>>
>> On 06.11.2019 15:03, Michael S. Tsirkin wrote:
>>> On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote:
>>>> On 10/24/19 12:28 AM, Michael S. Tsirkin wrote:
>>>>> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote:
>>>>>> From: "Denis V. Lunev" 
>>>>>>
>>>>>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg
>>>>>> field reported by SCSI controler. Thus typical sequential read with
>>>>>> 1 MB size results in the following pattern of the IO from the guest:
>>>>>>    8,16   1    15754 2.766095122  2071  D   R 2095104 + 1008 
>>>>>> [dd]
>>>>>>    8,16   1    15755 2.766108785  2071  D   R 2096112 + 1008 
>>>>>> [dd]
>>>>>>    8,16   1    15756 2.766113486  2071  D   R 2097120 + 32 [dd]
>>>>>>    8,16   1    15757 2.767668961 0  C   R 2095104 + 1008 [0]
>>>>>>    8,16   1    15758 2.768534315 0  C   R 2096112 + 1008 [0]
>>>>>>    8,16   1    15759 2.768539782 0  C   R 2097120 + 32 [0]
>>>>>> The IO was generated by
>>>>>>    dd if=/dev/sda of=/dev/null bs=1024 iflag=direct
>>>>>>
>>>>>> This effectively means that on rotational disks we will observe 3 
>>>>>> IOPS
>>>>>> for each 2 MBs processed. This definitely negatively affects both
>>>>>> guest and host IO performance.
>>>>>>
>>>>>> The cure is relatively simple - we should report lengthy 
>>>>>> scatter-gather
>>>>>> ability of the SCSI controller. Fortunately the situation here is 
>>>>>> very
>>>>>> good. VirtIO transport layer can accomodate 1024 items in one 
>>>>>> request
>>>>>> while we are using only 128. This situation is present since almost
>>>>>> very beginning. 2 items are dedicated for request metadata thus we
>>>>>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg.
>>>>>>
>>>>>> The following pattern is observed after the patch:
>>>>>>    8,16   1 9921 2.662721340  2063  D   R 2095104 + 1024 
>>>>>> [dd]
>>>>>>    8,16   1 9922 2.662737585  2063  D   R 2096128 + 1024 
>>>>>> [dd]
>>>>>>    8,16   1 9923 2.665188167 0  C   R 2095104 + 1024 [0]
>>>>>>    8,16   1 9924 2.665198777 0  C   R 2096128 + 1024 [0]
>>>>>> which is much better.
>>>>>>
>>>>>> The dark side of this patch is that we are tweaking guest visible
>>>>>> parameter, though this should be relatively safe as above transport
>>>>>> layer support is present in QEMU/host Linux for a very long time.
>>>>>> The patch adds configurable property for VirtIO SCSI with a new 
>>>>>> default
>>>>>> and hardcode option for VirtBlock which does not provide good
>>>>>> configurable framework.
>>>>>>
>>>>>> Unfortunately the commit can not be applied as is. For the real 
>>>>>> cure we
>>>>>> need guest to be fixed to accomodate that queue length, which is 
>>>>>> done
>>>>>> only in the latest 4.14 kernel. Thus we are going to expose the 
>>>>>> property
>>>>>> and tweak it on machine type level.
>>>>>>
>>>>>> The problem with the old kernels is that they have
>>>>>> max_segments <= virtqueue_size restriction which cause the guest
>>>>>> crashing in the case of violation.
>>>>> This isn't just in the guests: virtio spec also seems to imply this,
>>>>> or at least be vague on this point.
>>>>>
>>>>> So I think it'll need a feature bit.
>>>>> Doing that in a safe way will also allow being compatible with old 
>>>>> guests.
>>>>>
>>>>> The only downside is it's a bit more work as we need to
>>>>> spec this out and add guest support.
>>>>>
>>>>>> To fix the case described above in the old kernels we can increase
>>>>>> virtqueue_size to 256 an

[PATCH v4 0/2] virtio: make seg_max virtqueue size dependent

2019-12-16 Thread Denis Plotnikov
v4:
  * rebased on 4.2 [MST]

v3:
  * add property to set in machine type [MST]
  * add min queue size check [Stefan]
  * add avocado based test [Max, Stefan, Eduardo, Cleber]

v2:
  * the standalone patch to make seg_max virtqueue size dependent
  * other patches are postponed

v1:
  the initial series

Denis Plotnikov (2):
  virtio: make seg_max virtqueue size dependent
  tests: add virtio-scsi and virtio-blk seg_max_adjust test

 hw/block/virtio-blk.c |   9 +-
 hw/core/machine.c |   3 +
 hw/scsi/vhost-scsi.c  |   2 +
 hw/scsi/virtio-scsi.c |  10 +-
 include/hw/virtio/virtio-blk.h|   1 +
 include/hw/virtio/virtio-scsi.h   |   1 +
 tests/acceptance/virtio_seg_max_adjust.py | 135 ++
 7 files changed, 159 insertions(+), 2 deletions(-)
 create mode 100755 tests/acceptance/virtio_seg_max_adjust.py

-- 
2.17.0




[PATCH v4 2/2] tests: add virtio-scsi and virtio-blk seg_max_adjust test

2019-12-16 Thread Denis Plotnikov
It tests proper seg_max_adjust settings for all machine types except
'none', 'isapc', 'microvm'
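
The test can be run locally with something like (the exact avocado invocation
may differ depending on the local setup):

  avocado run tests/acceptance/virtio_seg_max_adjust.py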

Signed-off-by: Denis Plotnikov 
---
 tests/acceptance/virtio_seg_max_adjust.py | 135 ++
 1 file changed, 135 insertions(+)
 create mode 100755 tests/acceptance/virtio_seg_max_adjust.py

diff --git a/tests/acceptance/virtio_seg_max_adjust.py 
b/tests/acceptance/virtio_seg_max_adjust.py
new file mode 100755
index 00..00cf2565d9
--- /dev/null
+++ b/tests/acceptance/virtio_seg_max_adjust.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+#
+# Test virtio-scsi and virtio-blk queue settings for all machine types
+#
+# Copyright (c) 2019 Virtuozzo International GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import sys
+import os
+import re
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'python'))
+from qemu.machine import QEMUMachine
+from avocado_qemu import Test
+
+#list of machine types and virtqueue properties to test
+VIRTIO_SCSI_PROPS = {'seg_max_adjust': 'seg_max_adjust'}
+VIRTIO_BLK_PROPS = {'seg_max_adjust': 'seg-max-adjust'}
+
+DEV_TYPES = {'virtio-scsi-pci': VIRTIO_SCSI_PROPS,
+ 'virtio-blk-pci': VIRTIO_BLK_PROPS}
+
+VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 'virtio-scsi-pci,id=scsi0'],
+ 'virtio-blk-pci': ['-device',
+'virtio-blk-pci,id=scsi0,drive=drive0',
+'-drive',
+'driver=null-co,id=drive0,if=none']}
+
+
+class VirtioMaxSegSettingsCheck(Test):
+@staticmethod
+def make_pattern(props):
+pattern_items = ['{0} = \w+'.format(prop) for prop in props]
+return '|'.join(pattern_items)
+
+def query_virtqueue(self, vm, dev_type_name):
+query_ok = False
+error = None
+props = None
+
+output = vm.command('human-monitor-command',
+command_line = 'info qtree')
+props_list = DEV_TYPES[dev_type_name].values();
+pattern = self.make_pattern(props_list)
+res = re.findall(pattern, output)
+
+if len(res) != len(props_list):
+props_list = set(props_list)
+res = set(res)
+not_found = props_list.difference(res)
+not_found = ', '.join(not_found)
+error = '({0}): The following properties not found: {1}'\
+ .format(dev_type_name, not_found)
+else:
+query_ok = True
+props = dict()
+for prop in res:
+p = prop.split(' = ')
+props[p[0]] = p[1]
+return query_ok, props, error
+
+def check_mt(self, mt, dev_type_name):
+with QEMUMachine(self.qemu_bin) as vm:
+vm.set_machine(mt["name"])
+for s in VM_DEV_PARAMS[dev_type_name]:
+vm.add_args(s)
+vm.launch()
+query_ok, props, error = self.query_virtqueue(vm, dev_type_name)
+
+if not query_ok:
+self.fail('machine type {0}: {1}'.format(mt['name'], error))
+
+for prop_name, prop_val in props.items():
+expected_val = mt[prop_name]
+self.assertEqual(expected_val, prop_val)
+
+@staticmethod
+def seg_max_adjust_enabled(mt):
+# machine types > 4.2 should have seg_max_adjust = true
+# others seg_max_adjust = false
+mt = mt.split("-")
+
+# machine types with one line name and name like pc-x.x
+if len(mt) <= 2:
+return False
+
+# machine types like pc--x.x[.x]
+ver = mt[2]
+ver = ver.split(".");
+
+# all versions greater than 4.2 goes with seg_max_adjust enabled
+major = int(ver[0])
+minor = int(ver[1])
+
+if major > 4 or (major == 4 and minor > 2):
+return True
+return False
+
+def test_machine_types(self):
+# collect all machine types except 'none', 'isapc', 'microvm'
+with QEMUMachine(self.qemu_

[PATCH v4 1/2] virtio: make seg_max virtqueue size dependent

2019-12-16 Thread Denis Plotnikov
Before this patch, the seg_max parameter was immutable and hardcoded
to 126 (128 - 2) regardless of the queue size. This has two negative effects:

1. when the queue size is < 128, we violate the Virtio 1.1 specification:
   (2.6.5.3.1 Driver Requirements) seg_max must be <= queue_size.
   This violation affects old Linux guests (ver < 4.14): such guests
   crash on these queue_size setups.

2. when queue_size > 128, as was pointed out by Denis Lunev,
   seg_max restricts the guest's block request length, which hurts guest
   performance by making the guest issue more block requests than needed.
   https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

To mitigate these two effects, the patch adds a property that adjusts seg_max
to the queue size automatically. Since seg_max is a guest-visible parameter,
the property is machine-type manageable and allows choosing between the
old (seg_max = 126 always) and new (seg_max = queue_size - 2) behaviors.

To keep the behavior of older VMs unchanged, the default seg_max_adjust
value is not set for older machine types.
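
For illustration (the command line below is just an example, not part of the
patch):

  -device virtio-scsi-pci,virtqueue_size=256,seg_max_adjust=on

makes the guest see seg_max = 254 (i.e. virtqueue_size - 2), while
seg_max_adjust=off keeps the legacy value of 126 regardless of the
virtqueue size.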

Signed-off-by: Denis Plotnikov 
---
 hw/block/virtio-blk.c   |  9 -
 hw/core/machine.c   |  3 +++
 hw/scsi/vhost-scsi.c|  2 ++
 hw/scsi/virtio-scsi.c   | 10 +-
 include/hw/virtio/virtio-blk.h  |  1 +
 include/hw/virtio/virtio-scsi.h |  1 +
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index d62e6377c2..0f6f8113b7 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -908,7 +908,8 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
uint8_t *config)
 blk_get_geometry(s->blk, &capacity);
 memset(&blkcfg, 0, sizeof(blkcfg));
 virtio_stq_p(vdev, &blkcfg.capacity, capacity);
-virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
+virtio_stl_p(vdev, &blkcfg.seg_max,
+ s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
 virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
 virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
 virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
@@ -1133,6 +1134,11 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 error_setg(errp, "num-queues property must be larger than 0");
 return;
 }
+if (conf->queue_size <= 2) {
+error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
+   "must be > 2", conf->queue_size);
+return;
+}
 if (!is_power_of_2(conf->queue_size) ||
 conf->queue_size > VIRTQUEUE_MAX_SIZE) {
 error_setg(errp, "invalid queue-size property (%" PRIu16 "), "
@@ -1262,6 +1268,7 @@ static Property virtio_blk_properties[] = {
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
 DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
+DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
 DEFINE_PROP_BIT64("discard", VirtIOBlock, host_features,
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 023548b4f3..bfa320387e 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -29,6 +29,9 @@
 
 GlobalProperty hw_compat_4_2[] = {
 { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
+{ "virtio-blk-device", "seg-max-adjust", "off"},
+{ "virtio-scsi-device", "seg_max_adjust", "off"},
+{ "vhost-blk-device", "seg_max_adjust", "off"},
 };
 const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2);
 
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index c693fc748a..26f710d3ec 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -275,6 +275,8 @@ static Property vhost_scsi_properties[] = {
 DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1),
 DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size,
128),
+DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSICommon, conf.seg_max_adjust,
+  true),
 DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors,
0x),
 DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSICommon, conf.cmd_per_lun, 128),
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index e8b2b64d09..405cb6c953 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -654,7 +654,8 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
 

Re: [PATCH v0 2/2] block: allow to set 'drive' property on a realized block device

2019-12-16 Thread Denis Plotnikov


On 13.12.2019 13:32, Kevin Wolf wrote:
> Am 18.11.2019 um 11:50 hat Denis Plotnikov geschrieben:
>>
>> On 10.11.2019 22:08, Denis Plotnikov wrote:
>>> On 10.11.2019 22:03, Denis Plotnikov wrote:
>>>> This allows to change (replace) the file on a block device and is useful
>>>> to workaround exclusive file access restrictions, e.g. to implement VM
>>>> migration with a shared disk stored on some storage with the exclusive
>>>> file opening model: a destination VM is started waiting for incomming
>>>> migration with a fake image drive, and later, on the last migration
>>>> phase, the fake image file is replaced with the real one.
>>>>
>>>> Signed-off-by: Denis Plotnikov 
>>>> ---
>>>>    hw/core/qdev-properties-system.c | 89 +++-
>>>>    1 file changed, 77 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/hw/core/qdev-properties-system.c
>>>> b/hw/core/qdev-properties-system.c
>>>> index c534590dcd..aaab1370a4 100644
>>>> --- a/hw/core/qdev-properties-system.c
>>>> +++ b/hw/core/qdev-properties-system.c
>>>> @@ -79,8 +79,55 @@ static void set_pointer(Object *obj, Visitor *v,
>>>> Property *prop,
>>>>      /* --- drive --- */
>>>>    -static void do_parse_drive(DeviceState *dev, const char *str, void
>>>> **ptr,
>>>> -   const char *propname, bool iothread,
>>>> Error **errp)
>>>> +static void do_parse_drive_realized(DeviceState *dev, const char *str,
>>>> +    void **ptr, const char *propname,
>>>> +    bool iothread, Error **errp)
>>>> +{
>>>> +    BlockBackend *blk = *ptr;
>>>> +    BlockDriverState *bs = bdrv_lookup_bs(NULL, str, NULL);
>>>> +    int ret;
>>>> +    bool blk_created = false;
>>>> +
>>>> +    if (!bs) {
>>>> +    error_setg(errp, "Can't find blockdev '%s'", str);
>>>> +    return;
>>>> +    }
>>>> +
>>>> +    if (!blk) {
>>>> +    AioContext *ctx = iothread ? bdrv_get_aio_context(bs) :
>>>> + qemu_get_aio_context();
>>>> +    blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL);
>>>> +    blk_created = true;
>>> Actually, I have concerns about situation where blk=null.
>>>
>>> Is there any case when scsi-hd (or others) doesn't have a blk assigned
>>> and it's legal?
> No, block devices will always have a BlockBackend, even if it doesn't
> have a root node inserted.
>
>>>> +    } else {
>>>> +    if (blk_bs(blk)) {
>>>> +    blk_remove_bs(blk);
>>>> +    }
>>>> +    }
>>>> +
>>>> +    ret = blk_insert_bs(blk, bs, errp);
>>>> +
>>>> +    if (!ret && blk_created) {
>>>> +    if (blk_attach_dev(blk, dev) < 0) {
>>>> +    /*
>>>> + * Shouldn't be any errors here since we just created
>>>> + * the new blk because the device doesn't have any.
>>>> + * Leave the message here in case blk_attach_dev is changed
>>>> + */
>>>> + error_setg(errp, "Can't attach drive '%s' to device '%s'",
>>>> +    str, object_get_typename(OBJECT(dev)));
>>>> +    } else {
>>>> +    *ptr = blk;
>>>> +    }
>>>> +    }
>> Another problem here, is that the "size" of the device dev may not match
>> after setting a drive.
>> So, we should update it after the drive setting.
>> It was found, that it could be done by calling
>> BlockDevOps.bdrv_parent_cb_resize.
>>
>> But I have some concerns about doing it so. In the case of virtio scsi
>> disk we have the following callstack
>>
>>       bdrv_parent_cb_resize calls() ->
>>       scsi_device_report_change(dev, SENSE_CODE(CAPACITY_CHANGED)) ->
>>               virtio_scsi_change ->
>>       virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_PARAM_CHANGE,
>>                               sense.asc |
>> (sense.ascq << 8));
> I think the safest option for now (and which should solve the case you
> want to address) is checking whether old and new size match and
> returning an error otherwise.
>
>> virtio_scsi_change  pushes the event to the guest to make the guest
>> ask for size refreshing.  If I'm not mistaken, here we can get a race
>> condition when some another request is processed with an unchanged
>> size and then the size changing request is processed.
> I think this is actually a problem even without resizing: We need to
> quiesce the device between removing the old root and inserting the new
> one. They way to achieve this is probably by splitting blk_drain() into
> a blk_drain_begin()/end() and then draining the BlockBackend here while
> we're working on it.
>
> Kevin
Why don't we use bdrv_drained_begin/end directly? This is what blk_drain
does anyway.
If we want to split blk_drain, we must keep track of whether the blk's bdrv
has changed; otherwise we can end up calling drain_begin on one bdrv and
drain_end on another if we do remove/insert in between.
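
To illustrate, a rough sketch of that ordering (error handling omitted;
blk, bs and errp are as in do_parse_drive_realized above):

    BlockDriverState *old_bs = blk_bs(blk);
    int ret;

    bdrv_ref(old_bs);               /* keep the old root alive across the swap */
    bdrv_drained_begin(old_bs);
    bdrv_drained_begin(bs);         /* bs is the new root looked up from @str */

    blk_remove_bs(blk);
    ret = blk_insert_bs(blk, bs, errp);

    bdrv_drained_end(bs);
    bdrv_drained_end(old_bs);
    bdrv_unref(old_bs);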

Another thing: should we really care about this if the VM is stopped and
the sizes match?

Denis
>



Re: [PATCH v0 2/2] block: allow to set 'drive' property on a realized block device

2019-12-16 Thread Denis Plotnikov


On 16.12.2019 18:38, Kevin Wolf wrote:
> Am 16.12.2019 um 15:51 hat Denis Plotnikov geschrieben:
>> On 13.12.2019 13:32, Kevin Wolf wrote:
>>> Am 18.11.2019 um 11:50 hat Denis Plotnikov geschrieben:
>>>> Another problem here, is that the "size" of the device dev may not match
>>>> after setting a drive.
>>>> So, we should update it after the drive setting.
>>>> It was found, that it could be done by calling
>>>> BlockDevOps.bdrv_parent_cb_resize.
>>>>
>>>> But I have some concerns about doing it so. In the case of virtio scsi
>>>> disk we have the following callstack
>>>>
>>>>    bdrv_parent_cb_resize calls() ->
>>>>    scsi_device_report_change(dev, SENSE_CODE(CAPACITY_CHANGED)) ->
>>>>            virtio_scsi_change ->
>>>>    virtio_scsi_push_event(s, dev, 
>>>> VIRTIO_SCSI_T_PARAM_CHANGE,
>>>>                            sense.asc |
>>>> (sense.ascq << 8));
>>> I think the safest option for now (and which should solve the case you
>>> want to address) is checking whether old and new size match and
>>> returning an error otherwise.
>>>
>>>> virtio_scsi_change  pushes the event to the guest to make the guest
>>>> ask for size refreshing.  If I'm not mistaken, here we can get a race
>>>> condition when some another request is processed with an unchanged
>>>> size and then the size changing request is processed.
>>> I think this is actually a problem even without resizing: We need to
>>> quiesce the device between removing the old root and inserting the new
>>> one. They way to achieve this is probably by splitting blk_drain() into
>>> a blk_drain_begin()/end() and then draining the BlockBackend here while
>>> we're working on it.
>>>
>>> Kevin
>> Why don't we use bdrv_drained_begin/end directly? This is what
>> blk_drain does.
>> If we want to split blk_drain we must keep track if blk's brdv isn't
>> change otherwise we can end up with drain_begin one and drain end
>> another bdrv if we do remove/insert in between.
> Hmm, true, we would have to keep track of draining at the BlockBackend
> level and consider it in blk_remove_bs() and blk_insert_bs(). Maybe
> that's not worth it.
>
> If we use bdrv_drained_begin/end directly, I think we need to drain both
> the old and the new root node during the process.
>
>> Another thing is should we really care about this if we have VM
>> stopped and the sizes matched?
> How do we know that the VM is stopped? And why would we require this?
I implied the scenario of VM migration over shared storage with an
exclusive file access model.
The VM is stopped during the drive-changing phase.

If there is no need to require it, then OK.

Denis
> Your patch doesn't implement or at least check this, and it seems a bit
> impractical for example when all you want is inserting a filter node.
>
> Kevin




Re: [PATCH v0 1/2] qdev-properties-system: extend set_pionter for unrealized devices

2019-11-22 Thread Denis Plotnikov



On 18.11.2019 21:54, Eduardo Habkost wrote:
> On Sun, Nov 10, 2019 at 10:03:09PM +0300, Denis Plotnikov wrote:
>> Some device's property can be changed if the device has been already
>> realized. For example, it could be "drive" property of a scsi disk device.
>>
>> So far, set_pointer could operate only on a relized device. The patch
>> extends its interface for operation on an unrealized device.
>>
>> Signed-off-by: Denis Plotnikov 
>> ---
>>   hw/core/qdev-properties-system.c | 32 +---
>>   1 file changed, 21 insertions(+), 11 deletions(-)
>>
>> diff --git a/hw/core/qdev-properties-system.c 
>> b/hw/core/qdev-properties-system.c
>> index ba412dd2ca..c534590dcd 100644
>> --- a/hw/core/qdev-properties-system.c
>> +++ b/hw/core/qdev-properties-system.c
>> @@ -38,9 +38,14 @@ static void get_pointer(Object *obj, Visitor *v, Property 
>> *prop,
>>   }
>>   
>>   static void set_pointer(Object *obj, Visitor *v, Property *prop,
>> -void (*parse)(DeviceState *dev, const char *str,
>> -  void **ptr, const char *propname,
>> -  Error **errp),
>> +void (*parse_realized)(DeviceState *dev,
>> +   const char *str, void **ptr,
>> +   const char *propname,
>> +   Error **errp),
>> +void (*parse_unrealized)(DeviceState *dev,
>> + const char *str, void 
>> **ptr,
>> + const char *propname,
>> + Error **errp),
>>   const char *name, Error **errp)
> Wouldn't it be simpler to just add a PropertyInfo::allow_set_after_realize
> bool field, and call the same setter function?  Then you can
> simply change do_parse_drive() to check if realized is true.
Maybe, but I thought it would be clearer to have a separate callback
for all the devices that support setting the property when realized.
Also, setting the "drive" property on a realized device differs a bit from
the non-realized case: in the realized case the setter function expects to
get a BlockDriverState only, while in the unrealized case the setter can
accept both a BlockBackend and a BlockDriverState. In addition, in the
unrealized case the setter function doesn't expect to see a device with an
empty BlockBackend.
I decided that extending do_parse_drive would make it harder to understand.
That's why I made two separate functions, one for each case.

I'd like to mention that I have a few concerns about
do_parse_drive_realized (please see the next patch of the series) and
I'd like them to be reviewed as well. After that, maybe it would be
better to go the way you suggested.
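
i.e. roughly this shape (a sketch only; do_parse_drive_realized is the helper
added in patch 2/2):

    static void do_parse_drive(DeviceState *dev, const char *str, void **ptr,
                               const char *propname, bool iothread, Error **errp)
    {
        if (dev->realized) {
            /* realized: only a node name is accepted, the existing blk is reused */
            do_parse_drive_realized(dev, str, ptr, propname, iothread, errp);
            return;
        }

        /* ... the existing unrealized path stays as it is ... */
    }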

Thanks for reviewing!
Denis

>
>>   {
>>   DeviceState *dev = DEVICE(obj);
>> @@ -48,11 +53,6 @@ static void set_pointer(Object *obj, Visitor *v, Property 
>> *prop,
>>   void **ptr = qdev_get_prop_ptr(dev, prop);
>>   char *str;
>>   
>> -if (dev->realized) {
>> -qdev_prop_set_after_realize(dev, name, errp);
>> -return;
>> -}
>> -
>>   visit_type_str(v, name, &str, &local_err);
>>   if (local_err) {
>>   error_propagate(errp, local_err);
>> @@ -63,7 +63,17 @@ static void set_pointer(Object *obj, Visitor *v, Property 
>> *prop,
>>   *ptr = NULL;
>>   return;
>>   }
>> -parse(dev, str, ptr, prop->name, errp);
>> +
>> +if (dev->realized) {
>> +if (parse_realized) {
>> +parse_realized(dev, str, ptr, prop->name, errp);
>> +} else {
>> +qdev_prop_set_after_realize(dev, name, errp);
>> +}
>> +} else {
>> +parse_unrealized(dev, str, ptr, prop->name, errp);
>> +}
>> +
>>   g_free(str);
>>   }
>>   
>> @@ -178,13 +188,13 @@ static void get_drive(Object *obj, Visitor *v, const 
>> char *name, void *opaque,
>>   static void set_drive(Object *obj, Visitor *v, const char *name, void 
>> *opaque,
>> Error **errp)
>>   {
>> -set_pointer(obj, v, opaque, parse_drive, name, errp);
>> +set_pointer(obj, v, opaque, NULL, parse_drive, name, errp);
>>   }
>>   
>>   static void set_drive_iothread(Object *obj, Visitor *v, const char *name,
>>  void *opaque, Error **errp)
>>   {
>> -set_pointer(obj, v, opaque, parse_drive_iothread, name, errp);
>> +set_pointer(obj, v, opaque, NULL, parse_drive_iothread, name, errp);
>>   }
>>   
>>   const PropertyInfo qdev_prop_drive = {
>> -- 
>> 2.17.0
>>




Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839

2019-11-25 Thread Denis Plotnikov


On 06.11.2019 15:03, Michael S. Tsirkin wrote:
> On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote:
>> On 10/24/19 12:28 AM, Michael S. Tsirkin wrote:
>>> On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote:
>>>> From: "Denis V. Lunev" 
>>>>
>>>> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg
>>>> field reported by SCSI controler. Thus typical sequential read with
>>>> 1 MB size results in the following pattern of the IO from the guest:
>>>>8,16   115754 2.766095122  2071  D   R 2095104 + 1008 [dd]
>>>>8,16   115755 2.766108785  2071  D   R 2096112 + 1008 [dd]
>>>>8,16   115756 2.766113486  2071  D   R 2097120 + 32 [dd]
>>>>8,16   115757 2.767668961 0  C   R 2095104 + 1008 [0]
>>>>8,16   115758 2.768534315 0  C   R 2096112 + 1008 [0]
>>>>8,16   115759 2.768539782 0  C   R 2097120 + 32 [0]
>>>> The IO was generated by
>>>>dd if=/dev/sda of=/dev/null bs=1024 iflag=direct
>>>>
>>>> This effectively means that on rotational disks we will observe 3 IOPS
>>>> for each 2 MBs processed. This definitely negatively affects both
>>>> guest and host IO performance.
>>>>
>>>> The cure is relatively simple - we should report lengthy scatter-gather
>>>> ability of the SCSI controller. Fortunately the situation here is very
>>>> good. VirtIO transport layer can accomodate 1024 items in one request
>>>> while we are using only 128. This situation is present since almost
>>>> very beginning. 2 items are dedicated for request metadata thus we
>>>> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg.
>>>>
>>>> The following pattern is observed after the patch:
>>>>8,16   1 9921 2.662721340  2063  D   R 2095104 + 1024 [dd]
>>>>8,16   1 9922 2.662737585  2063  D   R 2096128 + 1024 [dd]
>>>>8,16   1 9923 2.665188167 0  C   R 2095104 + 1024 [0]
>>>>8,16   1 9924 2.665198777 0  C   R 2096128 + 1024 [0]
>>>> which is much better.
>>>>
>>>> The dark side of this patch is that we are tweaking guest visible
>>>> parameter, though this should be relatively safe as above transport
>>>> layer support is present in QEMU/host Linux for a very long time.
>>>> The patch adds configurable property for VirtIO SCSI with a new default
>>>> and hardcode option for VirtBlock which does not provide good
>>>> configurable framework.
>>>>
>>>> Unfortunately the commit can not be applied as is. For the real cure we
>>>> need guest to be fixed to accomodate that queue length, which is done
>>>> only in the latest 4.14 kernel. Thus we are going to expose the property
>>>> and tweak it on machine type level.
>>>>
>>>> The problem with the old kernels is that they have
>>>> max_segments <= virtqueue_size restriction which cause the guest
>>>> crashing in the case of violation.
>>> This isn't just in the guests: virtio spec also seems to imply this,
>>> or at least be vague on this point.
>>>
>>> So I think it'll need a feature bit.
>>> Doing that in a safe way will also allow being compatible with old guests.
>>>
>>> The only downside is it's a bit more work as we need to
>>> spec this out and add guest support.
>>>
>>>> To fix the case described above in the old kernels we can increase
>>>> virtqueue_size to 256 and max_segments to 254. The pitfall here is
>>>> that seabios allows the virtqueue_size-s < 128, however, the seabios
>>>> patch extending that value to 256 is pending.
>>> And the fix here is just to limit large vq size to virtio 1.0.
>>> In that mode it's fine I think:
>>>
>>>
>>> /* check if the queue is available */
>>> if (vp->use_modern) {
>>> num = vp_read(&vp->common, virtio_pci_common_cfg, queue_size);
>>> if (num > MAX_QUEUE_NUM) {
>>> vp_write(&vp->common, virtio_pci_common_cfg, queue_size,
>>>  MAX_QUEUE_NUM);
>>> num = vp_read(&vp->common, virtio_pci_common_cfg, queue_size);
>>> }
>>> } else {
>>> num = vp_read(&vp->legacy, virtio_pci_legacy, queue_num);
>>>

Re: [PATCH v0 1/2] qdev-properties-system: extend set_pionter for unrealized devices

2019-11-25 Thread Denis Plotnikov


On 25.11.2019 18:30, Eduardo Habkost wrote:
> On Fri, Nov 22, 2019 at 11:36:30AM +0000, Denis Plotnikov wrote:
>>
>> On 18.11.2019 21:54, Eduardo Habkost wrote:
>>> On Sun, Nov 10, 2019 at 10:03:09PM +0300, Denis Plotnikov wrote:
>>>> Some device's property can be changed if the device has been already
>>>> realized. For example, it could be "drive" property of a scsi disk device.
>>>>
>>>> So far, set_pointer could operate only on a relized device. The patch
>>>> extends its interface for operation on an unrealized device.
>>>>
>>>> Signed-off-by: Denis Plotnikov 
>>>> ---
>>>>hw/core/qdev-properties-system.c | 32 +---
>>>>1 file changed, 21 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/hw/core/qdev-properties-system.c 
>>>> b/hw/core/qdev-properties-system.c
>>>> index ba412dd2ca..c534590dcd 100644
>>>> --- a/hw/core/qdev-properties-system.c
>>>> +++ b/hw/core/qdev-properties-system.c
>>>> @@ -38,9 +38,14 @@ static void get_pointer(Object *obj, Visitor *v, 
>>>> Property *prop,
>>>>}
>>>>
>>>>static void set_pointer(Object *obj, Visitor *v, Property *prop,
>>>> -void (*parse)(DeviceState *dev, const char *str,
>>>> -  void **ptr, const char *propname,
>>>> -  Error **errp),
>>>> +void (*parse_realized)(DeviceState *dev,
>>>> +   const char *str, void 
>>>> **ptr,
>>>> +   const char *propname,
>>>> +   Error **errp),
>>>> +void (*parse_unrealized)(DeviceState *dev,
>>>> + const char *str, void 
>>>> **ptr,
>>>> + const char *propname,
>>>> + Error **errp),
>>>>const char *name, Error **errp)
>>> Wouldn't it be simpler to just add a PropertyInfo::allow_set_after_realize
>>> bool field, and call the same setter function?  Then you can
>>> simply change do_parse_drive() to check if realized is true.
>> May be, but I thought It would be more clear to have a separate callback
>> for all the devices supporting the property setting when realized.
>> Also the "drive" property setting on realized and non-realized device a
>> little bit different: in the realized case the setter function expects
>> to get
>> BlockDriverState only, when in the unrealized case the setter can accept
>> both BlockBackend and BlockDriverState. Also, in the unrealized case the
>> setter function doesn't expect to have a device with an empty BlockBackend.
>> I decided that extending do_parse_drive would make it more complex for
>> understanding. That's why I made two separate functions for both cases.
> I understand you might want two separate functions in the
> specific case of drive.  You can still call different
> functions after checking dev->realized inside do_parse_drive().
>
> My point was that you don't need to make set_pointer() require
> two separate function pointers just to propagate 1 bit of
> information that is already available in DeviceState.  In patch
> 2/2 you had to create 4 different copies of parse_drive*()
> because of this.
Yes, that's true. I wanted to suggest a more general way of dealing with a
device in both the realized and non-realized states.
It may be too much and not necessary. Maybe we should wait for feedback
from the block maintainers?
>
>
>> I'd like to mention that I have a few concerns about
>> do_parse_drive_realized (please see the next patch from the series) and
>> I'd like them to be reviewed as well. After that, may be it would be
>> better to go the way you suggested.
> In the case if your questions in patch 2/2, I'm afraid I don't
> know the answers and we need help from the block maintainers.
Anyway, thanks for taking a glance.
>



[Qemu-devel] [PATCH v3 1/3] qcow2: introduce compression type feature

2019-08-19 Thread Denis Plotnikov
The patch adds the preparation parts of the incompatible compression type
feature to the QCOW2 header, indicating that *all* compressed clusters
must be (de)compressed using a certain compression type.

It is implied that the compression type is set at image creation time and
can only be changed later by image conversion; thus the compression type
defines the single compression algorithm used for the image.

The goal of the feature is to add support for other compression algorithms
to qcow2, for example ZSTD, which compresses more efficiently than ZLIB.
It works roughly 2x faster than ZLIB while providing a comparable compression
ratio, and therefore gives a performance advantage in backup scenarios.

The default compression is ZLIB. Images created with the ZLIB compression type
are backward compatible with older qemu versions.
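
For example, once the whole series is applied, the compression type can be
chosen at image creation time (with only this patch, zlib is the single
accepted value):

  qemu-img create -f qcow2 -o compression_type=zstd test.qcow2 4G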

Signed-off-by: Denis Plotnikov 
---
 block/qcow2.c | 94 +++
 block/qcow2.h | 26 ---
 docs/interop/qcow2.txt| 19 +++-
 include/block/block_int.h |  1 +
 qapi/block-core.json  | 22 -
 5 files changed, 152 insertions(+), 10 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 039bdc2f7e..4e07b7e9ec 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1197,6 +1197,32 @@ static int qcow2_update_options(BlockDriverState *bs, 
QDict *options,
 return ret;
 }
 
+static int check_compression_type(BDRVQcow2State *s, Error **errp)
+{
+switch (s->compression_type) {
+case QCOW2_COMPRESSION_TYPE_ZLIB:
+break;
+
+default:
+error_setg(errp, "qcow2: unknown compression type: %u",
+   s->compression_type);
+return -ENOTSUP;
+}
+
+/*
+ * if the compression type differs from QCOW2_COMPRESSION_TYPE_ZLIB
+ * the incompatible feature flag must be set
+ */
+
+if (s->compression_type != QCOW2_COMPRESSION_TYPE_ZLIB &&
+!(s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION_TYPE)) {
+error_setg(errp, "qcow2: Invalid compression type setting");
+return -EINVAL;
+}
+
+return 0;
+}
+
 /* Called with s->lock held.  */
 static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
   int flags, Error **errp)
@@ -1312,6 +1338,35 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
*bs, QDict *options,
 s->compatible_features  = header.compatible_features;
 s->autoclear_features   = header.autoclear_features;
 
+/*
+ * Handle compression type
+ * Older qcow2 images don't contain the compression type header.
+ * Distinguish them by the header length and use
+ * the only valid (default) compression type in that case
+ */
+if (header.header_length > offsetof(QCowHeader, compression_type)) {
+/* sanity check that we can read a compression type */
+size_t min_len = offsetof(QCowHeader, compression_type) +
+ sizeof(header.compression_type);
+if (header.header_length < min_len) {
+error_setg(errp,
+   "Could not read compression type, "
+   "qcow2 header is too short");
+ret = -EINVAL;
+goto fail;
+}
+
+header.compression_type = be32_to_cpu(header.compression_type);
+s->compression_type = header.compression_type;
+} else {
+s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
+}
+
+ret = check_compression_type(s, errp);
+if (ret) {
+goto fail;
+}
+
 if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
 void *feature_table = NULL;
 qcow2_read_extensions(bs, header.header_length, ext_end,
@@ -2516,6 +2571,12 @@ int qcow2_update_header(BlockDriverState *bs)
 total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
 refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
 
+ret = check_compression_type(s, NULL);
+
+if (ret) {
+goto fail;
+}
+
 *header = (QCowHeader) {
 /* Version 2 fields */
 .magic  = cpu_to_be32(QCOW_MAGIC),
@@ -2538,6 +2599,7 @@ int qcow2_update_header(BlockDriverState *bs)
 .autoclear_features = cpu_to_be64(s->autoclear_features),
 .refcount_order = cpu_to_be32(s->refcount_order),
 .header_length  = cpu_to_be32(header_length),
+.compression_type   = cpu_to_be32(s->compression_type),
 };
 
 /* For older versions, write a shorter header */
@@ -2635,6 +2697,11 @@ int qcow2_update_header(BlockDriverState *bs)
 .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
 .name = "lazy refcounts",
 },
+{
+.type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
+.bit  = QCOW2_INCOMPAT_C

[Qemu-devel] [PATCH v3 0/3] qcow2: add zstd cluster compression

2019-08-19 Thread Denis Plotnikov
v3:
* relax the compression type setting requirement when
  the compression type is not zlib [Eric, Kevin]
* add compression type values to the spec [Eric]
* fix wording in the spec and descriptions [Eric]
* fix functions descriptions [Max]
* fix zstd (de)compression functions flaws [Max]
* fix zstd related parts of configure file [Max]
  * rebased to v4.1.0-rc5 and changed the series version aiming to 4.2

v2:
* relax the compression type setting restriction in the spec
* fix qcow2 header size checking
* fix error processing and messaging
* fix qcow2 image specific info reporting
* set Qcow2CompressionType zstd config dependant
* add zstd compressed cluster format description to the spec

v1:
* extend qcow2 header instead of adding a new incompatible extension header
specification re-written accordingly
* enable zstd compression via config
* fix zstd (de)compression functions
* fix comments/description
* fix function naming

---
The goal of the patch-set is to enable qcow2 to use zstd compression for
clusters. ZSTD provides better (de)compression performance than the currently
used ZLIB. Using it will improve performance (reduce compression time)
when compressed clusters are used, e.g. in backup scenarios.

Also, the patch-set extends the qcow2 specification by adding a
compression_type feature. The feature enables adding ZSTD and other
compression algorithms in the future.

Here are some measurements of ZSTD vs ZLIB:

The test:
The test compresses and decompresses a qemu qcow2 image with a freshly
installed rhel-7.6 guest.
Image cluster size: 64K. Image on disk size: 2.2G

The test was conducted with a brd disk to reduce the influence
of the disk subsystem on the test results.
The results are given in seconds.

compress cmd:
  time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd]
  src.img [zlib|zstd]_compressed.img
decompress cmd:
  time ./qemu-img convert -O qcow2
  [zlib|zstd]_compressed.img uncompressed.img


The results:
           compression              decompression
         zlib    zstd             zlib    zstd

real     65.5    16.3 (-75 %)      1.9     1.6 (-16 %)
user     65.0    15.8              5.3     2.5
sys       3.3     0.2              2.0     2.0

Both ZLIB and ZSTD gave the same compression ratio: ~1.5
compressed image size in both cases: ~1.4G

Denis Plotnikov (3):
  qcow2: introduce compression type feature
  qcow2: rework the cluster compression routine
  qcow2: add zstd cluster compression

 block/qcow2-threads.c | 172 ++
 block/qcow2.c | 100 ++
 block/qcow2.h |  26 --
 configure |  34 
 docs/interop/qcow2.txt|  39 -
 include/block/block_int.h |   1 +
 qapi/block-core.json  |  23 -
 7 files changed, 371 insertions(+), 24 deletions(-)

-- 
2.17.0




[Qemu-devel] [PATCH v3 3/3] qcow2: add zstd cluster compression

2019-08-19 Thread Denis Plotnikov
zstd significantly reduces cluster compression time.
It provides better compression performance while maintaining
the same compression ratio as zlib, which, at the moment,
is the only compression method available.

The performance test results:
The test compresses and decompresses a qemu qcow2 image with a freshly
installed rhel-7.6 guest.
Image cluster size: 64K. Image on disk size: 2.2G

The test was conducted with a brd disk to reduce the influence
of the disk subsystem on the test results.
The results are given in seconds.

compress cmd:
  time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd]
  src.img [zlib|zstd]_compressed.img
decompress cmd:
  time ./qemu-img convert -O qcow2
  [zlib|zstd]_compressed.img uncompressed.img

           compression              decompression
         zlib    zstd             zlib    zstd

real     65.5    16.3 (-75 %)      1.9     1.6 (-16 %)
user     65.0    15.8              5.3     2.5
sys       3.3     0.2              2.0     2.0

Both ZLIB and ZSTD gave the same compression ratio: 1.57
compressed image size in both cases: 1.4G

Signed-off-by: Denis Plotnikov 
---
 block/qcow2-threads.c  | 94 ++
 block/qcow2.c  |  6 +++
 configure  | 34 +++
 docs/interop/qcow2.txt | 20 +
 qapi/block-core.json   |  3 +-
 5 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
index 14b5bd76fb..85d04e6c2e 100644
--- a/block/qcow2-threads.c
+++ b/block/qcow2-threads.c
@@ -28,6 +28,11 @@
 #define ZLIB_CONST
 #include 
 
+#ifdef CONFIG_ZSTD
+#include 
+#include 
+#endif
+
 #include "qcow2.h"
 #include "block/thread-pool.h"
 #include "crypto.h"
@@ -165,6 +170,85 @@ static ssize_t qcow2_zlib_decompress(void *dest, size_t 
dest_size,
 return ret;
 }
 
+#ifdef CONFIG_ZSTD
+/*
+ * qcow2_zstd_compress()
+ *
+ * Compress @src_size bytes of data using zstd compression method
+ *
+ * @dest - destination buffer, @dest_size bytes
+ * @src - source buffer, @src_size bytes
+ *
+ * Returns: compressed size on success
+ *  -ENOMEM destination buffer is not enough to store compressed data
+ *  -EIOon any other error
+ */
+
+static ssize_t qcow2_zstd_compress(void *dest, size_t dest_size,
+   const void *src, size_t src_size)
+{
+ssize_t ret;
+uint32_t *c_size = dest;
+/* steal some bytes to store compressed chunk size */
+char *d_buf = ((char *) dest) + sizeof(*c_size);
+
+if (dest_size < sizeof(*c_size)) {
+return -ENOMEM;
+}
+
+dest_size -= sizeof(*c_size);
+
+ret = ZSTD_compress(d_buf, dest_size, src, src_size, 5);
+
+if (ZSTD_isError(ret)) {
+if (ZSTD_getErrorCode(ret) == ZSTD_error_dstSize_tooSmall) {
+return -ENOMEM;
+} else {
+return -EIO;
+}
+}
+
+/* store the compressed chunk size in the very beginning of the buffer */
+*c_size = ret;
+
+return ret + sizeof(*c_size);
+}
+
+/*
+ * qcow2_zstd_decompress()
+ *
+ * Decompress some data (not more than @src_size bytes) to produce exactly
+ * @dest_size bytes using zstd compression method
+ *
+ * @dest - destination buffer, @dest_size bytes
+ * @src - source buffer, @src_size bytes
+ *
+ * Returns: 0 on success
+ *  -EIO on any error
+ */
+
+static ssize_t qcow2_zstd_decompress(void *dest, size_t dest_size,
+ const void *src, size_t src_size)
+{
+ssize_t ret;
+/*
+ * zstd decompress wants to know the exact length of the data
+ * for that purpose, on the compression the length is stored in
+ * the very beginning of the compressed buffer
+ */
+const uint32_t *s_size = src;
+const char *s_buf = ((const char *) src) + sizeof(*s_size);
+
+ret = ZSTD_decompress(dest, dest_size, s_buf, *s_size);
+
+if (ZSTD_isError(ret)) {
+return -EIO;
+}
+
+return 0;
+}
+#endif
+
 static int qcow2_compress_pool_func(void *opaque)
 {
 Qcow2CompressData *data = opaque;
@@ -216,6 +300,11 @@ qcow2_co_compress(BlockDriverState *bs, void *dest, size_t 
dest_size,
 fn = qcow2_zlib_compress;
 break;
 
+#ifdef CONFIG_ZSTD
+case QCOW2_COMPRESSION_TYPE_ZSTD:
+fn = qcow2_zstd_compress;
+break;
+#endif
 default:
 return -ENOTSUP;
 }
@@ -248,6 +337,11 @@ qcow2_co_decompress(BlockDriverState *bs, void *dest, 
size_t dest_size,
 fn = qcow2_zlib_decompress;
 break;
 
+#ifdef CONFIG_ZSTD
+case QCOW2_COMPRESSION_TYPE_ZSTD:
+fn = qcow2_zstd_decompress;
+break;
+#endif
 default:
 return -ENOTSUP;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index 4e07b7e9ec..dfb7b52033 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c

[Qemu-devel] [PATCH v3 2/3] qcow2: rework the cluster compression routine

2019-08-19 Thread Denis Plotnikov
The patch allows processing the image compression type defined
in the image header and choosing an appropriate method for
cluster (de)compression.

Signed-off-by: Denis Plotnikov 
---
 block/qcow2-threads.c | 78 +++
 1 file changed, 64 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
index 3b1e63fe41..14b5bd76fb 100644
--- a/block/qcow2-threads.c
+++ b/block/qcow2-threads.c
@@ -73,8 +73,11 @@ typedef struct Qcow2CompressData {
 Qcow2CompressFunc func;
 } Qcow2CompressData;
 
+
 /*
- * qcow2_compress()
+ * qcow2_zlib_compress()
+ *
+ * Compress @src_size bytes of data using zlib compression method
  *
  * @dest - destination buffer, @dest_size bytes
  * @src - source buffer, @src_size bytes
@@ -83,8 +86,8 @@ typedef struct Qcow2CompressData {
  *  -ENOMEM destination buffer is not enough to store compressed data
  *  -EIOon any other error
  */
-static ssize_t qcow2_compress(void *dest, size_t dest_size,
-  const void *src, size_t src_size)
+static ssize_t qcow2_zlib_compress(void *dest, size_t dest_size,
+   const void *src, size_t src_size)
 {
 ssize_t ret;
 z_stream strm;
@@ -119,19 +122,19 @@ static ssize_t qcow2_compress(void *dest, size_t 
dest_size,
 }
 
 /*
- * qcow2_decompress()
+ * qcow2_zlib_decompress()
  *
  * Decompress some data (not more than @src_size bytes) to produce exactly
- * @dest_size bytes.
+ * @dest_size bytes using zlib compression method
  *
  * @dest - destination buffer, @dest_size bytes
  * @src - source buffer, @src_size bytes
  *
  * Returns: 0 on success
- *  -1 on fail
+ *  -EIO on fail
  */
-static ssize_t qcow2_decompress(void *dest, size_t dest_size,
-const void *src, size_t src_size)
+static ssize_t qcow2_zlib_decompress(void *dest, size_t dest_size,
+ const void *src, size_t src_size)
 {
 int ret = 0;
 z_stream strm;
@@ -144,7 +147,7 @@ static ssize_t qcow2_decompress(void *dest, size_t 
dest_size,
 
 ret = inflateInit2(&strm, -12);
 if (ret != Z_OK) {
-return -1;
+return -EIO;
 }
 
 ret = inflate(&strm, Z_FINISH);
@@ -154,7 +157,7 @@ static ssize_t qcow2_decompress(void *dest, size_t 
dest_size,
  * @src buffer may be processed partly (because in qcow2 we know size 
of
  * compressed data with precision of one sector)
  */
-ret = -1;
+ret = -EIO;
 }
 
 inflateEnd(&strm);
@@ -189,20 +192,67 @@ qcow2_co_do_compress(BlockDriverState *bs, void *dest, 
size_t dest_size,
 return arg.ret;
 }
 
+/*
+ * qcow2_co_compress()
+ *
+ * Compress @src_size bytes of data using the compression
+ * method defined by the image compression type
+ *
+ * @dest - destination buffer, @dest_size bytes
+ * @src - source buffer, @src_size bytes
+ *
+ * Returns: 0 on success
+ *  a negative error code on fail
+ */
 ssize_t coroutine_fn
 qcow2_co_compress(BlockDriverState *bs, void *dest, size_t dest_size,
   const void *src, size_t src_size)
 {
-return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
-qcow2_compress);
+BDRVQcow2State *s = bs->opaque;
+Qcow2CompressFunc fn;
+
+switch (s->compression_type) {
+case QCOW2_COMPRESSION_TYPE_ZLIB:
+fn = qcow2_zlib_compress;
+break;
+
+default:
+return -ENOTSUP;
+}
+
+return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, fn);
 }
 
+/*
+ * qcow2_co_decompress()
+ *
+ * Decompress some data (not more than @src_size bytes) to produce exactly
+ * @dest_size bytes using the compression method defined by the image
+ * compression type
+ *
+ * @dest - destination buffer, @dest_size bytes
+ * @src - source buffer, @src_size bytes
+ *
+ * Returns: 0 on success
+ *  a negative error code on fail
+ */
 ssize_t coroutine_fn
 qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size,
 const void *src, size_t src_size)
 {
-return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
-qcow2_decompress);
+BDRVQcow2State *s = bs->opaque;
+Qcow2CompressFunc fn;
+
+switch (s->compression_type) {
+case QCOW2_COMPRESSION_TYPE_ZLIB:
+fn = qcow2_zlib_decompress;
+break;
+
+default:
+return -ENOTSUP;
+}
+
+return qcow2_co_do_compress(bs, dest, dest_size, src, src_size, fn);
 }
 
 
-- 
2.17.0




[Qemu-devel] [PATCH v9] qemu-io: add pattern file for write command

2019-08-19 Thread Denis Plotnikov
The patch allows providing a pattern file for the write
command. There was no similar ability before.
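
For example (file names are illustrative):

  qemu-io -c 'write -s /tmp/pattern.bin 0 1M' test.qcow2

fills the 1 MiB write buffer from /tmp/pattern.bin, repeating the file content
cyclically when the file is shorter than the request.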

Signed-off-by: Denis Plotnikov 
---
v9:
  * replace flag cast to int with bool [Eric]
  * fix the error message [Eric]
  * use qemu_io_free instead of qemu_vfree [Eric]
  * add function description [Eric]

v8: fix according to Max's comments
  * get rid of unnecessary buffer for the pattern
  * buffer allocation just in bytes
  * take into account the misalign offset
  * don't copy file name
  * changed char* to const char* in input params

v7:
  * fix variable naming
  * make code more readable
  * extend help for write command

v6:
  * the pattern file is read once to reduce io

v5:
  * file name initiated with null to make compilers happy

v4:
  * missing signed-off clause added

v3:
  * missing file closing added
  * exclusive flags processing changed
  * buffer void* converted to char* to fix pointer arithmetics
  * file reading error processing added
---
 qemu-io-cmds.c | 97 ++
 1 file changed, 91 insertions(+), 6 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 09750a23ce..f7bdfe673b 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -351,6 +351,77 @@ static void qemu_io_free(void *p)
 qemu_vfree(p);
 }
 
+/*
+ * qemu_io_alloc_from_file()
+ *
+ * Allocates the buffer and populates it with the content of the given file
+ * up to @len bytes. If the file length is less than @len, then the buffer
+ * is populated with the file content cyclically.
+ *
+ * @blk - the block backend where the buffer content is going to be written to
+ * @len - the buffer length
+ * @file_name - the file to copy the content from
+ *
+ * Returns: the buffer pointer on success
+ *  NULL on error
+ */
+static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len,
+ const char *file_name)
+{
+char *buf, *buf_origin;
+FILE *f = fopen(file_name, "r");
+int pattern_len;
+
+if (!f) {
+perror(file_name);
+return NULL;
+}
+
+if (qemuio_misalign) {
+len += MISALIGN_OFFSET;
+}
+
+buf_origin = buf = blk_blockalign(blk, len);
+
+if (qemuio_misalign) {
+buf_origin += MISALIGN_OFFSET;
+}
+
+pattern_len = fread(buf_origin, 1, len, f);
+
+if (ferror(f)) {
+perror(file_name);
+goto error;
+}
+
+if (pattern_len == 0) {
+fprintf(stderr, "%s: file is empty\n", file_name);
+goto error;
+}
+
+fclose(f);
+
+if (len > pattern_len) {
+len -= pattern_len;
+buf += pattern_len;
+
+while (len > 0) {
+size_t len_to_copy = MIN(pattern_len, len);
+
+memcpy(buf, buf_origin, len_to_copy);
+
+len -= len_to_copy;
+buf += len_to_copy;
+}
+}
+
+return buf_origin;
+
+error:
+qemu_io_free(buf_origin);
+return NULL;
+}
+
 static void dump_buffer(const void *buffer, int64_t offset, int64_t len)
 {
 uint64_t i;
@@ -949,6 +1020,7 @@ static void write_help(void)
 " -n, -- with -z, don't allow slow fallback\n"
 " -p, -- ignored for backwards compatibility\n"
 " -P, -- use different pattern to fill file\n"
+" -s, -- use a pattern file to fill the write buffer\n"
 " -C, -- report statistics in a machine parsable format\n"
 " -q, -- quiet mode, do not show I/O statistics\n"
 " -u, -- with -z, allow unmapping\n"
@@ -965,7 +1037,7 @@ static const cmdinfo_t write_cmd = {
 .perm   = BLK_PERM_WRITE,
 .argmin = 2,
 .argmax = -1,
-.args   = "[-bcCfnquz] [-P pattern] off len",
+.args   = "[-bcCfnquz] [-P pattern | -s source_file] off len",
 .oneline= "writes a number of bytes at a specified offset",
 .help   = write_help,
 };
@@ -974,7 +1046,7 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 {
 struct timeval t1, t2;
 bool Cflag = false, qflag = false, bflag = false;
-bool Pflag = false, zflag = false, cflag = false;
+bool Pflag = false, zflag = false, cflag = false, sflag = false;
 int flags = 0;
 int c, cnt, ret;
 char *buf = NULL;
@@ -983,8 +1055,9 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 /* Some compilers get confused and warn if this is not initialized.  */
 int64_t total = 0;
 int pattern = 0xcd;
+const char *file_name = NULL;
 
-while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) {
+while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) {
 switch (c) {
 case 'b':
 bflag = true;
@@ -1020,6 +1093,10 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 case 'z':
 zflag = true;
 break;
+case 's':
+sf

Re: [Qemu-devel] [PATCH v7] qemu-io: add pattern file for write command

2019-07-30 Thread Denis Plotnikov
Ping!

On Jul 5 2019, at 1:21 pm, Denis Plotnikov  wrote:
The patch allows to provide a pattern file for write
command. There was no similar ability before.

Signed-off-by: Denis Plotnikov 
---
v7:
* fix variable naming
* make code more readable
* extend help for write command

v6:
* the pattern file is read once to reduce io

v5:
* file name initiated with null to make compilers happy

v4:
* missing signed-off clause added

v3:
* missing file closing added
* exclusive flags processing changed
* buffer void* converted to char* to fix pointer arithmetics
* file reading error processing added
---
qemu-io-cmds.c | 86 ++
1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 09750a23ce..495170380a 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -343,6 +343,66 @@ static void *qemu_io_alloc(BlockBackend *blk, size_t len, 
int pattern)
return buf;
}

+static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len,
+ char *file_name)
+{
+ char *buf, *buf_origin;
+ FILE *f = fopen(file_name, "r");
+ int pattern_len;
+
+ if (!f) {
+ printf("'%s': %s\n", file_name, strerror(errno));
+ return NULL;
+ }
+
+ if (qemuio_misalign) {
+ len += MISALIGN_OFFSET;
+ }
+
+ buf_origin = buf = blk_blockalign(blk, len);
+
+ pattern_len = fread(buf, sizeof(char), len, f);
+
+ if (ferror(f)) {
+ printf("'%s': %s\n", file_name, strerror(errno));
+ goto error;
+ }
+
+ if (pattern_len == 0) {
+ printf("'%s' is empty\n", file_name);
+ goto error;
+ }
+
+ fclose(f);
+
+ if (len > pattern_len) {
+ char *file_buf = g_malloc(sizeof(char) * pattern_len);
+ memcpy(file_buf, buf, pattern_len);
+ len -= pattern_len;
+ buf += pattern_len;
+
+ while (len > 0) {
+ size_t len_to_copy = MIN(pattern_len, len);
+
+ memcpy(buf, file_buf, len_to_copy);
+
+ len -= len_to_copy;
+ buf += len_to_copy;
+ }
+ qemu_vfree(file_buf);
+ }
+
+ if (qemuio_misalign) {
+ buf_origin += MISALIGN_OFFSET;
+ }
+
+ return buf_origin;
+
+error:
+ qemu_vfree(buf_origin);
+ return NULL;
+}
+
static void qemu_io_free(void *p)
{
if (qemuio_misalign) {
@@ -949,6 +1009,7 @@ static void write_help(void)
" -n, -- with -z, don't allow slow fallback\n"
" -p, -- ignored for backwards compatibility\n"
" -P, -- use different pattern to fill file\n"
+" -s, -- use a pattern file to fill the write buffer\n"
" -C, -- report statistics in a machine parsable format\n"
" -q, -- quiet mode, do not show I/O statistics\n"
" -u, -- with -z, allow unmapping\n"
@@ -965,7 +1026,7 @@ static const cmdinfo_t write_cmd = {
.perm = BLK_PERM_WRITE,
.argmin = 2,
.argmax = -1,
- .args = "[-bcCfnquz] [-P pattern] off len",
+ .args = "[-bcCfnquz] [-P pattern | -s source_file] off len",
.oneline = "writes a number of bytes at a specified offset",
.help = write_help,
};
@@ -974,7 +1035,7 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
{
struct timeval t1, t2;
bool Cflag = false, qflag = false, bflag = false;
- bool Pflag = false, zflag = false, cflag = false;
+ bool Pflag = false, zflag = false, cflag = false, sflag = false;
int flags = 0;
int c, cnt, ret;
char *buf = NULL;
@@ -983,8 +1044,9 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
/* Some compilers get confused and warn if this is not initialized. */
int64_t total = 0;
int pattern = 0xcd;
+ char *file_name = NULL;

- while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) {
+ while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) {
switch (c) {
case 'b':
bflag = true;
@@ -1020,6 +1082,10 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
case 'z':
zflag = true;
break;
+ case 's':
+ sflag = true;
+ file_name = g_strdup(optarg);
+ break;
default:
qemuio_command_usage(&write_cmd);
return -EINVAL;
@@ -1051,8 +1117,9 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
return -EINVAL;
}

- if (zflag && Pflag) {
- printf("-z and -P cannot be specified at the same time\n");
+ if ((int)zflag + (int)Pflag + (int)sflag > 1) {
+ printf("Only one of -z, -P, and -s"
+ "can be specified at the same time\n");
return -EINVAL;
}

@@ -1088,7 +1155,14 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
}

if (!zflag) {
- buf = qemu_io_alloc(blk, count, pattern);
+ if (sflag) {
+ buf = qemu_io_alloc_from_file(blk, count, file_name);
+ if (!buf) {
+ return -EINVAL;
+ }
+ } else {
+ buf = qemu_io_alloc(blk, count, pattern);
+ }
}

gettimeofday(&t1, NULL);
--
2.17.0
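
For readers skimming the thread: the buffer-filling approach above (read the
pattern file once, then tile the rest of the buffer from the bytes already
read) can be sketched standalone as follows. The function name and the plain
malloc() are illustrative only; the real patch uses blk_blockalign() and also
handles the misalignment offset.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Read the file once, then replicate its contents across the rest of
 * the buffer with memcpy() instead of re-reading the file. */
static char *fill_buf_from_file(const char *file_name, size_t len)
{
    FILE *f = fopen(file_name, "r");
    char *buf = malloc(len);
    size_t pattern_len, filled;

    if (!f || !buf) {
        perror(file_name);
        goto error;
    }

    pattern_len = fread(buf, 1, len, f);      /* the only read of the file */
    if (ferror(f) || pattern_len == 0) {
        fprintf(stderr, "%s: empty or unreadable\n", file_name);
        goto error;
    }
    fclose(f);

    filled = pattern_len;
    while (filled < len) {                    /* tile the remaining space */
        size_t chunk = MIN(pattern_len, len - filled);
        memcpy(buf + filled, buf, chunk);
        filled += chunk;
    }
    return buf;

error:
    if (f) {
        fclose(f);
    }
    free(buf);
    return NULL;
}

Called as fill_buf_from_file("pattern.bin", 1024 * 1024) it yields a 1 MiB
buffer tiled with the file contents, which is what the -s option produces for
the write buffer.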



Re: [Qemu-devel] [PATCH v2 3/3] qcow2: add zstd cluster compression

2019-07-30 Thread Denis Plotnikov
On Jul 9 2019, at 9:18 am, Markus Armbruster  wrote:
Denis Plotnikov  writes:

zstd significantly reduces cluster compression time.
It provides better compression performance while maintaining
the same compression ratio as zlib, which, until now, has been
the only compression method available.

The performance test results:
The test compresses and decompresses a qemu qcow2 image with a freshly
installed rhel-7.6 guest.
Image cluster size: 64K. Image on disk size: 2.2G

The test was conducted on a brd disk to reduce the influence
of the disk subsystem on the results.
The results are given in seconds.

compress cmd:
time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd]
src.img [zlib|zstd]_compressed.img
decompress cmd
time ./qemu-img convert -O qcow2
[zlib|zstd]_compressed.img uncompressed.img

            compression                decompression
            zlib     zstd              zlib     zstd

real        65.5     16.3 (-75 %)      1.9      1.6 (-16 %)
user        65.0     15.8              5.3      2.5
sys          3.3      0.2              2.0      2.0

Both ZLIB and ZSTD gave the same compression ratio: 1.57
compressed image size in both cases: 1.4G

Signed-off-by: Denis Plotnikov 
[...]
diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index 7cf068f814..4344e858cb 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -538,6 +538,9 @@ Compressed Clusters Descriptor (x = 62 - (cluster_bits - 
8)):
Another compressed cluster may map to the tail of the final
sector used by this compressed cluster.

+ The layout of the compressed data depends on the compression
+ type used for the image (see compressed cluster layout).
+
If a cluster is unallocated, read requests shall read the data from the backing
file (except if bit 0 in the Standard Cluster Descriptor is set). If there is
no backing file or the backing file is smaller than the image, they shall read
@@ -790,3 +793,19 @@ In the image file the 'enabled' state is reflected by the 
'auto' flag. If this
flag is set, the software must consider the bitmap as 'enabled' and start
tracking virtual disk changes to this bitmap from the first write to the
virtual disk. If this flag is not set then the bitmap is disabled.
+
+=== Compressed cluster layout ===
+
+The compressed cluster data may have a different layout depending on the
+compression type used for the image, and store specific data for the particular
+compression type.
+
+Compressed data layout for the available compression types:
+(x = data_space_length - 1)
+
+ zlib:
+ Byte 0 - x: the compressed data content
+ all the space provided used for compressed data
+ zstd:
+ Byte 0 - 3: the length of compressed data
+ 4 - x: the compressed data content

Adding <http://zlib.net/> and <http://github.com/facebook/zstd> here as
well wouldn't hurt, would it?
ok

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 835dd3c37f..2021e03a84 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4215,11 +4215,12 @@
# Compression type used in qcow2 image file
#
# @zlib: zlib compression, see <http://zlib.net/>
+# @zstd: zstd compression, see <http://github.com/facebook/zstd>
#
# Since: 4.1
##
{ 'enum': 'Qcow2CompressionType',
- 'data': [ 'zlib' ] }
+ 'data': [ 'zlib', { 'name': 'zstd', 'if': 'defined(CONFIG_ZSTD)' } ] }

##
# @BlockdevCreateOptionsQcow2:

QAPI schema
Acked-by: Markus Armbruster 
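
As a side note for readers of the layout hunk above, a minimal parser for a
zstd compressed cluster payload could look as follows. The helper name is made
up, and the 4-byte length field is assumed to be big-endian like other qcow2
on-disk integers (the hunk above does not spell out the byte order), so treat
this purely as an illustration.

#include <stdint.h>
#include <stddef.h>

struct zstd_cluster_view {
    uint32_t data_len;      /* bytes 0 - 3: length of the compressed data */
    const uint8_t *data;    /* bytes 4 - x: the compressed data content */
};

static int parse_zstd_cluster(const uint8_t *buf, size_t buf_len,
                              struct zstd_cluster_view *out)
{
    uint32_t len;

    if (buf_len < 4) {
        return -1;                    /* no room for the length prefix */
    }
    len = ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
          ((uint32_t)buf[2] << 8)  |  (uint32_t)buf[3];
    if (len > buf_len - 4) {
        return -1;                    /* length exceeds the provided space */
    }
    out->data_len = len;
    out->data = buf + 4;
    return 0;
}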



Re: [Qemu-devel] [PATCH v2 0/3] add zstd cluster compression

2019-07-30 Thread Denis Plotnikov
Hi all! Are there any other comments besides Markus's one about adding zlib/zstd 
links to the compressed cluster layout description?

On Jul 4 2019, at 4:09 pm, Denis Plotnikov  wrote:
change log:

v2:
* relax the compression type setting restriction in the spec
* fix qcow2 header size checking
* fix error processing and messaging
* fix qcow2 image specific info reporting
* set Qcow2CompressionType zstd config dependant
* add zstd compressed cluster format description to the spec

v1:
* extend qcow2 header instead of adding a new incompatible extension header
specification re-written accordingly
* enable zstd compression via config
* fix zstd (de)compression functions
* fix comments/description
* fix function naming

---
The goal of the patch-set is to enable qcow2 to use zstd compression for
clusters. ZSTD provides better (de)compression performance than the currently
used ZLIB. Using it will improve performance (reduce compression time)
when compressed clusters are used, e.g. in backup scenarios.

Also, the patch-set extends the qcow2 specification by adding a compression_type
feature. The feature enables adding ZSTD and other compression algorithms
in the future.

Here is some measurements ZSTD vs ZLIB:

The test:
It compresses and decompresses a qemu qcow2 image with a freshly
installed rhel-7.6 guest.
Image cluster size: 64K. Image on disk size: 2.2G

The test was conducted on a brd disk to reduce the influence
of the disk subsystem on the results.
The results are given in seconds.

compress cmd:
time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd]
src.img [zlib|zstd]_compressed.img
decompress cmd
time ./qemu-img convert -O qcow2
[zlib|zstd]_compressed.img uncompressed.img


The results:
            compression                decompression
            zlib     zstd              zlib     zstd

real        65.5     16.3 (-75 %)      1.9      1.6 (-16 %)
user        65.0     15.8              5.3      2.5
sys          3.3      0.2              2.0      2.0

Both ZLIB and ZSTD gave the same compression ratio: ~1.5
compressed image size in both cases: ~1.4G

Denis Plotnikov (3):
qcow2: introduce compression type feature
qcow2: rework the cluster compression routine
qcow2: add zstd cluster compression

block/qcow2.c | 287 +++---
block/qcow2.h | 26 +++-
configure | 32 +
docs/interop/qcow2.txt | 40 +-
include/block/block_int.h | 1 +
qapi/block-core.json | 23 ++-
6 files changed, 379 insertions(+), 30 deletions(-)

--
2.17.0
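
For context, the libzstd one-shot API that a cluster compression routine would
typically build on is shown below; the series' actual code is not quoted in
this message, so treat this purely as an illustration of the upstream zstd
calls, with an invented helper name.

#include <stdio.h>
#include <sys/types.h>
#include <zstd.h>

/* Compress one cluster-sized buffer with the zstd simple API.
 * Returns the number of compressed bytes, or -1 on error. */
static ssize_t compress_cluster(void *dst, size_t dst_size,
                                const void *src, size_t src_size, int level)
{
    size_t ret = ZSTD_compress(dst, dst_size, src, src_size, level);

    if (ZSTD_isError(ret)) {
        fprintf(stderr, "zstd: %s\n", ZSTD_getErrorName(ret));
        return -1;
    }
    return ret;
}

A caller would size dst with ZSTD_compressBound(src_size), or fall back to
writing the cluster uncompressed when the result is not smaller than the
original cluster.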



[Qemu-devel] [PATCH v8] qemu-io: add pattern file for write command

2019-08-07 Thread Denis Plotnikov
The patch allows providing a pattern file for the write
command. There was no similar ability before.

Signed-off-by: Denis Plotnikov 
---
v8: fix according to Max's comments
  * get rid of unnecessary buffer for the pattern
  * buffer allocation just in bytes
  * take into account the misalign offset
  * don't copy file name
  * changed char* to const char* in input params

v7:
  * fix variable naming
  * make code more readable
  * extend help for write command

v6:
  * the pattern file is read once to reduce io

v5:
  * file name initiated with null to make compilers happy

v4:
  * missing signed-off clause added

v3:
  * missing file closing added
  * exclusive flags processing changed
  * buffer void* converted to char* to fix pointer arithmetics
  * file reading error processing added
---
 qemu-io-cmds.c | 83 ++
 1 file changed, 77 insertions(+), 6 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 09750a23ce..940271ea00 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -343,6 +343,63 @@ static void *qemu_io_alloc(BlockBackend *blk, size_t len, 
int pattern)
 return buf;
 }
 
+static void *qemu_io_alloc_from_file(BlockBackend *blk, size_t len,
+ const char *file_name)
+{
+char *buf, *buf_origin;
+FILE *f = fopen(file_name, "r");
+int pattern_len;
+
+if (!f) {
+perror(file_name);
+return NULL;
+}
+
+if (qemuio_misalign) {
+len += MISALIGN_OFFSET;
+}
+
+buf_origin = buf = blk_blockalign(blk, len);
+
+if (qemuio_misalign) {
+buf_origin += MISALIGN_OFFSET;
+}
+
+pattern_len = fread(buf_origin, 1, len, f);
+
+if (ferror(f)) {
+perror(file_name);
+goto error;
+}
+
+if (pattern_len == 0) {
+fprintf(stderr, "%s: file is empty\n", file_name);
+goto error;
+}
+
+fclose(f);
+
+if (len > pattern_len) {
+len -= pattern_len;
+buf += pattern_len;
+
+while (len > 0) {
+size_t len_to_copy = MIN(pattern_len, len);
+
+memcpy(buf, buf_origin, len_to_copy);
+
+len -= len_to_copy;
+buf += len_to_copy;
+}
+}
+
+return buf_origin;
+
+error:
+qemu_vfree(buf_origin);
+return NULL;
+}
+
 static void qemu_io_free(void *p)
 {
 if (qemuio_misalign) {
@@ -949,6 +1006,7 @@ static void write_help(void)
 " -n, -- with -z, don't allow slow fallback\n"
 " -p, -- ignored for backwards compatibility\n"
 " -P, -- use different pattern to fill file\n"
+" -s, -- use a pattern file to fill the write buffer\n"
 " -C, -- report statistics in a machine parsable format\n"
 " -q, -- quiet mode, do not show I/O statistics\n"
 " -u, -- with -z, allow unmapping\n"
@@ -965,7 +1023,7 @@ static const cmdinfo_t write_cmd = {
 .perm   = BLK_PERM_WRITE,
 .argmin = 2,
 .argmax = -1,
-.args   = "[-bcCfnquz] [-P pattern] off len",
+.args   = "[-bcCfnquz] [-P pattern | -s source_file] off len",
 .oneline= "writes a number of bytes at a specified offset",
 .help   = write_help,
 };
@@ -974,7 +1032,7 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 {
 struct timeval t1, t2;
 bool Cflag = false, qflag = false, bflag = false;
-bool Pflag = false, zflag = false, cflag = false;
+bool Pflag = false, zflag = false, cflag = false, sflag = false;
 int flags = 0;
 int c, cnt, ret;
 char *buf = NULL;
@@ -983,8 +1041,9 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 /* Some compilers get confused and warn if this is not initialized.  */
 int64_t total = 0;
 int pattern = 0xcd;
+const char *file_name = NULL;
 
-while ((c = getopt(argc, argv, "bcCfnpP:quz")) != -1) {
+while ((c = getopt(argc, argv, "bcCfnpP:quzs:")) != -1) {
 switch (c) {
 case 'b':
 bflag = true;
@@ -1020,6 +1079,10 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 case 'z':
 zflag = true;
 break;
+case 's':
+sflag = true;
+file_name = optarg;
+break;
 default:
 qemuio_command_usage(&write_cmd);
 return -EINVAL;
@@ -1051,8 +1114,9 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 return -EINVAL;
 }
 
-if (zflag && Pflag) {
-printf("-z and -P cannot be specified at the same time\n");
+if ((int)zflag + (int)Pflag + (int)sflag > 1) {
+printf("Only one of -z, -P, and -s"
+   "can be specified at the same time\n");
 return -EINVAL;
 }
 
@@ -1088,7 +1152,14 @@ static int writ

Re: [Qemu-devel] [PATCH v2 0/3] add zstd cluster compression

2019-08-07 Thread Denis Plotnikov
Ping!

On Jul 30 2019, at 5:45 pm, Denis Plotnikov  wrote:
Hi all! Are there any other comments besides Markus's one about adding zlib/zstd 
links to the compressed cluster layout description?

On Jul 4 2019, at 4:09 pm, Denis Plotnikov  wrote:
change log:

v2:
* relax the compression type setting restriction in the spec
* fix qcow2 header size checking
* fix error processing and messaging
* fix qcow2 image specific info reporting
* set Qcow2CompressionType zstd config dependant
* add zstd compressed cluster format description to the spec

v1:
* extend qcow2 header instead of adding a new incompatible extension header
specification re-written accordingly
* enable zstd compression via config
* fix zstd (de)compression functions
* fix comments/description
* fix function naming

---
The goal of the patch-set is to enable qcow2 to use zstd compression for
clusters. ZSTD provides better (de)compression performance than the currently
used ZLIB. Using it will improve performance (reduce compression time)
when compressed clusters are used, e.g. in backup scenarios.

Also, the patch-set extends the qcow2 specification by adding a compression_type
feature. The feature enables adding ZSTD and other compression algorithms
in the future.

Here is some measurements ZSTD vs ZLIB:

The test:
It compresses and decompresses a qemu qcow2 image with a freshly
installed rhel-7.6 guest.
Image cluster size: 64K. Image on disk size: 2.2G

The test was conducted on a brd disk to reduce the influence
of the disk subsystem on the results.
The results are given in seconds.

compress cmd:
time ./qemu-img convert -O qcow2 -c -o compression_type=[zlib|zstd]
src.img [zlib|zstd]_compressed.img
decompress cmd
time ./qemu-img convert -O qcow2
[zlib|zstd]_compressed.img uncompressed.img


The results:
            compression                decompression
            zlib     zstd              zlib     zstd

real        65.5     16.3 (-75 %)      1.9      1.6 (-16 %)
user        65.0     15.8              5.3      2.5
sys          3.3      0.2              2.0      2.0

Both ZLIB and ZSTD gave the same compression ratio: ~1.5
compressed image size in both cases: ~1.4G

Denis Plotnikov (3):
qcow2: introduce compression type feature
qcow2: rework the cluster compression routine
qcow2: add zstd cluster compression

block/qcow2.c | 287 +++---
block/qcow2.h | 26 +++-
configure | 32 +
docs/interop/qcow2.txt | 40 +-
include/block/block_int.h | 1 +
qapi/block-core.json | 23 ++-
6 files changed, 379 insertions(+), 30 deletions(-)

--
2.17.0



Re: [PATCH v1 2/4] virtio: increase virtqueue size for virtio-scsi and virtio-blk

2020-02-04 Thread Denis Plotnikov




On 30.01.2020 17:58, Stefan Hajnoczi wrote:

On Wed, Jan 29, 2020 at 05:07:00PM +0300, Denis Plotnikov wrote:

The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises the performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max is virtqueue_size dependent, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from the guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
  hw/core/machine.c  | 3 +++
  include/hw/virtio/virtio.h | 2 +-
  2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 3e288bfceb..8bc401d8b7 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,9 @@
  #include "hw/mem/nvdimm.h"
  
  GlobalProperty hw_compat_4_2[] = {

+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
+{ "vhost-blk-device", "virtqueue_size", "128"},

vhost-blk-device?!  Who has this?  It's not in qemu.git so please omit
this line. ;-)

So in this case the line:

{ "vhost-blk-device", "seg_max_adjust", "off"},

introduced by my patch:

commit 1bf8a989a566b2ba41c197004ec2a02562a766a4
Author: Denis Plotnikov 
Date:   Fri Dec 20 17:09:04 2019 +0300

    virtio: make seg_max virtqueue size dependent

is also wrong. It should be:

{ "vhost-scsi-device", "seg_max_adjust", "off"},

Am I right?



On the other hand, do you want to do this for the vhost-user-blk,
vhost-user-scsi, and vhost-scsi devices that exist in qemu.git?  Those
devices would benefit from better performance too.

It seems to be so. We also have the test checking those settings:
tests/acceptance/virtio_seg_max_adjust.py
For now it checks virtio-scsi-pci and virtio-blk-pci.
I'm going to extend it to also check the virtqueue size.
If I change vhost-user-blk, vhost-user-scsi and vhost-scsi it's worth
checking those devices too. But I don't know how to form a command line
for those 3 devices since they involve some third-party components as
backends (kernel modules, DPDK, etc.) which seem to be unavailable in the
qemu git.
Is there any way to do it with some stubs available in qemu.git or
something else?

If so, could you please point out the proper way to do it?

Thanks!
Denis





Re: [PATCH v1 2/4] virtio: increase virtqueue size for virtio-scsi and virtio-blk

2020-02-07 Thread Denis Plotnikov




On 05.02.2020 14:19, Stefan Hajnoczi wrote:

On Tue, Feb 04, 2020 at 12:59:04PM +0300, Denis Plotnikov wrote:


On 30.01.2020 17:58, Stefan Hajnoczi wrote:

On Wed, Jan 29, 2020 at 05:07:00PM +0300, Denis Plotnikov wrote:

The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises the performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max is virtqueue_size dependent, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from the guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
   hw/core/machine.c  | 3 +++
   include/hw/virtio/virtio.h | 2 +-
   2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 3e288bfceb..8bc401d8b7 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,9 @@
   #include "hw/mem/nvdimm.h"
   GlobalProperty hw_compat_4_2[] = {
+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
+{ "vhost-blk-device", "virtqueue_size", "128"},

vhost-blk-device?!  Who has this?  It's not in qemu.git so please omit
this line. ;-)

So in this case the line:

{ "vhost-blk-device", "seg_max_adjust", "off"},

introduced by my patch:

commit 1bf8a989a566b2ba41c197004ec2a02562a766a4
Author: Denis Plotnikov 
Date:   Fri Dec 20 17:09:04 2019 +0300

     virtio: make seg_max virtqueue size dependent

is also wrong. It should be:

{ "vhost-scsi-device", "seg_max_adjust", "off"},

Am I right?

It's just called "vhost-scsi":

include/hw/virtio/vhost-scsi.h:#define TYPE_VHOST_SCSI "vhost-scsi"


On the other hand, do you want to do this for the vhost-user-blk,
vhost-user-scsi, and vhost-scsi devices that exist in qemu.git?  Those
devices would benefit from better performance too.
After thinking about that for a while, I think we shouldn't extend queue
sizes for vhost-user-blk, vhost-user-scsi and vhost-scsi.
This is because increasing the queue sizes seems to be just useless for
them: the whole point is to increase the queue sizes in order to increase
seg_max (it limits the max block request size from the guest). For
virtio-blk-device and virtio-scsi-device it makes sense, since they have
a seg-max-adjust property which, if true, sets seg_max to
virtqueue_size-2. vhost-scsi also has this property but it seems the
property just doesn't affect anything (remove it?).
Also vhost-user-blk, vhost-user-scsi and vhost-scsi don't do any seg_max
setting. If I understand correctly, their backends are meant to be
responsible for doing that.
So, what about changing the queue sizes just for virtio-blk-device and
virtio-scsi-device?


Denis


It seems to be so. We also have the test checking those settings:
tests/acceptance/virtio_seg_max_adjust.py
For now it checks virtio-scsi-pci and virtio-blk-pci.
I'm going to extend it to also check the virtqueue size.
If I change vhost-user-blk, vhost-user-scsi and vhost-scsi it's worth
checking those devices too. But I don't know how to form a command line
for those 3 devices since they involve some third-party components as
backends (kernel modules, DPDK, etc.) which seem to be unavailable in the
qemu git.
Is there any way to do it with some stubs available in qemu.git or
something else?
If so, could you please point out the proper way to do it?

qemu.git has contrib/vhost-user-blk/ and contrib/vhost-user-scsi/ if
you need to test those vhost-user devices without external dependencies.

Stefan





Re: [PATCH v1 2/4] virtio: increase virtqueue size for virtio-scsi and virtio-blk

2020-02-10 Thread Denis Plotnikov




On 09.02.2020 10:49, Michael S. Tsirkin wrote:

On Fri, Feb 07, 2020 at 11:48:05AM +0300, Denis Plotnikov wrote:


On 05.02.2020 14:19, Stefan Hajnoczi wrote:

On Tue, Feb 04, 2020 at 12:59:04PM +0300, Denis Plotnikov wrote:

On 30.01.2020 17:58, Stefan Hajnoczi wrote:

On Wed, Jan 29, 2020 at 05:07:00PM +0300, Denis Plotnikov wrote:

The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises the performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max is virtqueue_size dependent, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from the guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
hw/core/machine.c  | 3 +++
include/hw/virtio/virtio.h | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 3e288bfceb..8bc401d8b7 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,9 @@
#include "hw/mem/nvdimm.h"
GlobalProperty hw_compat_4_2[] = {
+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
+{ "vhost-blk-device", "virtqueue_size", "128"},

vhost-blk-device?!  Who has this?  It's not in qemu.git so please omit
this line. ;-)

So in this case the line:

{ "vhost-blk-device", "seg_max_adjust", "off"},

introduced by my patch:

commit 1bf8a989a566b2ba41c197004ec2a02562a766a4
Author: Denis Plotnikov 
Date:   Fri Dec 20 17:09:04 2019 +0300

      virtio: make seg_max virtqueue size dependent

is also wrong. It should be:

{ "vhost-scsi-device", "seg_max_adjust", "off"},

Am I right?

It's just called "vhost-scsi":

include/hw/virtio/vhost-scsi.h:#define TYPE_VHOST_SCSI "vhost-scsi"


On the other hand, do you want to do this for the vhost-user-blk,
vhost-user-scsi, and vhost-scsi devices that exist in qemu.git?  Those
devices would benefit from better performance too.

After thinking about that for a while, I think we shouldn't extend queue
sizes for vhost-user-blk, vhost-user-scsi and vhost-scsi.
This is because increasing the queue sizes seems to be just useless for
them: the whole point is to increase the queue sizes in order to increase
seg_max (it limits the max block request size from the guest). For
virtio-blk-device and virtio-scsi-device it makes sense, since they have
a seg-max-adjust property which, if true, sets seg_max to virtqueue_size-2.
vhost-scsi also has this property but it seems the property just doesn't
affect anything (remove it?).
Also vhost-user-blk, vhost-user-scsi and vhost-scsi don't do any seg_max
setting. If I understand correctly, their backends are meant to be
responsible for doing that.

The queue size is set by qemu IIRC.


So, what about changing the queue sizes just for virtio-blk-device and
virtio-scsi-device?


Hmm that would break ability to migrate between userspace and vhost
backends, would it not?

I'm not sure I've understood what you meant.
Just for the record, I was going to change virtqueue-size for 
virtio-blk-device and virtio-scsi-device since they can adjust seg_max 
to the specified queue size and I don't want to touch vhost-s and 
vhost-user-s since they don't have adjustable seg_max for now.


Denis




Denis


It seems to be so. We also have the test checking those settings:
tests/acceptance/virtio_seg_max_adjust.py
For now it checks virtio-scsi-pci and virtio-blk-pci.
I'm going to extend it to also check the virtqueue size.
If I change vhost-user-blk, vhost-user-scsi and vhost-scsi it's worth
checking those devices too. But I don't know how to form a command line
for those 3 devices since they involve some third-party components as
backends (kernel modules, DPDK, etc.) which seem to be unavailable in the
qemu git.
Is there any way to do it with some stubs available in qemu.git or
something else?
If so, could you please point out the proper way to do it?

qemu.git has contrib/vhost-user-blk/ and contrib/vhost-user-scsi/ if
you need to test those vhost-user devices without external dependencies.

Stefan





[PATCH] pc: remove erroneous seg_max_adjust setting for vhost-blk-device

2020-02-11 Thread Denis Plotnikov
vhost-blk-device isn't a part of qemu.git

Signed-off-by: Denis Plotnikov 
---
 hw/core/machine.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index d8e30e4895..2501b540ec 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -31,7 +31,6 @@ GlobalProperty hw_compat_4_2[] = {
 { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
 { "virtio-blk-device", "seg-max-adjust", "off"},
 { "virtio-scsi-device", "seg_max_adjust", "off"},
-{ "vhost-blk-device", "seg_max_adjust", "off"},
 { "usb-host", "suppress-remote-wake", "off" },
 { "usb-redir", "suppress-remote-wake", "off" },
 };
-- 
2.17.0




[PATCH v2] virtio: increase virtqueue size for virtio-scsi and virtio-blk

2020-02-11 Thread Denis Plotnikov
The goal is to reduce the number of requests issued by a guest on
1M reads/writes. This raises the performance by up to 4% on that kind of
disk access pattern.

The maximum chunk size to be used for guest disk access is
limited by the seg_max parameter, which represents the maximum number of
pieces in the scatter-gather list of one guest disk request.

Since seg_max is virtqueue_size dependent, increasing the virtqueue
size increases seg_max, which, in turn, increases the maximum size
of data to be read/written from a guest disk.

More details in the original problem statement:
https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html

Suggested-by: Denis V. Lunev 
Signed-off-by: Denis Plotnikov 
---
 hw/block/virtio-blk.c | 4 ++--
 hw/core/machine.c | 2 ++
 hw/scsi/virtio-scsi.c | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 09f46ed85f..6df3a7a6df 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
uint8_t *config)
 memset(&blkcfg, 0, sizeof(blkcfg));
 virtio_stq_p(vdev, &blkcfg.capacity, capacity);
 virtio_stl_p(vdev, &blkcfg.seg_max,
- s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
+ s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2);
 virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
 virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
 virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
@@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = {
 DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
 true),
 DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
-DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
+DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
 DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, true),
 DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
  IOThread *),
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 2501b540ec..3427d6cf4c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -28,6 +28,8 @@
 #include "hw/mem/nvdimm.h"
 
 GlobalProperty hw_compat_4_2[] = {
+{ "virtio-blk-device", "queue-size", "128"},
+{ "virtio-scsi-device", "virtqueue_size", "128"},
 { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
 { "virtio-blk-device", "seg-max-adjust", "off"},
 { "virtio-scsi-device", "seg_max_adjust", "off"},
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 3b61563609..b38f50a429 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -660,7 +660,7 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
 
 virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues);
 virtio_stl_p(vdev, &scsiconf->seg_max,
- s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 
2);
+ s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 256 - 
2);
 virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors);
 virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun);
 virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent));
@@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev, 
Error **errp)
 static Property virtio_scsi_properties[] = {
 DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 
1),
 DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
- parent_obj.conf.virtqueue_size, 128),
+ parent_obj.conf.virtqueue_size, 256),
 DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
   parent_obj.conf.seg_max_adjust, true),
 DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, parent_obj.conf.max_sectors,
-- 
2.17.0
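
A quick sanity check of the numbers in the commit message above: with
seg-max-adjust on, seg_max is the queue size minus 2, so doubling the queue
size from 128 to 256 roughly doubles the largest request a guest can assemble
from 4 KiB segments. The snippet below only illustrates that arithmetic; the
4 KiB segment size is an assumption about the guest, not something set by the
patch.

#include <stdio.h>

int main(void)
{
    const unsigned queue_sizes[] = { 128, 256 };
    const unsigned seg_size = 4096;                /* assumed guest page size */

    for (unsigned i = 0; i < 2; i++) {
        unsigned seg_max = queue_sizes[i] - 2;     /* seg-max-adjust rule */
        printf("queue size %u -> seg_max %u -> max request ~%u KiB\n",
               queue_sizes[i], seg_max, seg_max * seg_size / 1024);
    }
    return 0;
}

This prints ~504 KiB for a 128-entry queue and ~1016 KiB for a 256-entry
queue, which is consistent with the goal of no longer splitting 1M guest
requests once the default grows to 256.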




[PATCH v1 2/2] tests/acceptance/virtio_check_params: prepare to check different params

2020-02-11 Thread Denis Plotnikov
Signed-off-by: Denis Plotnikov 
---
 tests/acceptance/virtio_check_params.py | 38 ++---
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/tests/acceptance/virtio_check_params.py 
b/tests/acceptance/virtio_check_params.py
index deec89bf86..e578952a97 100644
--- a/tests/acceptance/virtio_check_params.py
+++ b/tests/acceptance/virtio_check_params.py
@@ -43,7 +43,7 @@ VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 
'virtio-scsi-pci,id=scsi0'],
 EXCLUDED_MACHINES = ['none', 'isapc', 'microvm']
 
 
-class VirtioMaxSegSettingsCheck(Test):
+class VirtioParamsCheck(Test):
 @staticmethod
 def make_pattern(props):
 pattern_items = ['{0} = \w+'.format(prop) for prop in props]
@@ -75,12 +75,12 @@ class VirtioMaxSegSettingsCheck(Test):
 props[p[0]] = p[1]
 return query_ok, props, error
 
-def check_mt(self, mt, dev_type_name):
-mt['device'] = dev_type_name # Only for the debug() call.
+def check_mt(self, mt, expected_vals, dev_type_name):
+msg = "mt: %s dev: %s" % (mt, dev_type_name) # For debug() call only.
 logger = logging.getLogger('machine')
-logger.debug(mt)
+logger.debug(msg)
 with QEMUMachine(self.qemu_bin) as vm:
-vm.set_machine(mt["name"])
+vm.set_machine(mt)
 vm.add_args('-nodefaults')
 for s in VM_DEV_PARAMS[dev_type_name]:
 vm.add_args(s)
@@ -92,11 +92,15 @@ class VirtioMaxSegSettingsCheck(Test):
 error = sys.exc_info()[0]
 
 if not query_ok:
-self.fail('machine type {0}: {1}'.format(mt['name'], error))
+self.fail('machine type {0}: {1}'.format(mt, error))
 
 for prop_name, prop_val in props.items():
-expected_val = mt[prop_name]
-self.assertEqual(expected_val, prop_val)
+expected_val = expected_vals[prop_name]
+msg = 'Property value mismatch for (MT: {0}, '\
+  'property name: {1}): expected value: "{2}" '\
+  'actual value: "{3}"'\
+  .format(mt, prop_name, expected_val, prop_val)
+self.assertEqual(expected_val, prop_val, msg)
 
 @staticmethod
 def seg_max_adjust_enabled(mt):
@@ -128,25 +132,27 @@ class VirtioMaxSegSettingsCheck(Test):
 
 @skip("break multi-arch CI")
 def test_machine_types(self):
-# collect all machine types except 'none', 'isapc', 'microvm'
+# collect all machine types
 with QEMUMachine(self.qemu_bin) as vm:
 vm.launch()
 machines = [m['name'] for m in vm.command('query-machines')]
 vm.shutdown()
 
+# ..and exclude non-relevant ones
 machines = self.filter_machines(machines)
 
 for dev_type in DEV_TYPES:
-# create the list of machine types and their parameters.
-mtypes = list()
+# define expected parameters for each machine type
+mt_expected_vals = dict()
 for m in machines:
 if self.seg_max_adjust_enabled(m):
 enabled = 'true'
 else:
 enabled = 'false'
-mtypes.append({'name': m,
-   DEV_TYPES[dev_type]['seg_max_adjust']: enabled})
 
-# test each machine type for a device type
-for mt in mtypes:
-self.check_mt(mt, dev_type)
+mt_expected_vals[m] = {
+DEV_TYPES[dev_type]['seg_max_adjust']: enabled }
+
+# test each machine type
+for mt in mt_expected_vals:
+self.check_mt(mt, mt_expected_vals[mt], dev_type)
-- 
2.17.0




[PATCH v1 1/2] tests/acceptance/virtio_check_params: remove excluded machine types carefully

2020-02-11 Thread Denis Plotnikov
Before, the test failed if an excluded machine type was absent from the machine
type list.

Signed-off-by: Denis Plotnikov 
---
 tests/acceptance/virtio_check_params.py | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/acceptance/virtio_check_params.py 
b/tests/acceptance/virtio_check_params.py
index 87e6c839d1..deec89bf86 100644
--- a/tests/acceptance/virtio_check_params.py
+++ b/tests/acceptance/virtio_check_params.py
@@ -40,6 +40,8 @@ VM_DEV_PARAMS = {'virtio-scsi-pci': ['-device', 
'virtio-scsi-pci,id=scsi0'],
 '-drive',
 'driver=null-co,id=drive0,if=none']}
 
+EXCLUDED_MACHINES = ['none', 'isapc', 'microvm']
+
 
 class VirtioMaxSegSettingsCheck(Test):
 @staticmethod
@@ -117,6 +119,13 @@ class VirtioMaxSegSettingsCheck(Test):
 return True
 return False
 
+@staticmethod
+def filter_machines(machines):
+for mt in EXCLUDED_MACHINES:
+if mt in machines:
+machines.remove(mt)
+return machines
+
 @skip("break multi-arch CI")
 def test_machine_types(self):
 # collect all machine types except 'none', 'isapc', 'microvm'
@@ -124,9 +133,8 @@ class VirtioMaxSegSettingsCheck(Test):
 vm.launch()
 machines = [m['name'] for m in vm.command('query-machines')]
 vm.shutdown()
-machines.remove('none')
-machines.remove('isapc')
-machines.remove('microvm')
+
+machines = self.filter_machines(machines)
 
 for dev_type in DEV_TYPES:
 # create the list of machine types and their parameters.
-- 
2.17.0




[PATCH v1 0/2] Improve virtio_check_params test

2020-02-11 Thread Denis Plotnikov
* fixed failing on removal of a non-existent machine type
* the test refactored to add more parameters to check

General questions left:
   How to restrict the test to run only:
   1. on a set of target OSes
   2. on a set of target architectures


Denis Plotnikov (2):
  tests/acceptance/virtio_check_params: remove excluded machine types
carefully
  tests/acceptance/virtio_check_params: prepare to check different
params

 tests/acceptance/virtio_check_params.py | 52 -
 1 file changed, 33 insertions(+), 19 deletions(-)

-- 
2.17.0




Re: [PATCH v1 0/2] Improve virtio_check_params test

2020-02-11 Thread Denis Plotnikov




On 11.02.2020 17:37, Philippe Mathieu-Daudé wrote:

Hi Denis,

On 2/11/20 3:25 PM, Denis Plotnikov wrote:

* fixed failing on removal of a non-existent machine type
* the test refactored to add more parameters to check

General questions left:
    How to restrict the test to run only:
    1. on a set of target OSes
    2. on a set of target architectures

Denis Plotnikov (2):
   tests/acceptance/virtio_check_params: remove excluded machine types
 carefully
   tests/acceptance/virtio_check_params: prepare to check different
 params

  tests/acceptance/virtio_check_params.py | 52 -
  1 file changed, 33 insertions(+), 19 deletions(-)



Have you noticed my other series suggested by Cornelia?

It runs your test on S390X and PPC:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg675092.html
https://www.mail-archive.com/qemu-devel@nongnu.org/msg675095.html

Hi, Philippe

Seems that I've missed them. I just made the patches on top of fresh master.
Can I get a git tree which has those patches applied? Or should I wait
until the patches land in qemu master and then rebase on them?


Denis



Re: [PATCH v4] qapi/qmp: Add timestamps to qmp command responses

2023-01-10 Thread Denis Plotnikov

[ping]

On 01.11.2022 18:37, Denis Plotnikov wrote:

Add "start" & "end" time values to QMP command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought
by other parts of management layer or qemu internals.
This helps to look for problems proactively from the management layer side.
The management layer would be able to detect problem cases by calculating
the QMP command execution time:
1. execution_time_from_mgmt_perspective -
execution_time_of_qmp_command > some_threshold
This detects problems with the management layer or with internal qemu QMP
command dispatching
2. current_qmp_command_execution_time > avg_qmp_command_execution_time
This detects that a certain QMP command starts taking longer to execute than
usual
In both these cases a more thorough investigation of the root causes should be
done by using some qemu tracepoints, depending on the particular QMP command
under investigation, or by other means. The timestamps help to avoid excessive
log output when qemu tracepoints are used to address similar cases.

Example of result:

 ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

 (QEMU) query-status
 {"end": {"seconds": 1650367305, "microseconds": 831032},
  "start": {"seconds": 1650367305, "microseconds": 831012},
  "return": {"status": "running", "singlestep": false, "running": true}}

The response of the QMP command contains the start & end time of
the QMP command processing.

Also, "start" & "end" timestaps are added to qemu guest agent responses as
qemu-ga shares the same code for request dispatching.

Suggested-by: Andrey Ryabinin
Signed-off-by: Denis Plotnikov
Reviewed-by: Daniel P. Berrangé
---
v3->v4
  - rewrite commit message [Markus]
  - use new fields description in doc [Markus]
  - change type to int64_t [Markus]
  - simplify tests [Markus]

v2->v3:
  - fix typo "timestaps -> timestamps" [Marc-André]

v1->v2:
  - rephrase doc descriptions [Daniel]
  - add tests for qmp timestamps to qmp test and qga test [Daniel]
  - adjust asserts in test-qmp-cmds according to the new number of returning 
keys

v0->v1:
  - remove interface to control "start" and "end" time values: return 
timestamps unconditionally
  - add description to qmp specification
  - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
timestamp
  - fix patch description

  docs/interop/qmp-spec.txt  | 28 ++--
  qapi/qmp-dispatch.c| 18 ++
  tests/qtest/qmp-test.c | 32 
  tests/unit/test-qga.c  | 29 +
  tests/unit/test-qmp-cmds.c |  4 ++--
  5 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..0dd8e716c02f0 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
  
  The format of a success response is:
  
-{ "return": json-value, "id": json-value }

+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
  
   Where,
  
@@ -169,13 +171,25 @@ The format of a success response is:

command does not return data
  - The "id" member contains the transaction identification associated
with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
  
  2.4.2 error

  ---
  
  The format of an error response is:
  
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }

+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
  
   Where,
  
@@ -184,6 
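
A management layer consuming these responses computes the server-side
execution time from the two timestamps. A minimal sketch follows; the struct
and function names are made up, JSON parsing is assumed to have happened
elsewhere, and the values are taken from the query-status example above.

#include <stdint.h>
#include <stdio.h>

struct qmp_ts {
    int64_t seconds;
    int64_t microseconds;
};

/* Execution time of a QMP command in microseconds, from the "start"
 * and "end" members of its response. */
static int64_t qmp_exec_time_us(struct qmp_ts start, struct qmp_ts end)
{
    return (end.seconds - start.seconds) * 1000000 +
           (end.microseconds - start.microseconds);
}

int main(void)
{
    struct qmp_ts start = { 1650367305, 831012 };   /* from the example */
    struct qmp_ts end   = { 1650367305, 831032 };

    printf("query-status took %lld us\n",
           (long long)qmp_exec_time_us(start, end)); /* prints 20 us */
    return 0;
}

Comparing this value against the time measured on the management side, or
against a running average per command, gives the two detection rules from the
commit message.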

[PING] [PATCH v4] qapi/qmp: Add timestamps to qmp command responses

2023-01-16 Thread Denis Plotnikov


On 10.01.2023 13:32, Denis Plotnikov wrote:


[ping]

On 01.11.2022 18:37, Denis Plotnikov wrote:

Add "start" & "end" time values to QMP command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought
by other parts of management layer or qemu internals.
This helps to look for problems proactively from the management layer side.
The management layer would be able to detect problem cases by calculating
the QMP command execution time:
1. execution_time_from_mgmt_perspective -
execution_time_of_qmp_command > some_threshold
This detects problems with the management layer or with internal qemu QMP
command dispatching
2. current_qmp_command_execution_time > avg_qmp_command_execution_time
This detects that a certain QMP command starts taking longer to execute than
usual
In both these cases a more thorough investigation of the root causes should be
done by using some qemu tracepoints, depending on the particular QMP command
under investigation, or by other means. The timestamps help to avoid excessive
log output when qemu tracepoints are used to address similar cases.

Example of result:

 ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

 (QEMU) query-status
 {"end": {"seconds": 1650367305, "microseconds": 831032},
  "start": {"seconds": 1650367305, "microseconds": 831012},
  "return": {"status": "running", "singlestep": false, "running": true}}

The response of the QMP command contains the start & end time of
the QMP command processing.

Also, "start" & "end" timestaps are added to qemu guest agent responses as
qemu-ga shares the same code for request dispatching.

Suggested-by: Andrey Ryabinin
Signed-off-by: Denis Plotnikov
Reviewed-by: Daniel P. Berrangé
---
v3->v4
  - rewrite commit message [Markus]
  - use new fields description in doc [Markus]
  - change type to int64_t [Markus]
  - simplify tests [Markus]

v2->v3:
  - fix typo "timestaps -> timestamps" [Marc-André]

v1->v2:
  - rephrase doc descriptions [Daniel]
  - add tests for qmp timestamps to qmp test and qga test [Daniel]
  - adjust asserts in test-qmp-cmds according to the new number of returning 
keys

v0->v1:
  - remove interface to control "start" and "end" time values: return 
timestamps unconditionally
  - add description to qmp specification
  - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
timestamp
  - fix patch description

  docs/interop/qmp-spec.txt  | 28 ++--
  qapi/qmp-dispatch.c| 18 ++
  tests/qtest/qmp-test.c | 32 
  tests/unit/test-qga.c  | 29 +
  tests/unit/test-qmp-cmds.c |  4 ++--
  5 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..0dd8e716c02f0 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
  
  The format of a success response is:
  
-{ "return": json-value, "id": json-value }

+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
  
   Where,
  
@@ -169,13 +171,25 @@ The format of a success response is:

command does not return data
  - The "id" member contains the transaction identification associated
with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
  
  2.4.2 error

  ---
  
  The format of an error response is:
  
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }

+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds&quo

[PATCH v1 0/2] vl: flush all task from rcu queue before exiting

2021-11-15 Thread Denis Plotnikov
v1 -> v0:
 * move monitor cleanup to the very end of qemu cleanup [Paolo]

The goal is to notify the management layer about device destruction on qemu
shutdown.
Without this series the DEVICE_DELETED event may not be sent because of stuck
tasks in the rcu thread. The rcu tasks may get stuck on qemu shutdown because
the rcu thread does not always have enough time to run them.


Denis Plotnikov (2):
  monitor: move monitor destruction to the very end of qemu cleanup
  vl: flush all task from rcu queue before exiting

 include/qemu/rcu.h |  1 +
 monitor/monitor.c  |  6 ++
 softmmu/runstate.c |  4 +++-
 util/rcu.c | 12 
 4 files changed, 22 insertions(+), 1 deletion(-)

-- 
2.25.1




[PATCH v1 1/2] monitor: move monitor destruction to the very end of qemu cleanup

2021-11-15 Thread Denis Plotnikov
This is needed to keep sending DEVICE_DELETED events on qemu cleanup.
The event may be emitted from the rcu thread, and we're going to flush the rcu
queue explicitly before qemu exits in the next patch. So move the monitor
destruction to the very end of qemu cleanup to be able to send all the events.

Signed-off-by: Denis Plotnikov 
---
 monitor/monitor.c  | 6 ++
 softmmu/runstate.c | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/monitor/monitor.c b/monitor/monitor.c
index 21c7a68758f5..b04ae4850db2 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -605,11 +605,17 @@ void monitor_data_init(Monitor *mon, bool is_qmp, bool 
skip_flush,
 mon->outbuf = g_string_new(NULL);
 mon->skip_flush = skip_flush;
 mon->use_io_thread = use_io_thread;
+/*
+ * take an extra ref to prevent monitor's chardev
+ * from destroying in qemu_chr_cleanup()
+ */
+object_ref(OBJECT(mon->chr.chr));
 }
 
 void monitor_data_destroy(Monitor *mon)
 {
 g_free(mon->mon_cpu_path);
+object_unref(OBJECT(mon->chr.chr));
 qemu_chr_fe_deinit(&mon->chr, false);
 if (monitor_is_qmp(mon)) {
 monitor_data_destroy_qmp(container_of(mon, MonitorQMP, common));
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 10d9b7365aa7..8d29dd2c00e2 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -819,8 +819,8 @@ void qemu_cleanup(void)
 tpm_cleanup();
 net_cleanup();
 audio_cleanup();
-monitor_cleanup();
 qemu_chr_cleanup();
 user_creatable_cleanup();
+monitor_cleanup();
 /* TODO: unref root container, check all devices are ok */
 }
-- 
2.25.1




[PATCH v1 2/2] vl: flush all task from rcu queue before exiting

2021-11-15 Thread Denis Plotnikov
Device destruction may overlap with qemu shutdown.
In this case a management layer that requested a device unplug and is
waiting for the DEVICE_DELETED event may never get it.

This happens because device_finalize() may never be called on qemu shutdown
for some devices using address_space_destroy(). The latter is called from
the rcu thread.
On qemu shutdown, not all rcu callbacks may be called because the rcu thread
may not have enough time to converge before the qemu main thread exits.

To resolve this issue, this patch makes the rcu thread finish all its callbacks
explicitly by calling a new rcu interface function right before
the qemu main thread exits.

Signed-off-by: Denis Plotnikov 
---
 include/qemu/rcu.h |  1 +
 softmmu/runstate.c |  2 ++
 util/rcu.c | 12 
 3 files changed, 15 insertions(+)

diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf11c..f7fbdc3781e5 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -134,6 +134,7 @@ struct rcu_head {
 
 extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);
 extern void drain_call_rcu(void);
+extern void flush_rcu(void);
 
 /* The operands of the minus operator must have the same type,
  * which must be the one that we specify in the cast.
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 8d29dd2c00e2..3f833678f6eb 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -821,6 +821,8 @@ void qemu_cleanup(void)
 audio_cleanup();
 qemu_chr_cleanup();
 user_creatable_cleanup();
+/* finish all the tasks from rcu queue before exiting */
+flush_rcu();
 monitor_cleanup();
 /* TODO: unref root container, check all devices are ok */
 }
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb2a..f047f8ee8d16 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -348,6 +348,18 @@ void drain_call_rcu(void)
 
 }
 
+/*
+ * This function drains rcu queue until there are no tasks to do left
+ * and aims to the cases when one needs to ensure that no work hang
+ * in rcu thread before proceeding, e.g. on qemu shutdown.
+ */
+void flush_rcu(void)
+{
+while (qatomic_read(&rcu_call_count) > 0) {
+drain_call_rcu();
+}
+}
+
 void rcu_register_thread(void)
 {
 assert(rcu_reader.ctr == 0);
-- 
2.25.1
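
To see why callbacks can still be pending at this point, here is the usual
deferred-free pattern that puts work on the rcu queue; the struct and function
names are invented, and the callback signature (it receives the rcu_head
pointer) is assumed from include/qemu/rcu.h.

#include "qemu/osdep.h"
#include "qemu/rcu.h"

typedef struct MyState {
    int value;
    struct rcu_head rcu;       /* embedded so call_rcu1() can queue us */
} MyState;

static void mystate_free_cb(struct rcu_head *head)
{
    MyState *s = container_of(head, MyState, rcu);
    g_free(s);                 /* runs later, in the rcu thread */
}

static void mystate_unref(MyState *s)
{
    /* Defer the free until all rcu readers are done with the old pointer. */
    call_rcu1(&s->rcu, mystate_free_cb);
}

If such a callback is queued shortly before shutdown, the rcu thread may never
get to it; flush_rcu() placed before monitor_cleanup() drains the queue so
that anything the callbacks emit (such as DEVICE_DELETED) can still go out
through the monitor.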




[Ping] [PATCH v1 0/2] vl: flush all task from rcu queue before exiting

2021-11-19 Thread Denis Plotnikov

Ping!

On 15.11.2021 12:41, Denis Plotnikov wrote:

v1 -> v0:
  * move monitor cleanup to the very end of qemu cleanup [Paolo]

The goal is to notify the management layer about device destruction on qemu
shutdown.
Without this series the DEVICE_DELETED event may not be sent because of stuck
tasks in the rcu thread. The rcu tasks may get stuck on qemu shutdown because
the rcu thread does not always have enough time to run them.


Denis Plotnikov (2):
   monitor: move monitor destruction to the very end of qemu cleanup
   vl: flush all task from rcu queue before exiting

  include/qemu/rcu.h |  1 +
  monitor/monitor.c  |  6 ++
  softmmu/runstate.c |  4 +++-
  util/rcu.c | 12 
  4 files changed, 22 insertions(+), 1 deletion(-)



[PING][Ping] [PATCH v1 0/2] vl: flush all task from rcu queue before exiting

2021-11-24 Thread Denis Plotnikov

ping ping

On 19.11.2021 12:42, Denis Plotnikov wrote:


Ping!

On 15.11.2021 12:41, Denis Plotnikov wrote:

v1 -> v0:
  * move monitor cleanup to the very end of qemu cleanup [Paolo]

The goal is to notify the management layer about device destruction on qemu
shutdown.
Without this series the DEVICE_DELETED event may not be sent because of stuck
tasks in the rcu thread. The rcu tasks may get stuck on qemu shutdown because
the rcu thread does not always have enough time to run them.


Denis Plotnikov (2):
   monitor: move monitor destruction to the very end of qemu cleanup
   vl: flush all task from rcu queue before exiting

  include/qemu/rcu.h |  1 +
  monitor/monitor.c  |  6 ++
  softmmu/runstate.c |  4 +++-
  util/rcu.c | 12 
  4 files changed, 22 insertions(+), 1 deletion(-)



[PING][PATCH v5] qapi/qmp: Add timestamps to qmp command responses

2023-05-10 Thread Denis Plotnikov

Hi all!

It seems that this series has come through a number of reviews and got 
some "reviewed-by".


Are there any flaws left to fix that prevent merging this series?

Thanks, Denis

On 26.04.2023 17:08, Denis Plotnikov wrote:

Add "start" & "end" time values to QMP command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought
by other parts of management layer or qemu internals.
This helps to look for problems proactively from the management layer side.
The management layer would be able to detect problem cases by calculating
the QMP command execution time:
1. execution_time_from_mgmt_perspective -
execution_time_of_qmp_command > some_threshold
This detects problems with the management layer or with internal qemu QMP
command dispatching
2. current_qmp_command_execution_time > avg_qmp_command_execution_time
This detects that a certain QMP command starts taking longer to execute than
usual
In both these cases a more thorough investigation of the root causes should be
done by using some qemu tracepoints, depending on the particular QMP command
under investigation, or by other means. The timestamps help to avoid excessive
log output when qemu tracepoints are used to address similar cases.

Example of result:

 ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

 (QEMU) query-status
 {"end": {"seconds": 1650367305, "microseconds": 831032},
  "start": {"seconds": 1650367305, "microseconds": 831012},
  "return": {"status": "running", "singlestep": false, "running": true}}

The response of the QMP command contains the start & end time of
the QMP command processing.

Also, "start" & "end" timestaps are added to qemu guest agent responses as
qemu-ga shares the same code for request dispatching.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
Reviewed-by: Daniel P. Berrangé 
---
v4->v5:
  - use json-number instead of json-value for time values [Vladimir]
  - use a new util function for timestamp printing [Vladimir]

v3->v4:
  - rewrite commit message [Markus]
  - use new fields description in doc [Markus]
  - change type to int64_t [Markus]
  - simplify tests [Markus]

v2->v3:
  - fix typo "timestaps -> timestamps" [Marc-André]

v1->v2:
  - rephrase doc descriptions [Daniel]
  - add tests for qmp timestamps to qmp test and qga test [Daniel]
  - adjust asserts in test-qmp-cmds according to the new number of returning 
keys

v0->v1:
  - remove interface to control "start" and "end" time values: return 
timestamps unconditionally
  - add description to qmp specification
  - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
timestamp
  - fix patch description
---
  docs/interop/qmp-spec.txt  | 28 ++--
  include/qapi/util.h|  2 ++
  qapi/qapi-util.c   | 11 +++
  qapi/qmp-dispatch.c| 11 +++
  qapi/qmp-event.c   |  6 +-
  tests/qtest/qmp-test.c | 32 
  tests/unit/test-qga.c  | 29 +
  tests/unit/test-qmp-cmds.c |  4 ++--
  8 files changed, 114 insertions(+), 9 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..ed204b53373e5 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
  
  The format of a success response is:
  
-{ "return": json-value, "id": json-value }

+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-number, "microseconds": json-number},
+  "end": {"seconds": json-number, "microseconds": json-number} }
  
   Where,
  
@@ -169,13 +171,25 @@ The format of a success response is:

command does not return data
  - The "id" member contains the transaction identification associated
with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
  
  2.4.2 error

  ---
  
  The format of an error response is:
  
-{ &q

[PATCH v5] qapi/qmp: Add timestamps to qmp command responses

2023-04-26 Thread Denis Plotnikov
Add "start" & "end" time values to QMP command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought
by other parts of management layer or qemu internals.
This helps to look for problems poactively from the management layer side.
The management layer would be able to detect problem cases by calculating
QMP command execution time:
1. execution_time_from_mgmt_perspective -
   execution_time_of_qmp_command > some_threshold
   This detects problems with management layer or internal qemu QMP command
   dispatching
2. current_qmp_command_execution_time > avg_qmp_command_execution_time
   This detects that a certain QMP command starts to execute longer than
   usual
In both these cases a more thorough investigation of the root causes should be
done by using some qemu tracepoints, depending on the particular QMP command under
investigation or by other means. The timestamps help to avoid excessive log
output when qemu tracepoints are used to address similar cases.

Example of result:

./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

(QEMU) query-status
{"end": {"seconds": 1650367305, "microseconds": 831032},
 "start": {"seconds": 1650367305, "microseconds": 831012},
 "return": {"status": "running", "singlestep": false, "running": true}}

The response of the QMP command contains the start & end time of
the QMP command processing.
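In the example above the command itself took 831032 - 831012 = 20 microseconds
to execute; any larger latency observed by the client therefore comes from
queueing, dispatch or the transport rather than from the command handler.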

Also, "start" & "end" timestaps are added to qemu guest agent responses as
qemu-ga shares the same code for request dispatching.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
Reviewed-by: Daniel P. Berrangé 
---
v4->v5:
 - use json-number instead of json-value for time values [Vladimir]
 - use a new util function for timestamp printing [Vladimir]

v3->v4:
 - rewrite commit message [Markus]
 - use new fields description in doc [Markus]
 - change type to int64_t [Markus]
 - simplify tests [Markus]

v2->v3:
 - fix typo "timestaps -> timestamps" [Marc-André]

v1->v2:
 - rephrase doc descriptions [Daniel]
 - add tests for qmp timestamps to qmp test and qga test [Daniel]
 - adjust asserts in test-qmp-cmds according to the new number of returning keys

v0->v1:
 - remove interface to control "start" and "end" time values: return timestamps 
unconditionally
 - add description to qmp specification
 - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
   timestamp
 - fix patch description
---
 docs/interop/qmp-spec.txt  | 28 ++--
 include/qapi/util.h|  2 ++
 qapi/qapi-util.c   | 11 +++
 qapi/qmp-dispatch.c| 11 +++
 qapi/qmp-event.c   |  6 +-
 tests/qtest/qmp-test.c | 32 
 tests/unit/test-qga.c  | 29 +
 tests/unit/test-qmp-cmds.c |  4 ++--
 8 files changed, 114 insertions(+), 9 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..ed204b53373e5 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
 
 The format of a success response is:
 
-{ "return": json-value, "id": json-value }
+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-number, "microseconds": json-number},
+  "end": {"seconds": json-number, "microseconds": json-number} }
 
  Where,
 
@@ -169,13 +171,25 @@ The format of a success response is:
   command does not return data
 - The "id" member contains the transaction identification associated
   with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
 
 2.4.2 error
 ---
 
 The format of an error response is:
 
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }
+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-number, &q

[PATCH v0 0/2] virtio-blk and vhost-user-blk cross-device migration

2021-10-04 Thread Denis Plotnikov
It might be useful for the cases when a slow block layer should be replaced
with a more performant one on a running VM without stopping it, i.e. with very low
downtime, comparable with the one on migration.

It's possible to achieve that for two reasons:

1.The VMStates of "virtio-blk" and "vhost-user-blk" are almost the same.
  They consist of the identical VMSTATE_VIRTIO_DEVICE and differ from
  each other only in the values of migration service fields.
2.The device driver used in the guest is the same: virtio-blk

In the series cross-migration is achieved by adding a new type.
The new type uses virtio-blk VMState instead of vhost-user-blk specific
VMstate, also it implements migration save/load callbacks to be compatible
with migration stream produced by "virtio-blk" device.

Adding the new type instead of modifying the existing one is convenient.
It makes it easy to distinguish the new virtio-blk-compatible vhost-user-blk
device from the existing non-compatible one using qemu machinery without any
other modifications. That gives all the variety of qemu device related
constraints out of the box.

0001: adds new type "vhost-user-virtio-blk"
0002: add new type "vhost-user-virtio-blk-pci"

Denis Plotnikov (2):
  vhost-user-blk: add a new vhost-user-virtio-blk type
  vhost-user-blk-pci: add new pci device type to support
vhost-user-virtio-blk

 hw/block/vhost-user-blk.c  | 63 ++
 hw/virtio/vhost-user-blk-pci.c | 43 ++--
 include/hw/virtio/vhost-user-blk.h |  2 +
 3 files changed, 105 insertions(+), 3 deletions(-)

-- 
2.25.1




[PATCH v0 2/2] vhost-user-blk-pci: add new pci device type to support vhost-user-virtio-blk

2021-10-04 Thread Denis Plotnikov
To allow the recently added vhost-user-virtio-blk to work via virtio-pci.

This patch refactors the vhost-user-blk-pci object model to reuse
the existing code.

Signed-off-by: Denis Plotnikov 
---
 hw/virtio/vhost-user-blk-pci.c | 43 +++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/vhost-user-blk-pci.c b/hw/virtio/vhost-user-blk-pci.c
index 33b404d8a225..2f68296af22f 100644
--- a/hw/virtio/vhost-user-blk-pci.c
+++ b/hw/virtio/vhost-user-blk-pci.c
@@ -34,10 +34,18 @@ typedef struct VHostUserBlkPCI VHostUserBlkPCI;
 /*
  * vhost-user-blk-pci: This extends VirtioPCIProxy.
  */
+#define TYPE_VHOST_USER_BLK_PCI_ABSTRACT "vhost-user-blk-pci-abstract-base"
+#define VHOST_USER_BLK_PCI_ABSTRACT(obj) \
+OBJECT_CHECK(VHostUserBlkPCI, (obj), TYPE_VHOST_USER_BLK_PCI_ABSTRACT)
+
 #define TYPE_VHOST_USER_BLK_PCI "vhost-user-blk-pci-base"
 DECLARE_INSTANCE_CHECKER(VHostUserBlkPCI, VHOST_USER_BLK_PCI,
  TYPE_VHOST_USER_BLK_PCI)
 
+#define TYPE_VHOST_USER_VIRTIO_BLK_PCI "vhost-user-virtio-blk-pci-base"
+#define VHOST_USER_VIRTIO_BLK_PCI(obj) \
+OBJECT_CHECK(VHostUserBlkPCI, (obj), TYPE_VHOST_USER_VIRTIO_BLK_PCI)
+
 struct VHostUserBlkPCI {
 VirtIOPCIProxy parent_obj;
 VHostUserBlk vdev;
@@ -52,7 +60,7 @@ static Property vhost_user_blk_pci_properties[] = {
 
 static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
 {
-VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(vpci_dev);
+VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI_ABSTRACT(vpci_dev);
 DeviceState *vdev = DEVICE(&dev->vdev);
 
 if (dev->vdev.num_queues == VHOST_USER_BLK_AUTO_NUM_QUEUES) {
@@ -66,7 +74,8 @@ static void vhost_user_blk_pci_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
 }
 
-static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data)
+static void vhost_user_blk_pci_abstract_class_init(ObjectClass *klass,
+   void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
 VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
@@ -81,6 +90,12 @@ static void vhost_user_blk_pci_class_init(ObjectClass 
*klass, void *data)
 pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
 }
 
+static const VirtioPCIDeviceTypeInfo vhost_user_blk_pci_abstract_info = {
+.base_name  = TYPE_VHOST_USER_BLK_PCI_ABSTRACT,
+.instance_size  = sizeof(VHostUserBlkPCI),
+.class_init = vhost_user_blk_pci_abstract_class_init,
+};
+
 static void vhost_user_blk_pci_instance_init(Object *obj)
 {
 VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(obj);
@@ -92,18 +107,40 @@ static void vhost_user_blk_pci_instance_init(Object *obj)
 }
 
 static const VirtioPCIDeviceTypeInfo vhost_user_blk_pci_info = {
+.parent  = TYPE_VHOST_USER_BLK_PCI_ABSTRACT,
 .base_name   = TYPE_VHOST_USER_BLK_PCI,
 .generic_name= "vhost-user-blk-pci",
 .transitional_name   = "vhost-user-blk-pci-transitional",
 .non_transitional_name   = "vhost-user-blk-pci-non-transitional",
 .instance_size  = sizeof(VHostUserBlkPCI),
 .instance_init  = vhost_user_blk_pci_instance_init,
-.class_init = vhost_user_blk_pci_class_init,
+};
+
+static void vhost_user_virtio_blk_pci_instance_init(Object *obj)
+{
+VHostUserBlkPCI *dev = VHOST_USER_VIRTIO_BLK_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_USER_VIRTIO_BLK);
+object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+  "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_virtio_blk_pci_info = {
+.parent  = TYPE_VHOST_USER_BLK_PCI_ABSTRACT,
+.base_name   = TYPE_VHOST_USER_VIRTIO_BLK_PCI,
+.generic_name= "vhost-user-virtio-blk-pci",
+.transitional_name   = "vhost-user-virtio-blk-pci-transitional",
+.non_transitional_name   = "vhost-user-virtio-blk-pci-non-transitional",
+.instance_size  = sizeof(VHostUserBlkPCI),
+.instance_init  = vhost_user_virtio_blk_pci_instance_init,
 };
 
 static void vhost_user_blk_pci_register(void)
 {
+virtio_pci_types_register(&vhost_user_blk_pci_abstract_info);
 virtio_pci_types_register(&vhost_user_blk_pci_info);
+virtio_pci_types_register(&vhost_user_virtio_blk_pci_info);
 }
 
 type_init(vhost_user_blk_pci_register)
-- 
2.25.1




[PATCH v0 1/2] vhost-user-blk: add a new vhost-user-virtio-blk type

2021-10-04 Thread Denis Plotnikov
The main reason for adding a new type is to make cross-device live migration
between "virtio-blk" and "vhost-user-blk" devices possible in both directions.

It might be useful for the cases when a slow block layer should be replaced
with a more performant one on a running VM without stopping it, i.e. with very low
downtime, comparable with the one on migration.

It's possible to achieve that for two reasons:

1.The VMStates of "virtio-blk" and "vhost-user-blk" are almost the same.
  They consist of the identical VMSTATE_VIRTIO_DEVICE and differ from
  each other only in the values of migration service fields.
2.The device driver used in the guest is the same: virtio-blk

The new type uses virtio-blk VMState instead of vhost-user-blk specific
VMstate, also it implements migration save/load callbacks to be compatible
with migration stream produced by "virtio-blk" device.

Adding the new vhost-user-blk type instead of modifying the existing one
is convenient. It makes it easy to distinguish the new virtio-blk-compatible
vhost-user-blk device from the existing non-compatible one using qemu machinery
without any other modifications. That gives all the variety of qemu device
related constraints out of the box.

Signed-off-by: Denis Plotnikov 
---
 hw/block/vhost-user-blk.c  | 63 ++
 include/hw/virtio/vhost-user-blk.h |  2 +
 2 files changed, 65 insertions(+)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index ba13cb87e520..877fe54e891f 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -30,6 +30,7 @@
 #include "hw/virtio/virtio-access.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
+#include "migration/qemu-file-types.h"
 
 #define REALIZE_CONNECTION_RETRIES 3
 
@@ -612,9 +613,71 @@ static const TypeInfo vhost_user_blk_info = {
 .class_init = vhost_user_blk_class_init,
 };
 
+/*
+ * this is the same as vmstate_virtio_blk
+ * we use it to allow virtio-blk <-> vhost-user-virtio-blk migration
+ */
+static const VMStateDescription vmstate_vhost_user_virtio_blk = {
+.name = "virtio-blk",
+.minimum_version_id = 2,
+.version_id = 2,
+.fields = (VMStateField[]) {
+VMSTATE_VIRTIO_DEVICE,
+VMSTATE_END_OF_LIST()
+},
+};
+
+static void vhost_user_virtio_blk_save(VirtIODevice *vdev, QEMUFile *f)
+{
+/*
+ * put a zero byte in the stream to be compatible with virtio-blk
+ */
+qemu_put_sbyte(f, 0);
+}
+
+static int vhost_user_virtio_blk_load(VirtIODevice *vdev, QEMUFile *f,
+  int version_id)
+{
+if (qemu_get_sbyte(f)) {
+/*
+ * on virtio-blk -> vhost-user-virtio-blk migration we don't expect
+ * to get any inflight requests in the migration stream because
+ * we can't load them yet.
+ * TODO: consider putting those inflight requests to inflight region
+ */
+error_report("%s: can't load in-flight requests",
+ TYPE_VHOST_USER_VIRTIO_BLK);
+return -EINVAL;
+}
+
+return 0;
+}
+
+static void vhost_user_virtio_blk_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+/* override vmstate of vhost_user_blk */
+dc->vmsd = &vmstate_vhost_user_virtio_blk;
+
+/* adding callbacks to be compatible with virtio-blk migration stream */
+vdc->save = vhost_user_virtio_blk_save;
+vdc->load = vhost_user_virtio_blk_load;
+}
+
+static const TypeInfo vhost_user_virtio_blk_info = {
+.name = TYPE_VHOST_USER_VIRTIO_BLK,
+.parent = TYPE_VHOST_USER_BLK,
+.instance_size = sizeof(VHostUserBlk),
+/* instance_init is the same as in parent type */
+.class_init = vhost_user_virtio_blk_class_init,
+};
+
 static void virtio_register_types(void)
 {
 type_register_static(&vhost_user_blk_info);
+type_register_static(&vhost_user_virtio_blk_info);
 }
 
 type_init(virtio_register_types)
diff --git a/include/hw/virtio/vhost-user-blk.h 
b/include/hw/virtio/vhost-user-blk.h
index 7c91f15040eb..d81f18d22596 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -23,6 +23,8 @@
 #include "qom/object.h"
 
 #define TYPE_VHOST_USER_BLK "vhost-user-blk"
+#define TYPE_VHOST_USER_VIRTIO_BLK "vhost-user-virtio-blk"
+
 OBJECT_DECLARE_SIMPLE_TYPE(VHostUserBlk, VHOST_USER_BLK)
 
 #define VHOST_USER_BLK_AUTO_NUM_QUEUES UINT16_MAX
-- 
2.25.1




[PATCH v0] vl: flush all task from rcu queue before exiting

2021-11-02 Thread Denis Plotnikov
The device destruction may overlap with qemu shutdown.
In this case some management layer, which requested a device unplug and is
waiting for the DEVICE_DELETED event, may never get this event.

This happens because device_finalize() may never be called on qemu shutdown
for some devices using address_space_destroy(). The latter is called from
the rcu thread.
On qemu shutdown, not all rcu callbacks may be called because the rcu thread
may not have enough time to converge before the qemu main thread exits.

To resolve this issue, this patch makes the rcu thread finish all its callbacks
explicitly by calling a new rcu interface function right before
the qemu main thread exits.
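
For illustration only (this sketch is not part of the patch): a minimal
management-side flow that breaks here. It issues device_del over a QMP socket
and waits for the matching DEVICE_DELETED event; the socket path and device id
below are made-up assumptions. With the bug described above, the loop can see
the connection close before the event ever arrives.

import json
import socket

def wait_device_deleted(qmp_path, dev_id, timeout=30.0):
    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    sock.connect(qmp_path)
    f = sock.makefile("rw", encoding="utf-8")

    json.loads(f.readline())                       # QMP greeting
    f.write(json.dumps({"execute": "qmp_capabilities"}) + "\n")
    f.flush()
    json.loads(f.readline())                       # {"return": {}}

    f.write(json.dumps({"execute": "device_del",
                        "arguments": {"id": dev_id}}) + "\n")
    f.flush()

    while True:
        line = f.readline()
        if not line:
            # qemu exited before device_finalize() ran in the rcu thread
            raise RuntimeError("DEVICE_DELETED never arrived")
        msg = json.loads(line)                     # replies and events interleave
        if msg.get("event") == "DEVICE_DELETED" and \
           msg.get("data", {}).get("device") == dev_id:
            return msg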

Signed-off-by: Denis Plotnikov 
---
 include/qemu/rcu.h |  1 +
 softmmu/runstate.c |  3 +++
 util/rcu.c | 12 
 3 files changed, 16 insertions(+)

diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf11c..f7fbdc3781e5 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -134,6 +134,7 @@ struct rcu_head {
 
 extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);
 extern void drain_call_rcu(void);
+extern void flush_rcu(void);
 
 /* The operands of the minus operator must have the same type,
  * which must be the one that we specify in the cast.
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 10d9b7365aa7..28f319a97a2b 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -822,5 +822,8 @@ void qemu_cleanup(void)
 monitor_cleanup();
 qemu_chr_cleanup();
 user_creatable_cleanup();
+
+/* finish all the tasks from rcu queue before exiting */
+flush_rcu();
 /* TODO: unref root container, check all devices are ok */
 }
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb2a..f047f8ee8d16 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -348,6 +348,18 @@ void drain_call_rcu(void)
 
 }
 
+/*
+ * This function drains the rcu queue until there are no tasks left to do.
+ * It is aimed at the cases when one needs to ensure that no work hangs
+ * in the rcu thread before proceeding, e.g. on qemu shutdown.
+ */
+void flush_rcu(void)
+{
+while (qatomic_read(&rcu_call_count) > 0) {
+drain_call_rcu();
+}
+}
+
 void rcu_register_thread(void)
 {
 assert(rcu_reader.ctr == 0);
-- 
2.25.1




Re: [PATCH v0] vl: flush all task from rcu queue before exiting

2021-11-02 Thread Denis Plotnikov



On 02.11.2021 16:39, Denis Plotnikov wrote:

The device destruction may overlap with qemu shutdown.
In this case some management layer, which requested a device unplug and is
waiting for the DEVICE_DELETED event, may never get this event.

This happens because device_finalize() may never be called on qemu shutdown
for some devices using address_space_destroy(). The latter is called from
the rcu thread.
On qemu shutdown, not all rcu callbacks may be called because the rcu thread
may not have enough time to converge before the qemu main thread exits.

To resolve this issue, this patch makes the rcu thread finish all its callbacks
explicitly by calling a new rcu interface function right before
the qemu main thread exits.

Signed-off-by: Denis Plotnikov 
---
  include/qemu/rcu.h |  1 +
  softmmu/runstate.c |  3 +++
  util/rcu.c | 12 
  3 files changed, 16 insertions(+)

diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf11c..f7fbdc3781e5 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -134,6 +134,7 @@ struct rcu_head {
  
  extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);

  extern void drain_call_rcu(void);
+extern void flush_rcu(void);
  
  /* The operands of the minus operator must have the same type,

   * which must be the one that we specify in the cast.
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 10d9b7365aa7..28f319a97a2b 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -822,5 +822,8 @@ void qemu_cleanup(void)
actually, flush_rcu() should be here, before monitor_cleanup(), to send
DEVICE_DELETED

  monitor_cleanup();
  qemu_chr_cleanup();
  user_creatable_cleanup();
+
+/* finish all the tasks from rcu queue before exiting */
+flush_rcu();
  /* TODO: unref root container, check all devices are ok */
  }
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb2a..f047f8ee8d16 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -348,6 +348,18 @@ void drain_call_rcu(void)
  
  }
  
+/*

+ * This function drains the rcu queue until there are no tasks left to do.
+ * It is aimed at the cases when one needs to ensure that no work hangs
+ * in the rcu thread before proceeding, e.g. on qemu shutdown.
+ */
+void flush_rcu(void)
+{
+while (qatomic_read(&rcu_call_count) > 0) {
+drain_call_rcu();
+}
+}
+
  void rcu_register_thread(void)
  {
  assert(rcu_reader.ctr == 0);




[Ping][PATCH v0] vl: flush all task from rcu queue before exiting

2021-11-08 Thread Denis Plotnikov

Ping ping!

On 02.11.2021 16:39, Denis Plotnikov wrote:

The device destruction may overlap with qemu shutdown.
In this case some management layer, which requested a device unplug and is
waiting for the DEVICE_DELETED event, may never get this event.

This happens because device_finalize() may never be called on qemu shutdown
for some devices using address_space_destroy(). The latter is called from
the rcu thread.
On qemu shutdown, not all rcu callbacks may be called because the rcu thread
may not have enough time to converge before the qemu main thread exits.

To resolve this issue, this patch makes the rcu thread finish all its callbacks
explicitly by calling a new rcu interface function right before
the qemu main thread exits.

Signed-off-by: Denis Plotnikov 
---
  include/qemu/rcu.h |  1 +
  softmmu/runstate.c |  3 +++
  util/rcu.c | 12 
  3 files changed, 16 insertions(+)

diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf11c..f7fbdc3781e5 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -134,6 +134,7 @@ struct rcu_head {
  
  extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);

  extern void drain_call_rcu(void);
+extern void flush_rcu(void);
  
  /* The operands of the minus operator must have the same type,

   * which must be the one that we specify in the cast.
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 10d9b7365aa7..28f319a97a2b 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -822,5 +822,8 @@ void qemu_cleanup(void)
  monitor_cleanup();
  qemu_chr_cleanup();
  user_creatable_cleanup();
+
+/* finish all the tasks from rcu queue before exiting */
+flush_rcu();
  /* TODO: unref root container, check all devices are ok */
  }
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb2a..f047f8ee8d16 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -348,6 +348,18 @@ void drain_call_rcu(void)
  
  }
  
+/*

+ * This function drains the rcu queue until there are no tasks left to do.
+ * It is aimed at the cases when one needs to ensure that no work hangs
+ * in the rcu thread before proceeding, e.g. on qemu shutdown.
+ */
+void flush_rcu(void)
+{
+while (qatomic_read(&rcu_call_count) > 0) {
+drain_call_rcu();
+}
+}
+
  void rcu_register_thread(void)
  {
  assert(rcu_reader.ctr == 0);




Re: [Ping][PATCH v0] vl: flush all task from rcu queue before exiting

2021-11-10 Thread Denis Plotnikov



On 09.11.2021 20:46, Paolo Bonzini wrote:

On 11/9/21 08:23, Denis Plotnikov wrote:

Ping ping!


Looks good, but can you explain why it's okay to call it before 
qemu_chr_cleanup() and user_creatable_cleanup()?


I think a better solution to the ordering problem would be:

  qemu_chr_cleanup();
  user_creatable_cleanup();
  flush_rcu();
  monitor_cleanup();

I agree, this looks better


with something like this:

diff --git a/chardev/char-fe.c b/chardev/char-fe.c
index 7789f7be9c..f0c3ea5447 100644
--- a/chardev/char-fe.c
+++ b/chardev/char-fe.c
@@ -195,6 +195,7 @@ bool qemu_chr_fe_init(CharBackend *b,
 int tag = 0;

 if (s) {
+    object_ref(OBJECT(s));
 if (CHARDEV_IS_MUX(s)) {
 MuxChardev *d = MUX_CHARDEV(s);

@@ -241,6 +242,7 @@ void qemu_chr_fe_deinit(CharBackend *b, bool del)
 } else {
 object_unref(obj);
 }
+    object_unref(obj);
 }
 b->chr = NULL;
 }

to keep the chardev live between qemu_chr_cleanup() and 
monitor_cleanup().


but frankly speaking I don't understand why we have to do ref/unref in
char-fe interface functions, instead of just ref/unref-ing the monitor's char
device directly, like this:


diff --git a/monitor/monitor.c b/monitor/monitor.c
index 21c7a68758f5..3692a8e15268 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -611,6 +611,7 @@ void monitor_data_destroy(Monitor *mon)
 {
 g_free(mon->mon_cpu_path);
 qemu_chr_fe_deinit(&mon->chr, false);
+    object_unref(OBJECT(&mon->chr));
 if (monitor_is_qmp(mon)) {
 monitor_data_destroy_qmp(container_of(mon, MonitorQMP, common));
 } else {
@@ -737,6 +738,7 @@ int monitor_init(MonitorOptions *opts, bool 
allow_hmp, Error **errp)

 error_propagate(errp, local_err);
 return -1;
 }
+    object_ref(OBJECT(chr));
 return 0;
 }

Maybe this shows the intention better?

Denis



Paolo





[PATCH v4] qapi/qmp: Add timestamps to qmp command responses

2022-11-01 Thread Denis Plotnikov
Add "start" & "end" time values to QMP command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought
by other parts of the management layer or qemu internals.
This helps to look for problems proactively from the management layer side.
The management layer would be able to detect problem cases by calculating
QMP command execution time:
1. execution_time_from_mgmt_perspective -
   execution_time_of_qmp_command > some_threshold
   This detects problems with management layer or internal qemu QMP command
   dispatching
2. current_qmp_command_execution_time > avg_qmp_command_execution_time
   This detects that a certain QMP command starts to execute longer than
   usual
In both these cases a more thorough investigation of the root causes should be
done by using some qemu tracepoints, depending on the particular QMP command under
investigation or by other means. The timestamps help to avoid excessive log
output when qemu tracepoints are used to address similar cases.

Example of result:

./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

(QEMU) query-status
{"end": {"seconds": 1650367305, "microseconds": 831032},
 "start": {"seconds": 1650367305, "microseconds": 831012},
 "return": {"status": "running", "singlestep": false, "running": true}}

The response of the QMP command contains the start & end time of
the QMP command processing.

Also, "start" & "end" timestaps are added to qemu guest agent responses as
qemu-ga shares the same code for request dispatching.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
Reviewed-by: Daniel P. Berrangé 
---
v3->v4
 - rewrite commit message [Markus]
 - use new fields description in doc [Markus]
 - change type to int64_t [Markus]
 - simplify tests [Markus]

v2->v3:
 - fix typo "timestaps -> timestamps" [Marc-André]

v1->v2:
 - rephrase doc descriptions [Daniel]
 - add tests for qmp timestamps to qmp test and qga test [Daniel]
 - adjust asserts in test-qmp-cmds according to the new number of returning keys

v0->v1:
 - remove interface to control "start" and "end" time values: return timestamps 
unconditionally
 - add description to qmp specification
 - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
   timestamp
 - fix patch description

 docs/interop/qmp-spec.txt  | 28 ++--
 qapi/qmp-dispatch.c| 18 ++
 tests/qtest/qmp-test.c | 32 
 tests/unit/test-qga.c  | 29 +
 tests/unit/test-qmp-cmds.c |  4 ++--
 5 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..0dd8e716c02f0 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
 
 The format of a success response is:
 
-{ "return": json-value, "id": json-value }
+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -169,13 +171,25 @@ The format of a success response is:
   command does not return data
 - The "id" member contains the transaction identification associated
   with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a json-object with the number of seconds and microseconds
+  since the Unix epoch
 
 2.4.2 error
 ---
 
 The format of an error response is:
 
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }
+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -184,6 +198,16 @@ The format of an error response is:
   not attempt to parse this message.
 - The "id&

[patch v0] qapi/qmp: Add timestamps to qmp command responses.

2022-09-26 Thread Denis Plotnikov
Add "start" & "end" timestamps to qmp command responses.
It's disabled by default, but can be enabled with the 'timestamp=on'
monitor parameter, e.g.:
-chardev  socket,id=mon1,path=/tmp/qmp.socket,server=on,wait=off
-mon chardev=mon1,mode=control,timestamp=on

Example of result:

./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

(QEMU) query-status
{"end": {"seconds": 1650367305, "microseconds": 831032},
 "start": {"seconds": 1650367305, "microseconds": 831012},
 "return": {"status": "running", "singlestep": false, "running": true}}

The responce of the qmp command contains the start & end time of
the qmp command processing.

These times may be helpful for the management layer in understanding of
the actual timeline of a qmp command processing.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
---
 include/monitor/monitor.h   |  2 +-
 include/qapi/qmp/dispatch.h |  2 +-
 monitor/monitor-internal.h  |  1 +
 monitor/monitor.c   |  9 -
 monitor/qmp.c   |  5 +++--
 qapi/control.json   |  3 +++
 qapi/qmp-dispatch.c | 28 +++-
 qga/main.c  |  2 +-
 stubs/monitor-core.c|  2 +-
 tests/unit/test-qmp-cmds.c  |  6 +++---
 10 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index a4b40e8391db4..2a18e9ee34bc2 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -19,7 +19,7 @@ bool monitor_cur_is_qmp(void);
 
 void monitor_init_globals(void);
 void monitor_init_globals_core(void);
-void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp);
+void monitor_init_qmp(Chardev *chr, bool pretty, bool timestamp, Error **errp);
 void monitor_init_hmp(Chardev *chr, bool use_readline, Error **errp);
 int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp);
 int monitor_init_opts(QemuOpts *opts, Error **errp);
diff --git a/include/qapi/qmp/dispatch.h b/include/qapi/qmp/dispatch.h
index 1e4240fd0dbc0..d07f5764271be 100644
--- a/include/qapi/qmp/dispatch.h
+++ b/include/qapi/qmp/dispatch.h
@@ -56,7 +56,7 @@ const char *qmp_command_name(const QmpCommand *cmd);
 bool qmp_has_success_response(const QmpCommand *cmd);
 QDict *qmp_error_response(Error *err);
 QDict *qmp_dispatch(const QmpCommandList *cmds, QObject *request,
-bool allow_oob, Monitor *cur_mon);
+bool allow_oob, bool timestamp, Monitor *cur_mon);
 bool qmp_is_oob(const QDict *dict);
 
 typedef void (*qmp_cmd_callback_fn)(const QmpCommand *cmd, void *opaque);
diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h
index caa2e90ef22a4..69425a7bc8152 100644
--- a/monitor/monitor-internal.h
+++ b/monitor/monitor-internal.h
@@ -136,6 +136,7 @@ typedef struct {
 Monitor common;
 JSONMessageParser parser;
 bool pretty;
+bool timestamp;
 /*
  * When a client connects, we're in capabilities negotiation mode.
  * @commands is &qmp_cap_negotiation_commands then.  When command
diff --git a/monitor/monitor.c b/monitor/monitor.c
index 86949024f643a..85a0b6498dbc1 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -726,7 +726,7 @@ int monitor_init(MonitorOptions *opts, bool allow_hmp, 
Error **errp)
 
 switch (opts->mode) {
 case MONITOR_MODE_CONTROL:
-monitor_init_qmp(chr, opts->pretty, &local_err);
+monitor_init_qmp(chr, opts->pretty, opts->timestamp, &local_err);
 break;
 case MONITOR_MODE_READLINE:
 if (!allow_hmp) {
@@ -737,6 +737,10 @@ int monitor_init(MonitorOptions *opts, bool allow_hmp, 
Error **errp)
 error_setg(errp, "'pretty' is not compatible with HMP monitors");
 return -1;
 }
+if (opts->timestamp) {
+error_setg(errp, "'timestamp' is not compatible with HMP 
monitors");
+return -1;
+}
 monitor_init_hmp(chr, true, &local_err);
 break;
 default:
@@ -782,6 +786,9 @@ QemuOptsList qemu_mon_opts = {
 },{
 .name = "pretty",
 .type = QEMU_OPT_BOOL,
+},{
+.name = "timestamp",
+.type = QEMU_OPT_BOOL,
 },
 { /* end of list */ }
 },
diff --git a/monitor/qmp.c b/monitor/qmp.c
index 092c527b6fc9c..fd487fee9f850 100644
--- a/monitor/qmp.c
+++ b/monitor/qmp.c
@@ -142,7 +142,7 @@ static void monitor_qmp_dispatch(MonitorQMP *mon, QObject 
*req)
 QDict *error;
 
 rsp = qmp_dispatch(mon->commands, req, qmp_oob_enabled(mon),
-   &mon->common);
+   mon->timestamp, &mon->common);
 
 if (mon->commands == &qmp_cap_negotiation_commands) {

Re: [patch v0] qapi/qmp: Add timestamps to qmp command responses.

2022-09-27 Thread Denis Plotnikov



On 27.09.2022 09:04, Markus Armbruster wrote:

Daniel P. Berrangé  writes:


On Mon, Sep 26, 2022 at 12:59:40PM +0300, Denis Plotnikov wrote:

Add "start" & "end" timestamps to qmp command responses.
It's disabled by default, but can be enabled with 'timestamp=on'
monitor's parameter, e.g.:
 -chardev  socket,id=mon1,path=/tmp/qmp.socket,server=on,wait=off
 -mon chardev=mon1,mode=control,timestamp=on

I'm not convinced a cmdline flag is the right approach here.

I think it ought be something defined by the QMP spec.

The QMP spec is docs/interop/qmp-spec.txt.  The feature needs to be
defined there regardless of how we control it.

ok, thanks for pointing out



The "QMP" greeting should report "timestamp" capabilities.

The 'qmp_capabilities' command can be used to turn on this
capability for all commands henceforth.

Yes, this is how optional QMP protocol features should be controlled.

Bonus: control is per connection, not just globally.


As an option extra, the 'execute' command could gain a
parameter to allow this to be requested for only an
individual command.

Needs a use case.


Alternatively we could say the overhead of adding the timestmaps
is small enough that we just add this unconditionally for
everything hence, with no opt-in/opt-out.

Yes, because the extension is backwards compatible.


Maybe it is worth sending the timestamps always in the response, if that doesn't
contradict anything and doesn't bring any unnecessary data overhead.


On the other hand, turning it on via qmp capabilities seems to be a more
flexible solution.




Aside: qmp-spec.txt could be clearer on what that means.


Example of result:

 ./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

 (QEMU) query-status
 {"end": {"seconds": 1650367305, "microseconds": 831032},
  "start": {"seconds": 1650367305, "microseconds": 831012},
  "return": {"status": "running", "singlestep": false, "running": true}}

The responce of the qmp command contains the start & end time of
the qmp command processing.

Seconds and microseconds since when?  The update to qmp-spec.txt should
tell.

Why split the time into seconds and microseconds?  If you use
microseconds since the Unix epoch (1970-01-01 UTC), 64 bit unsigned will
result in a year 586524 problem:

 $ date --date "@`echo '2^64/100' | bc`"
 Wed Jan 19 09:01:49 CET 586524

Even a mere 53 bits will last until 2255.
This is just for convenience; maybe it's too much and a timestamp in msec
is enough.



These times may be helpful for the management layer in understanding of
the actual timeline of a qmp command processing.

Can you explain the problem scenario in more detail.

Yes, please, because:


The mgmt app already knows when it send the QMP command and knows
when it gets the QMP reply.  This covers the time the QMP was
queued before processing (might be large if QMP is blocked on
another slow command) , the processing time, and the time any
reply was queued before sending (ought to be small).

So IIUC, the value these fields add is that they let the mgmt
app extract only the command processing time, eliminating
any variance due to queueing before/after.
So the scenario is the following: we need a means to understand, from the
management layer perspective, what the timeline of the command execution
is. This is needed for problem resolving if a qmp command
executes for too long from the management layer point of view.
Specifically, the management layer sees the execution time as
"management_layer_internal_routine_time" + "qemu_dispatching_time" +
"qemu_qmp_command_execution_time". The suggested qmp command timestamps give
"qemu_qmp_command_execution_time". The management layer calculates
"management_layer_internal_routine_time" internally. Using those two
things we can calculate "qemu_dispatching_time" and decide where the
potential delays come from. This gives us a direction for further
problem investigation.
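
A rough sketch of that calculation (the "start"/"end" objects and their
"seconds"/"microseconds" members come from this patch; the management-layer
measurements are whatever the management layer records on its own):

def to_usec(ts):
    # ts is the "start" or "end" object of a QMP reply
    return ts["seconds"] * 1_000_000 + ts["microseconds"]

def split_latency(reply, total_usec, mgmt_internal_usec):
    """Break the management-observed latency into its parts."""
    qmp_exec_usec = to_usec(reply["end"]) - to_usec(reply["start"])
    qemu_dispatching_usec = total_usec - mgmt_internal_usec - qmp_exec_usec
    return qmp_exec_usec, qemu_dispatching_usec

A large qemu_dispatching_usec together with a small qmp_exec_usec points at
queueing/dispatch delays rather than at the command handler itself.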



Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 




[PATCH v1] qapi/qmp: Add timestamps to qmp command responses

2022-10-07 Thread Denis Plotnikov
Add "start" & "end" time values to qmp command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought by
other parts of the management layer or qemu internals. This is particularly useful
for the management layer logging, for resolving problems later.

Example of result:

./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

(QEMU) query-status
{"end": {"seconds": 1650367305, "microseconds": 831032},
 "start": {"seconds": 1650367305, "microseconds": 831012},
 "return": {"status": "running", "singlestep": false, "running": true}}

The responce of the qmp command contains the start & end time of
the qmp command processing.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
---
v0->v1:
 - remove interface to control "start" and "end" time values: return timestamps 
unconditionally
 - add description to qmp specification
 - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
   timestamp
 - fix patch description

 docs/interop/qmp-spec.txt | 20 ++--
 qapi/qmp-dispatch.c   | 18 ++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..d1cca8bc447ce 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
 
 The format of a success response is:
 
-{ "return": json-value, "id": json-value }
+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -169,13 +171,21 @@ The format of a success response is:
   command does not return data
 - The "id" member contains the transaction identification associated
   with the command execution if issued by the Client
+- The "start" member contains the exact time of when the command has been
+  stated to be processed. It is a fixed json-object with time in
+  seconds and microseconds relative to the Unix Epoch (1 Jan 1970)
+- The "end" member contains the exact time of when the command has been
+  finished to be processed. It is a fixed json-object with time in
+  seconds and microseconds relative to the Unix Epoch (1 Jan 1970)
 
 2.4.2 error
 ---
 
 The format of an error response is:
 
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }
+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -184,6 +194,12 @@ The format of an error response is:
   not attempt to parse this message.
 - The "id" member contains the transaction identification associated with
   the command execution if issued by the Client
+- The "start" member contains the exact time of when the command has been
+  stated to be processed. It is a fixed json-object with time in
+  seconds and microseconds relative to the Unix Epoch (1 Jan 1970)
+- The "end" member contains the exact time of when the command has been
+  finished to be processed. It is a fixed json-object with time in
+  seconds and microseconds relative to the Unix Epoch (1 Jan 1970)
 
 NOTE: Some errors can occur before the Server is able to read the "id" member,
 in these cases the "id" member will not be part of the error response, even
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index 0990873ec8ec1..fce87416f2128 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -130,6 +130,22 @@ static void do_qmp_dispatch_bh(void *opaque)
 aio_co_wake(data->co);
 }
 
+static void add_timestamps(QDict *qdict, uint64_t start_ms, uint64_t end_ms)
+{
+QDict *start_dict, *end_dict;
+
+start_dict = qdict_new();
+qdict_put_int(start_dict, "seconds", start_ms / G_USEC_PER_SEC);
+qdict_put_int(start_dict, "microseconds", start_ms % G_USEC_PER_SEC);
+
+end_dict = qdict_new();
+qdict_put_int(end_dict, "seconds", end_ms / G_USEC_PER_SEC);
+qdict_put_int(end_dict, "microseconds", end_ms % G_USEC_PER_SEC);
+
+qdict_put_obj(qdict, "start", QOBJECT(start_dict));
+qdict_put_obj(qdict, "end", QOBJECT(end_dict));
+}
+
 /*
  * Runs outside of coroutine context for OOB commands, but in coroutine
  * context for everything else.
@@ -146,6 +162,7 @@ QDict *qmp_dispatch(const QmpCommandList *cmds, QObject 
*request,
 QObject *id;
 QObject *ret = NULL;
 QDict *rsp = NULL;
+uint64_t ts_start = g_get_real_time();
 
 dict = qobject_to(QDict, request);
 if (!dict) {
@@ -270,5 +287,6 @@ out:
 qdict_put_obj(rsp, "id", qobject_ref(id));
 }
 
+add_timestamps(rsp, ts_start, g_get_real_time());
 return rsp;
 }
-- 
2.25.1




[PATCH v2] qapi/qmp: Add timestamps to qmp command responses

2022-10-11 Thread Denis Plotnikov
Add "start" & "end" time values to qmp command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought by
other parts of the management layer or qemu internals. This is particularly useful
for the management layer logging, for resolving problems later.

Example of result:

./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

(QEMU) query-status
{"end": {"seconds": 1650367305, "microseconds": 831032},
 "start": {"seconds": 1650367305, "microseconds": 831012},
 "return": {"status": "running", "singlestep": false, "running": true}}

The responce of the qmp command contains the start & end time of
the qmp command processing.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
---
v0->v1:
 - remove interface to control "start" and "end" time values: return timestamps 
unconditionally
 - add description to qmp specification
 - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
   timestamp
 - fix patch description

v1->v2:
 - rephrase doc descriptions [Daniel]
 - add tests for qmp timestamps to qmp test and qga test [Daniel]
 - adjust asserts in test-qmp-cmds according to the new number of returning keys

 docs/interop/qmp-spec.txt  | 28 ++--
 qapi/qmp-dispatch.c| 18 ++
 tests/qtest/qmp-test.c | 34 ++
 tests/unit/test-qga.c  | 31 +++
 tests/unit/test-qmp-cmds.c |  4 ++--
 5 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..2e0b7de0c4dc7 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
 
 The format of a success response is:
 
-{ "return": json-value, "id": json-value }
+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -169,13 +171,25 @@ The format of a success response is:
   command does not return data
 - The "id" member contains the transaction identification associated
   with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
 
 2.4.2 error
 ---
 
 The format of an error response is:
 
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }
+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -184,6 +198,16 @@ The format of an error response is:
   not attempt to parse this message.
 - The "id" member contains the transaction identification associated with
   the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
 
 NOTE: Some errors can occur before the Server is able to read the "id" member,
 in these cases the "id" member will not be part of the error response, even
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index 0990873ec8ec1..fce87416f2128 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -130,6 +130,22 @@ static void do_qmp_disp

[PATCH v3] qapi/qmp: Add timestamps to qmp command responses

2022-10-11 Thread Denis Plotnikov
Add "start" & "end" time values to qmp command responses.

These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought by
other parts of the management layer or qemu internals. This is particularly useful
for the management layer logging, for resolving problems later.

Example of result:

./qemu/scripts/qmp/qmp-shell /tmp/qmp.socket

(QEMU) query-status
{"end": {"seconds": 1650367305, "microseconds": 831032},
 "start": {"seconds": 1650367305, "microseconds": 831012},
 "return": {"status": "running", "singlestep": false, "running": true}}

The responce of the qmp command contains the start & end time of
the qmp command processing.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
Reviewed-by: Daniel P. Berrangé 
---

v0->v1:
 - remove interface to control "start" and "end" time values: return timestamps 
unconditionally
 - add description to qmp specification
 - leave the same timestamp format in "seconds", "microseconds" to be 
consistent with events
   timestamp
 - fix patch description

v1->v2:
 - rephrase doc descriptions [Daniel]
 - add tests for qmp timestamps to qmp test and qga test [Daniel]
 - adjust asserts in test-qmp-cmds according to the new number of returning keys

v2->v3:
 - fix typo "timestaps -> timestamps" [Marc-André]

 docs/interop/qmp-spec.txt  | 28 ++--
 qapi/qmp-dispatch.c| 18 ++
 tests/qtest/qmp-test.c | 34 ++
 tests/unit/test-qga.c  | 31 +++
 tests/unit/test-qmp-cmds.c |  4 ++--
 5 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/docs/interop/qmp-spec.txt b/docs/interop/qmp-spec.txt
index b0e8351d5b261..2e0b7de0c4dc7 100644
--- a/docs/interop/qmp-spec.txt
+++ b/docs/interop/qmp-spec.txt
@@ -158,7 +158,9 @@ responses that have an unknown "id" field.
 
 The format of a success response is:
 
-{ "return": json-value, "id": json-value }
+{ "return": json-value, "id": json-value,
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -169,13 +171,25 @@ The format of a success response is:
   command does not return data
 - The "id" member contains the transaction identification associated
   with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
 
 2.4.2 error
 ---
 
 The format of an error response is:
 
-{ "error": { "class": json-string, "desc": json-string }, "id": json-value }
+{ "error": { "class": json-string, "desc": json-string }, "id": json-value
+  "start": {"seconds": json-value, "microseconds": json-value},
+  "end": {"seconds": json-value, "microseconds": json-value} }
 
  Where,
 
@@ -184,6 +198,16 @@ The format of an error response is:
   not attempt to parse this message.
 - The "id" member contains the transaction identification associated with
   the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
+- The "end" member contains the exact time of when the server
+  finished executing the command. This excludes any time the
+  command response spent queued, waiting to be sent on the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)
 
 NOTE: Some errors can occur before the Server is able to read the "id" member,
 in these cases the "id" member will not be part of the error response, even
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index 0990873ec8ec1..fce8

Re: [PATCH v3] qapi/qmp: Add timestamps to qmp command responses

2022-10-14 Thread Denis Plotnikov



On 13.10.2022 18:00, Markus Armbruster wrote:

Denis Plotnikov  writes:


Add "start" & "end" time values to qmp command responses.

Please spell it QMP.  More of the same below.

ok

Can you tell me about a problem you cracked (or could have cracked) with
the help of this?


We have a management layer which interacts with qemu via qmp. When it
issues a qmp command, we measure the time it takes to perform that
command. Some of those commands seem to execute longer than expected.
In that case there is a question of what part of the command
execution takes the majority of the time. Is it a flaw in the management
layer, in qemu qmp command scheduling, or in the qmp command execution
itself? The timestamps being added help to exclude the qmp command
execution time from the question. Also, timestamps help to learn the
exact time when the command started and ended and to put that
information into the system logs properly.



  "return": {"status": "running", "singlestep": false, "running": true}}

The responce of the qmp command contains the start & end time of

response

ok



the qmp command processing.

Suggested-by: Andrey Ryabinin 
Signed-off-by: Denis Plotnikov 
Reviewed-by: Daniel P. Berrangé 

Please spell out that this affects both QMP and qemu-ga.

ok

command does not return data
  - The "id" member contains the transaction identification associated
with the command execution if issued by the Client
+- The "start" member contains the exact time of when the server
+  started executing the command. This excludes any time the
+  command request spent queued, after reading it off the wire.
+  It is a fixed json-object with time in seconds and microseconds
+  relative to the Unix Epoch (1 Jan 1970)

What's a "fixed json-object"?

Hmm, I guess you're copying from the description of event member
"timestamp".

That's right

Let's go with "a json-object with the number of seconds and microseconds
since the Unix epoch" everywhere.

ok


Make this int64_t, because that's what g_get_real_time() returns.

Same for add_timestamps() parameters.

ok, will fix the type everywhere


+qobject_unref(resp);
I'd be tempted to fold this into existing tests.


Do you want me to put the timestamp checking into an existing testcase?


Thanks,

Denis




+
  qtest_quit(qts);
  }
  
diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c

index b4e0a145737d1..18ec9bac3650e 100644
--- a/tests/unit/test-qga.c
+++ b/tests/unit/test-qga.c
@@ -217,6 +217,36 @@ static void test_qga_ping(gconstpointer fix)
  qmp_assert_no_error(ret);
  }
  
+static void test_qga_timestamps(gconstpointer fix)

+{
+QDict *start, *end;
+uint64_t start_s, start_us, end_s, end_us, start_ts, end_ts;
+const TestFixture *fixture = fix;
+g_autoptr(QDict) ret = NULL;
+
+ret = qmp_fd(fixture->fd, "{'execute': 'guest-ping'}");
+g_assert_nonnull(ret);
+qmp_assert_no_error(ret);
+
+start = qdict_get_qdict(ret, "start");
+g_assert(start);
+end = qdict_get_qdict(ret, "end");
+g_assert(end);
+
+start_s = qdict_get_try_int(start, "seconds", 0);
+g_assert(start_s);
+start_us = qdict_get_try_int(start, "microseconds", 0);
+
+end_s = qdict_get_try_int(end, "seconds", 0);
+g_assert(end_s);
+end_us = qdict_get_try_int(end, "microseconds", 0);
+
+start_ts = (start_s * G_USEC_PER_SEC) + start_us;
+end_ts = (end_s * G_USEC_PER_SEC) + end_us;
+
+g_assert(end_ts > start_ts);
+}
+
  static void test_qga_id(gconstpointer fix)
  {
  const TestFixture *fixture = fix;
@@ -948,6 +978,7 @@ int main(int argc, char **argv)
  g_test_add_data_func("/qga/sync-delimited", &fix, 
test_qga_sync_delimited);
  g_test_add_data_func("/qga/sync", &fix, test_qga_sync);
  g_test_add_data_func("/qga/ping", &fix, test_qga_ping);
+g_test_add_data_func("/qga/timestamps", &fix, test_qga_timestamps);
  g_test_add_data_func("/qga/info", &fix, test_qga_info);
  g_test_add_data_func("/qga/network-get-interfaces", &fix,
   test_qga_network_get_interfaces);
diff --git a/tests/unit/test-qmp-cmds.c b/tests/unit/test-qmp-cmds.c
index 6085c099950b5..54d63bb8e346f 100644
--- a/tests/unit/test-qmp-cmds.c
+++ b/tests/unit/test-qmp-cmds.c
@@ -154,7 +154,7 @@ static QObject *do_qmp_dispatch(bool allow_oob, const char 
*template, ...)
  g_assert(resp);
  ret = qdict_get(resp, "return");
  g_assert(ret);
-g_assert(qdict_size(resp) == 1);
+g_assert(qdict_size(resp) == 3);
  
  qobject_ref(ret);

  qobject_unref(resp);
@@ -181,7 +181,7 @@ static void do_qmp_dispatch_error(bool allow_oob, 
ErrorClass cls,
  ==, QapiErrorClass_str(cls));
  g_assert(qdict_get_try_str(error, "desc"));
  g_assert(qdict_size(error) == 2);
-g_assert(qdict_size(resp) == 1);
+g_assert(qdict_size(resp) == 3);
  
  qobject_unref(resp);

  qobject_unref(req);




Re: [PATCH v3] qapi/qmp: Add timestamps to qmp command responses

2022-10-16 Thread Denis Plotnikov



On 14.10.2022 16:19, Daniel P. Berrangé wrote:

On Fri, Oct 14, 2022 at 02:57:06PM +0200, Markus Armbruster wrote:

Daniel P. Berrangé  writes:


On Fri, Oct 14, 2022 at 11:31:13AM +0200, Markus Armbruster wrote:

Daniel P. Berrangé  writes:


On Thu, Oct 13, 2022 at 05:00:26PM +0200, Markus Armbruster wrote:

Denis Plotnikov  writes:


Add "start" & "end" time values to qmp command responses.

Please spell it QMP.  More of the same below.


These time values are added to let the qemu management layer get the exact
command execution time without any other time variance which might be brought by
other parts of management layer or qemu internals. This is particulary useful
for the management layer logging for later problems resolving.

I'm still having difficulties seeing the value add over existing
tracepoints and logging.

Can you tell me about a problem you cracked (or could have cracked) with
the help of this?

Consider your QMP client is logging all commands and replies in its
own logfile (libvirt can do this). Having these start/end timestamps
included means the QMP client log is self-contained.

A QMP client can include client-side timestamps in its log.  What value
is being added by server-side timestamps?  According to the commit
message, it's for getting "the exact command execution time without any
other time variance which might be brought by other parts of management
layer or qemu internals."  Why is that useful?  In particular, why is
excluding network and QEMU queueing delays (inbound and outbound)
useful?

Let's say some command normally runs in ~100ms, but occasionally
runs in 2 secs, and you want to understand why.

A first step is understanding whether a given command itself is
slow at executing, or whether its execution has merely been
delayed because some other aspect of QEMU has delayed its execution.
If the server timestamps show it was very fast, then that indicates
delayed processing. Thus instead of debugging the slow command, I
can think about what scenarios would be responsible for the delay.
Perhaps a previous QMP command was very slow, or maybe there is
simply a large volume of QMP commands backlogged, or some part of
QEMU got blocked.

Another case would be a command that is normally fast, and sometimes
is slower, but still relatively fast. The network and queueing side
might be a significant enough proportion of the total time to obscure
the slowdown. If you can eliminate the non-execution time, you can
see the performance trends over time to spot the subtle slowdowns
and detect abnormal behaviour before it becomes too terrible.

This is troubleshooting.  Asking for better troubleshooting tools is
fair.

However, the proposed timestamps provide much more limited insight than
existing tracepoints.  For instance, enabling

tracepoints are absolutely great and let you get a hell of a lot
more information, *provided* you are in a position to actually
use tracepoints. This is, unfortunately, frequently not the case
when supporting real world production deployments.

Exactly!!! Thanks for pointing that out!


Bug reports from customers typically include little more than a
log file they got from the mgmt client at the time the problem happened.
The problem experienced may no longer exist, so asking them to run
a tracepoint script is not possible. They may also be reluctant to
actually run tracepoint scripts on a production system, or simply
lack the ability to do so at all, due to constraints of the deployment
environment. Logs from libvirt are something that is collected by
default for many mgmt apps, or can be turned on by the user with
minimal risk of disruption.

Overall, there's a compelling desire to be proactive in collecting
information ahead of time, that might be useful in diagnosing
future bug reports.


This is the main reason. When you encounter a problem, one of the first
questions is "Was there something similar in the past?" Another question
is how often it happens.


With the timestamps, answering these questions becomes easier.

Another thing is that with the QMP command timestamps you can build a
monitoring system which will report the cases when
execution_time_from_mgmt_perspective - execution_time_qmp_command >
some_threshold, which in turn proactively tells you about potential
problems. And then you'll start using the QMP tracepoints (and other
means) to figure out the real reason for the execution time variance.
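
As a rough illustration of that kind of check (a sketch only: the field
names, the microsecond units and the helper below are illustrative
assumptions, not taken from the patch or from any management layer):

#include <stdbool.h>
#include <stdint.h>

/*
 * Hypothetical management-layer helper: compare the round-trip time the
 * client measured itself against the execution time reported by QEMU in
 * the reply, and flag the command when the difference (network plus
 * queueing inside QEMU) exceeds a threshold.  All values in microseconds.
 */
static bool qmp_delay_suspicious(int64_t client_sent_us,
                                 int64_t client_received_us,
                                 int64_t server_start_us,
                                 int64_t server_end_us,
                                 int64_t threshold_us)
{
    int64_t total_us = client_received_us - client_sent_us;
    int64_t exec_us = server_end_us - server_start_us;

    return (total_us - exec_us) > threshold_us;
}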


Thanks, Denis



So it isn't an 'either / or' decision of QMP reply logs vs use of
tracepoints; both are beneficial, with their own pros/cons.

With regards,
Daniel




Re: [PATCH v2] virtio: add VIRTQUEUE_ERROR QAPI event

2023-09-13 Thread Denis Plotnikov

Reviewed-by: Denis Plotnikov 

On 9/12/23 20:57, Vladimir Sementsov-Ogievskiy wrote:

For now we only log the vhost device error when the virtqueue is actually
stopped. Let's add a QAPI event, which makes it possible to:

  - collect statistics of such errors
  - make immediate actions: take core dumps or do some other debugging
  - inform the user through a management API or UI, so that (s)he can
   react somehow, e.g. reset the device driver in the guest or even
   build up some automation to do so

Note that basically every inconsistency discovered during virtqueue
processing results in a silent virtqueue stop.  The guest then just
sees the requests getting stuck somewhere in the device for no visible
reason.  This event provides a means to inform the management layer of
this situation in a timely fashion.

The event could be reused for some other virtqueue problems (not only
for vhost devices) in future. For this it gets a generic name and
structure.

We keep the original VHOST_OPS_DEBUG() to keep the original debug output as is;
it's not the only call to VHOST_OPS_DEBUG in the file.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

v2: - improve commit message (just stole wording from Roman, hope he doesn't
   mind:)
 - add event throttling

  hw/virtio/vhost.c | 12 +---
  monitor/monitor.c | 10 ++
  qapi/qdev.json| 25 +
  3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index e2f6ffb446..162899feee 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -15,6 +15,7 @@
  
  #include "qemu/osdep.h"

  #include "qapi/error.h"
+#include "qapi/qapi-events-qdev.h"
  #include "hw/virtio/vhost.h"
  #include "qemu/atomic.h"
  #include "qemu/range.h"
@@ -1332,11 +1333,16 @@ static void 
vhost_virtqueue_error_notifier(EventNotifier *n)
  struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
error_notifier);
  struct vhost_dev *dev = vq->dev;
-int index = vq - dev->vqs;
  
  if (event_notifier_test_and_clear(n) && dev->vdev) {

-VHOST_OPS_DEBUG(-EINVAL,  "vhost vring error in virtqueue %d",
-dev->vq_index + index);
+int ind = vq - dev->vqs + dev->vq_index;
+DeviceState *ds = &dev->vdev->parent_obj;
+
+VHOST_OPS_DEBUG(-EINVAL,  "vhost vring error in virtqueue %d", ind);
+qapi_event_send_virtqueue_error(ds->id, ds->canonical_path, ind,
+VIRTQUEUE_ERROR_VHOST_VRING_ERR,
+"vhost reported failure through vring "
+"error fd");
  }
  }
  
diff --git a/monitor/monitor.c b/monitor/monitor.c

index 941f87815a..cb1ee31156 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -313,6 +313,7 @@ static MonitorQAPIEventConf 
monitor_qapi_event_conf[QAPI_EVENT__MAX] = {
  [QAPI_EVENT_BALLOON_CHANGE]= { 1000 * SCALE_MS },
  [QAPI_EVENT_QUORUM_REPORT_BAD] = { 1000 * SCALE_MS },
  [QAPI_EVENT_QUORUM_FAILURE]= { 1000 * SCALE_MS },
+[QAPI_EVENT_VIRTQUEUE_ERROR]   = { 1000 * SCALE_MS },
  [QAPI_EVENT_VSERPORT_CHANGE]   = { 1000 * SCALE_MS },
  [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS },
  };
@@ -497,6 +498,10 @@ static unsigned int qapi_event_throttle_hash(const void 
*key)
  hash += g_str_hash(qdict_get_str(evstate->data, "qom-path"));
  }
  
+if (evstate->event == QAPI_EVENT_VIRTQUEUE_ERROR) {

+hash += g_str_hash(qdict_get_str(evstate->data, "device"));
+}
+
  return hash;
  }
  
@@ -524,6 +529,11 @@ static gboolean qapi_event_throttle_equal(const void *a, const void *b)

 qdict_get_str(evb->data, "qom-path"));
  }
  
+if (eva->event == QAPI_EVENT_VIRTQUEUE_ERROR) {

+return !strcmp(qdict_get_str(eva->data, "device"),
+   qdict_get_str(evb->data, "device"));
+}
+
  return TRUE;
  }
  
diff --git a/qapi/qdev.json b/qapi/qdev.json

index 6bc5a733b8..199e21cae7 100644
--- a/qapi/qdev.json
+++ b/qapi/qdev.json
@@ -161,3 +161,28 @@
  ##
  { 'event': 'DEVICE_UNPLUG_GUEST_ERROR',
'data': { '*device': 'str', 'path': 'str' } }
+
+##
+# @VirtqueueError:
+#
+# Since: 8.2
+##
+{ 'enum': 'VirtqueueError',
+  'data': [ 'vhost-vring-err' ] }
+
+##
+# @VIRTQUEUE_ERROR:
+#
+# Emitted when a device virtqueue fails at runtime.
+#
+# @device: the device's ID if it has one
+# @path: the device's QOM path
+# @virtqueue: virtqueue index
+# @error: error identifier
+# @description: human readable description
+#
+# Since: 8.2
+##
+{ 'event': 'VIRTQUEUE_ERROR',
+ 'data': { '*device': 'str', 'path': 'str', 'virtqueue': 'int',
+'error': 'VirtqueueError', 'description': 'str'} }




[Qemu-devel] [PATCH] kvmclock: update system_time_msr address forcibly

2017-05-24 Thread Denis Plotnikov
Do an update of system_time_msr address every time before reading
the value of tsc_timestamp from guest's kvmclock page.

It should be done in a forcible manner because there is a situation
when system_time_msr has been set by kvm but qemu isn't aware of it.
This leads to updates of kvmclock_offset without respect to the guest's
kvmclock values.

The situation appears when an L2 Linux guest runs over an L1 Linux guest and
the action inducing the system_time_msr update is TPR access reporting.
Some L1 Linux guests turn off processing of TPR accesses, and when L0
gets an L2 exit induced by a TPR MSR access it doesn't enter L1 and
processes it by itself.
Thus, L1 kvm doesn't know about that TPR access happening and doesn't
exit to qemu, which in turn doesn't set the system_time_msr address.

This patch fixes this by making sure qemu knows the correct address every
time it is needed.

Signed-off-by: Denis Plotnikov 
---
 hw/i386/kvm/clock.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index e713162..035196a 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -48,11 +48,38 @@ struct pvclock_vcpu_time_info {
 uint8_tpad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
+static void update_all_system_time_msr(void)
+{
+CPUState *cpu;
+CPUX86State *env;
+struct {
+struct kvm_msrs info;
+struct kvm_msr_entry entries[1];
+} msr_data;
+int ret;
+
+msr_data.info.nmsrs = 1;
+msr_data.entries[0].index = MSR_KVM_SYSTEM_TIME;
+
+CPU_FOREACH(cpu) {
+ret = kvm_vcpu_ioctl(cpu, KVM_GET_MSRS, &msr_data);
+
+if (ret < 0) {
+fprintf(stderr, "KVM_GET_MSRS failed: %s\n", strerror(ret));
+abort();
+}
+
+assert(ret == 1);
+env = cpu->env_ptr;
+env->system_time_msr = msr_data.entries[0].data;
+}
+}
+
 static uint64_t kvmclock_current_nsec(KVMClockState *s)
 {
 CPUState *cpu = first_cpu;
 CPUX86State *env = cpu->env_ptr;
-hwaddr kvmclock_struct_pa = env->system_time_msr & ~1ULL;
+hwaddr kvmclock_struct_pa;
 uint64_t migration_tsc = env->tsc;
 struct pvclock_vcpu_time_info time;
 uint64_t delta;
@@ -60,6 +87,9 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
 uint64_t nsec_hi;
 uint64_t nsec;
 
+update_all_system_time_msr();
+kvmclock_struct_pa = env->system_time_msr & ~1ULL;
+
 if (!(env->system_time_msr & 1ULL)) {
 /* KVM clock not active */
 return 0;
-- 
2.7.4




Re: [Qemu-devel] [PATCH] kvmclock: update system_time_msr address forcibly

2017-05-24 Thread Denis Plotnikov



On 24.05.2017 17:09, Denis V. Lunev wrote:

On 05/24/2017 05:07 PM, Denis Plotnikov wrote:

Do an update of system_time_msr address every time before reading
the value of tsc_timestamp from guest's kvmclock page.

It should be done in a forcible manner because there is a situation
when system_time_msr has been set by kvm but qemu isn't aware of it.
This leads to updates of kvmclock_offset without respect to the guest's
kvmclock values.

The situation appears when an L2 Linux guest runs over an L1 Linux guest and
the action inducing the system_time_msr update is TPR access reporting.
Some L1 Linux guests turn off processing of TPR accesses, and when L0
gets an L2 exit induced by a TPR MSR access it doesn't enter L1 and
processes it by itself.
Thus, L1 kvm doesn't know about that TPR access happening and doesn't
exit to qemu, which in turn doesn't set the system_time_msr address.

This patch fixes this by making sure qemu knows the correct address every
time it is needed.

Signed-off-by: Denis Plotnikov 
---
 hw/i386/kvm/clock.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index e713162..035196a 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -48,11 +48,38 @@ struct pvclock_vcpu_time_info {
 uint8_tpad[2];
 } __attribute__((__packed__)); /* 32 bytes */

+static void update_all_system_time_msr(void)
+{
+CPUState *cpu;
+CPUX86State *env;
+struct {
+struct kvm_msrs info;
+struct kvm_msr_entry entries[1];
+} msr_data;
+int ret;
+
+msr_data.info.nmsrs = 1;
+msr_data.entries[0].index = MSR_KVM_SYSTEM_TIME;
+
+CPU_FOREACH(cpu) {
+ret = kvm_vcpu_ioctl(cpu, KVM_GET_MSRS, &msr_data);
+
+if (ret < 0) {
+fprintf(stderr, "KVM_GET_MSRS failed: %s\n", strerror(ret));
+abort();
+}
+
+assert(ret == 1);
+env = cpu->env_ptr;
+env->system_time_msr = msr_data.entries[0].data;
+}
+}
+
 static uint64_t kvmclock_current_nsec(KVMClockState *s)
 {
 CPUState *cpu = first_cpu;
 CPUX86State *env = cpu->env_ptr;
-hwaddr kvmclock_struct_pa = env->system_time_msr & ~1ULL;
+hwaddr kvmclock_struct_pa;
 uint64_t migration_tsc = env->tsc;
 struct pvclock_vcpu_time_info time;
 uint64_t delta;
@@ -60,6 +87,9 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
 uint64_t nsec_hi;
 uint64_t nsec;

+update_all_system_time_msr();
+kvmclock_struct_pa = env->system_time_msr & ~1ULL;
+

should we do this once/per guest boot?
Practically - yes. I can barely imagine that the pv_clock page address
would be changed after being set once.

But we don't know the exact moment when the guest is going to write it.
And in order not to depend on any other event, I decided to check it every
time before use, since it won't cause any performance issues because
this invocation happens on VM state changes only.


Den

 if (!(env->system_time_msr & 1ULL)) {
 /* KVM clock not active */
 return 0;




--
Best,
Denis



[Qemu-devel] [PATCH v2] kvmclock: update system_time_msr address forcibly

2017-05-26 Thread Denis Plotnikov
Do an update of system_time_msr address every time before reading
the value of tsc_timestamp from guest's kvmclock page.

There are no other code paths which ensure that qemu has an up-to-date
value of system_time_msr. So, force this update when reading the guest's
tsc_timestamp.

This bug affects nested setups which turn off TPR access interception
for L2 guests, so that the access is intercepted by L0 and doesn't
show up in L1.
The Linux bootstrap initializes kvmclock before the APIC initialization, which
causes a TPR access.
That's why on L1 guests which have TPR interception turned on for L2 the effect
of the bug is not revealed.

This patch fixes this problem by making sure qemu knows the correct
system_time_msr address every time it is needed.

Signed-off-by: Denis Plotnikov 
---
 hw/i386/kvm/clock.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..875d85f 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -61,6 +61,8 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
 uint64_t nsec_hi;
 uint64_t nsec;
 
+cpu_synchronize_state(cpu);
+
 if (!(env->system_time_msr & 1ULL)) {
 /* KVM clock not active */
 return 0;
-- 
2.7.4




[Qemu-devel] [PATCH v3] kvmclock: update system_time_msr address forcibly

2017-05-29 Thread Denis Plotnikov
Do an update of system_time_msr address every time before reading
the value of tsc_timestamp from guest's kvmclock page.

There are no other code paths which ensure that qemu has an up-to-date
value of system_time_msr. So, force this update when reading the guest's
tsc_timestamp.

This bug affects nested setups which turn off TPR access interception
for L2 guests, so that the access is intercepted by L0 and doesn't
show up in L1.
The Linux bootstrap initializes kvmclock before the APIC initialization, which
causes a TPR access.
That's why on L1 guests which have TPR interception turned on for L2 the effect
of the bug is not revealed.

This patch fixes this problem by making sure qemu knows the correct
system_time_msr address every time it is needed.

Signed-off-by: Denis Plotnikov 
---
 hw/i386/kvm/clock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 13eca37..363d1b5 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -19,6 +19,7 @@
 #include "qemu/host-utils.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
+#include "sysemu/hw_accel.h"
 #include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
@@ -69,6 +70,8 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
 uint64_t nsec_hi;
 uint64_t nsec;
 
+cpu_synchronize_state(cpu);
+
 if (!(env->system_time_msr & 1ULL)) {
 /* KVM clock not active */
 return 0;
-- 
2.7.4




[Qemu-devel] [PATCH] i386: turn off l3-cache property by default

2017-11-24 Thread Denis Plotnikov
Commit 14c985cffa "target-i386: present virtual L3 cache info for vcpus"
introduced exposing the L3 cache to the guest and enabled it by default.

The motivation behind it was that in the Linux scheduler, when waking up
a task on a sibling CPU, the task was put onto the target CPU's runqueue
directly, without sending a reschedule IPI.  Reduction in the IPI count
led to a performance gain.

However, this isn't the whole story.  Once the task is on the target
CPU's runqueue, it may have to preempt the current task on that CPU, be
it the idle task putting the CPU to sleep or just another running task.
For that a reschedule IPI will have to be issued, too. Only when the
other CPU has been running a normal task for too short a time will the
fairness constraints prevent the preemption and thus the IPI.

This boils down to the improvement being only achievable in workloads
with many actively switching tasks.  We had no access to the
(proprietary?) SAP HANA benchmark the commit referred to, but the
pattern is also reproduced with "perf bench sched messaging -g 1"
on a 1-socket, 8-core vCPU topology; indeed we see:

l3-cache    #res IPI /s    #time / 1 loops
off         560K           1.8 sec
on          40K            0.9 sec

Now there's a downside: with L3 cache the Linux scheduler is more eager
to wake up tasks on sibling CPUs, resulting in unnecessary cross-vCPU
interactions and therefore excessive halts and IPIs.  E.g. "perf bench
sched pipe -i 10" gives

l3-cache    #res IPI /s    #HLT /s    #time /10 loops
off         200 (no K)     230        0.2 sec
on          400K           330K       0.5 sec

In a more realistic test, we observe 15% degradation in VM density
(measured as the number of VMs, each running Drupal CMS serving 2 http
requests per second to its main page, with 95%-percentile response
latency under 100 ms) with l3-cache=on.

We think that the mostly-idle scenario is more common in cloud and personal
usage, and should be optimized for by default; users of highly loaded
VMs should be able to tune them themselves.

So switch l3-cache off by default, and add a compat clause for the range
of machine types where it was on.
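
For a user who does want the old behaviour back on a highly loaded VM, the
property can simply be re-enabled per guest, e.g. (illustrative command-line
fragment, not part of this patch):

    -cpu <model>,l3-cache=on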

Signed-off-by: Denis Plotnikov 
Reviewed-by: Roman Kagan 
---
 include/hw/i386/pc.h | 7 ++-
 target/i386/cpu.c| 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 087d184..1d2dcae 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -375,7 +375,12 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 .driver   = TYPE_X86_CPU,\
 .property = "x-hv-max-vps",\
 .value= "0x40",\
-},
+},\
+{\
+.driver   = TYPE_X86_CPU,\
+.property = "l3-cache",\
+.value= "on",\
+},\
 
 #define PC_COMPAT_2_9 \
 HW_COMPAT_2_9 \
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 1edcf29..95a51bd 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4154,7 +4154,7 @@ static Property x86_cpu_properties[] = {
 DEFINE_PROP_STRING("hv-vendor-id", X86CPU, hyperv_vendor_id),
 DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true),
 DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false),
-DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, true),
+DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, false),
 DEFINE_PROP_BOOL("kvm-no-smi-migration", X86CPU, kvm_no_smi_migration,
  false),
 DEFINE_PROP_BOOL("vmware-cpuid-freq", X86CPU, vmware_cpuid_freq, true),
-- 
2.7.4




[Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2018-12-05 Thread Denis Plotnikov
At the moment, the "drained section" doesn't protect the Block Driver State
from requests appearing in the vCPU threads.
This could lead to data loss because of a request coming to
an unexpected BDS.

For example, when a request comes to the IDE controller from the guest,
the controller creates a request coroutine and executes the coroutine
in the vCPU thread. If another thread (an iothread) has entered the
"drained section" on a BDS with bdrv_drained_begin, which protects the
BDS' AioContext from external requests, and has released the AioContext
(because it finished some coroutine) by the moment the request appears
at the IDE controller, then the controller acquires the AioContext
and executes its request without any respect to the entered
"drained section", producing all kinds of data inconsistency.

The patch prevents this case by putting requests from external threads into
a queue on the AioContext while the context is protected from external requests,
and executes those requests later, when the protection from external requests
is removed.

Also, the patch marks requests generated in a vCPU thread as external ones
to make use of the request postponing.

How to reproduce:
1. start vm with an ide disk and a linux guest
2. in the guest run: dd if=... of=... bs=4K count=10 oflag=direct
3. (qemu) drive_mirror "disk-name"
4. wait until block job can receive block_job_complete
5. (qemu) block_job_complete "disk-name"
6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)

Signed-off-by: Denis Plotnikov 
---
 block/block-backend.c | 31 +++
 block/io.c|  3 ++-
 dma-helpers.c |  4 ++--
 hw/block/nvme.c   |  8 
 hw/block/xen_disk.c   |  8 
 hw/ide/core.c |  6 --
 hw/scsi/scsi-disk.c   | 10 ++
 include/block/aio.h   | 37 -
 include/block/block.h |  8 +++-
 util/async.c  |  2 ++
 10 files changed, 90 insertions(+), 27 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 60d37a0c3d..10f7dd357d 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1148,6 +1148,8 @@ static int blk_check_byte_request(BlockBackend *blk, 
int64_t offset,
 return 0;
 }
 
+static void coroutine_fn blk_postpone_request(BlockBackend *blk);
+
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
unsigned int bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
@@ -1157,6 +1159,10 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, 
int64_t offset,
 
 trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
+if ((flags & BDRV_REQ_EXTERNAL)) {
+blk_postpone_request(blk);
+}
+
 ret = blk_check_byte_request(blk, offset, bytes);
 if (ret < 0) {
 return ret;
@@ -1184,6 +1190,10 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, 
int64_t offset,
 
 trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
+if ((flags & BDRV_REQ_EXTERNAL)) {
+blk_postpone_request(blk);
+}
+
 ret = blk_check_byte_request(blk, offset, bytes);
 if (ret < 0) {
 return ret;
@@ -1304,6 +1314,27 @@ static void blk_dec_in_flight(BlockBackend *blk)
 aio_wait_kick();
 }
 
+static void coroutine_fn blk_postpone_request(BlockBackend *blk)
+{
+AioContext *ctx;
+
+assert(qemu_in_coroutine());
+ctx = qemu_coroutine_get_aio_context(qemu_coroutine_self());
+
+/*
+ * Put the request into the postponed queue if
+ * external requests are not allowed currently.
+ * The request is continued when the context
+ * leaves the bdrv "drained" section allowing
+ * external requests.
+ */
+if (aio_external_disabled(ctx)) {
+blk_dec_in_flight(blk);
+qemu_co_queue_wait(&ctx->postponed_reqs, NULL);
+blk_inc_in_flight(blk);
+}
+}
+
 static void error_callback_bh(void *opaque)
 {
 struct BlockBackendAIOCB *acb = opaque;
diff --git a/block/io.c b/block/io.c
index bd9d688f8b..019da464a2 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1318,7 +1318,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild 
*child,
  * potential fallback support, if we ever implement any read flags
  * to pass through to drivers.  For now, there aren't any
  * passthrough flags.  */
-assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
+assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ |
+   BDRV_REQ_EXTERNAL)));
 
 /* Handle Copy on Read and associated serialisation */
 if (flags & BDRV_REQ_COPY_ON_READ) {
diff --git a/dma-helpers.c b/dma-helpers.c
index 2d7e02d35e..53706031c5 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -235,7 +235,7 @@ BlockAIOCB *dma_blk_read_io_func(int64_t offset, 
QEMUIOVector *iov,
  voi

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2018-12-10 Thread Denis Plotnikov


On 07.12.2018 15:26, Kevin Wolf wrote:
> Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben:
>> At the time, the "drained section" doesn't protect Block Driver State
>> from the requests appearing in the vCPU threads.
>> This could lead to the data loss because of request coming to
>> an unexpected BDS.
>>
>> For example, when a request comes to ide controller from the guest,
>> the controller creates a request coroutine and executes the coroutine
>> in the vCPU thread. If another thread(iothread) has entered the
>> "drained section" on a BDS with bdrv_drained_begin, which protects
>> BDS' AioContext from external requests, and released the AioContext
>> because of finishing some coroutine by the moment of the request
>> appearing at the ide controller, the controller acquires the AioContext
>> and executes its request without any respect to the entered
>> "drained section" producing any kinds of data inconsistency.
>>
>> The patch prevents this case by putting requests from external threads to
>> the queue on AioContext while the context is protected for external requests
>> and executes those requests later on the external requests protection 
>> removing.
In general, I agree with the comments and am going to make changes in
the patches accordingly.

Also, I'd like to ask a question below
>>
>> Also, the patch marks requests generated in a vCPU thread as external ones
>> to make use of the request postponing.
>>
>> How to reproduce:
>> 1. start vm with an ide disk and a linux guest
>> 2. in the guest run: dd if=... of=... bs=4K count=10 oflag=direct
>> 3. (qemu) drive_mirror "disk-name"
>> 4. wait until block job can receive block_job_complete
>> 5. (qemu) block_job_complete "disk-name"
>> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)
>>
>> Signed-off-by: Denis Plotnikov 
> 
> This is getting closer, but I'd like to see two more major changes:
> 
>> diff --git a/include/block/aio.h b/include/block/aio.h
>> index 0ca25dfec6..8512bda44e 100644
>> --- a/include/block/aio.h
>> +++ b/include/block/aio.h
>> @@ -19,6 +19,7 @@
>>   #include "qemu/event_notifier.h"
>>   #include "qemu/thread.h"
>>   #include "qemu/timer.h"
>> +#include "qemu/coroutine.h"
>>   
>>   typedef struct BlockAIOCB BlockAIOCB;
>>   typedef void BlockCompletionFunc(void *opaque, int ret);
>> @@ -130,6 +131,11 @@ struct AioContext {
>>   QEMUTimerListGroup tlg;
>>   
>>   int external_disable_cnt;
>> +/* Queue to store the requests coming when the context is disabled for
>> + * external requests.
>> + * Don't use a separate lock for protection relying the context lock
>> + */
>> +CoQueue postponed_reqs;
> 
> Why involve the AioContext at all? This could all be kept at the
> BlockBackend level without extending the layering violation that
> aio_disable_external() is.
> 
> BlockBackends get notified when their root node is drained, so hooking
> things up there should be as easy, if not even easier than in
> AioContext.
> 
>>   /* Number of AioHandlers without .io_poll() */
>>   int poll_disable_cnt;
>> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx,
>>*/
>>   int64_t aio_compute_timeout(AioContext *ctx);
>>   
>> +/**
>> + * aio_co_enter:
>> + * @ctx: the context to run the coroutine
>> + * @co: the coroutine to run
>> + *
>> + * Enter a coroutine in the specified AioContext.
>> + */
>> +void aio_co_enter(AioContext *ctx, struct Coroutine *co);
>> +
>>   /**
>>* aio_disable_external:
>>* @ctx: the aio context
>> @@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx);
>>*/
>>   static inline void aio_disable_external(AioContext *ctx)
>>   {
>> +aio_context_acquire(ctx);
>>   atomic_inc(&ctx->external_disable_cnt);
>> +aio_context_release(ctx);
>>   }
> 
> This acquire/release pair looks rather useless?

I'm not sure that I understand everything correctly...
but can a thread (context) try to disable external requests in another context?

> 
>> +static void run_postponed_co(void *opaque)
>> +{
>> +AioContext *ctx = (AioContext *) opaque;
>> +
>> +qemu_co_queue_restart_all(&ctx->postponed_reqs);
>> +}
>>   /**
>>* aio_enable_external:
>>* @ctx: the aio context
>> @@ -504,12 +527,17 @@ static inline voi

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2018-12-11 Thread Denis Plotnikov


On 07.12.2018 15:26, Kevin Wolf wrote:
> Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben:
>> At the time, the "drained section" doesn't protect Block Driver State
>> from the requests appearing in the vCPU threads.
>> This could lead to the data loss because of request coming to
>> an unexpected BDS.
>>
>> For example, when a request comes to ide controller from the guest,
>> the controller creates a request coroutine and executes the coroutine
>> in the vCPU thread. If another thread(iothread) has entered the
>> "drained section" on a BDS with bdrv_drained_begin, which protects
>> BDS' AioContext from external requests, and released the AioContext
>> because of finishing some coroutine by the moment of the request
>> appearing at the ide controller, the controller acquires the AioContext
>> and executes its request without any respect to the entered
>> "drained section" producing any kinds of data inconsistency.
>>
>> The patch prevents this case by putting requests from external threads to
>> the queue on AioContext while the context is protected for external requests
>> and executes those requests later on the external requests protection 
>> removing.
>>
>> Also, the patch marks requests generated in a vCPU thread as external ones
>> to make use of the request postponing.
>>
>> How to reproduce:
>> 1. start vm with an ide disk and a linux guest
>> 2. in the guest run: dd if=... of=... bs=4K count=10 oflag=direct
>> 3. (qemu) drive_mirror "disk-name"
>> 4. wait until block job can receive block_job_complete
>> 5. (qemu) block_job_complete "disk-name"
>> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)
>>
>> Signed-off-by: Denis Plotnikov 
> 
> This is getting closer, but I'd like to see two more major changes:
> 
>> diff --git a/include/block/aio.h b/include/block/aio.h
>> index 0ca25dfec6..8512bda44e 100644
>> --- a/include/block/aio.h
>> +++ b/include/block/aio.h
>> @@ -19,6 +19,7 @@
>>   #include "qemu/event_notifier.h"
>>   #include "qemu/thread.h"
>>   #include "qemu/timer.h"
>> +#include "qemu/coroutine.h"
>>   
>>   typedef struct BlockAIOCB BlockAIOCB;
>>   typedef void BlockCompletionFunc(void *opaque, int ret);
>> @@ -130,6 +131,11 @@ struct AioContext {
>>   QEMUTimerListGroup tlg;
>>   
>>   int external_disable_cnt;
>> +/* Queue to store the requests coming when the context is disabled for
>> + * external requests.
>> + * Don't use a separate lock for protection relying the context lock
>> + */
>> +CoQueue postponed_reqs;
> 
> Why involve the AioContext at all? This could all be kept at the
> BlockBackend level without extending the layering violation that
> aio_disable_external() is.
> 
> BlockBackends get notified when their root node is drained, so hooking
> things up there should be as easy, if not even easier than in
> AioContext.

Just want to make sure that I understood correctly what you meant by 
"BlockBackends get notified". Did you mean that bdrv_drain_end calls
child's role callback blk_root_drained_end by calling 
bdrv_parent_drained_end?

If that's the case, it won't work to resume postponed requests in
blk_root_drained_end, since we can't know whether external requests are
disabled for the context: the counter showing that is decreased only after
the roles' drained callbacks have finished in bdrv_do_drained_end.
Please correct me if I'm wrong.

Looking at the patch again, I think that it might be useful to keep the
requests in the structure that limits their execution and also protects
the access (context acquire/release). Although it's indeed a layering
violation, at least we can store the related parts in the same place
and later move them somewhere else alongside the request restrictor.

Denis



> 
>>   /* Number of AioHandlers without .io_poll() */
>>   int poll_disable_cnt;
>> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx,
>>*/
>>   int64_t aio_compute_timeout(AioContext *ctx);
>>   
>> +/**
>> + * aio_co_enter:
>> + * @ctx: the context to run the coroutine
>> + * @co: the coroutine to run
>> + *
>> + * Enter a coroutine in the specified AioContext.
>> + */
>> +void aio_co_enter(AioContext *ctx, struct Coroutine *co);
>> +
>>   /**
>>* aio_disable_external:
>>* @ctx: the aio context
>> @@ -491,9 +506,17 @@ int64_t aio_compute_

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2018-12-13 Thread Denis Plotnikov


On 12.12.2018 15:24, Kevin Wolf wrote:
> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>> Why involve the AioContext at all? This could all be kept at the
>>> BlockBackend level without extending the layering violation that
>>> aio_disable_external() is.
>>>
>>> BlockBackends get notified when their root node is drained, so hooking
>>> things up there should be as easy, if not even easier than in
>>> AioContext.
>>
>> Just want to make sure that I understood correctly what you meant by
>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>> child's role callback blk_root_drained_end by calling
>> bdrv_parent_drained_end?
> 
> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
> their adjustments to blk->quiesce_counter that are already there, and in
> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
> we can resume the queued requests.
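
In code, the approach suggested above would look roughly like this (a
sketch only: it assumes a hypothetical blk->queued_requests CoQueue that
blk_co_preadv()/blk_co_pwritev() park on while blk->quiesce_counter is
non-zero, and the exact qemu_co_enter_next() signature differs between
QEMU versions):

static void blk_root_drained_end(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;

    assert(blk->quiesce_counter);
    if (--blk->quiesce_counter == 0) {
        /* Wake every request coroutine postponed while the backend
         * was quiesced; each call resumes one queued coroutine. */
        while (qemu_co_enter_next(&blk->queued_requests)) {
            /* nothing, the woken coroutine re-checks the counter */
        }
    }
}
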
It sounds like it should be so, but it doesn't work that way, and here is why:
when doing mirror we may resume postponed coroutines too early, while the
underlying bs is still protected from writing, and thus we hit the
assert on write request execution in bdrv_co_write_req_prepare when
resuming the postponed coroutines.

The thing is that the bs is protected for writing before the execution of
bdrv_replace_node in mirror_exit_common, and bdrv_replace_node calls
bdrv_replace_child_noperm which, in turn, calls child->role->drained_end,
where one of the callbacks is blk_root_drained_end, which checks
if (--blk->quiesce_counter == 0) and runs the postponed requests
(coroutines) if the condition is true.

It seems that if the external requests are disabled on the context we can't
rely on anything, or we should check whether the underlying bs and its
underlying nodes are ready to receive requests, which sounds quite
complicated.
Please correct me if I still don't understand something in that routine.

Denis
>> In case if it's so, it won't work if resume postponed requests in
>> blk_root_drained_end since we can't know if external is disabled for the
>> context because the counter showing that is decreased only after roles'
>> drained callbacks are finished at bdrv_do_drained_end.
>> Please correct me if I'm wrong.
> 
> You don't need to know about the AioContext state, this is the whole
> point. blk->quiesce_counter is what tells you whether to postpone
> requests.
> 
>> Looking at the patch again, I think that it might be useful to keep the
>> requests in the structure that limits their execution and also protects
>> the access (context acquire/release) although it's indeed the layering
>> violation but at least we can store the parts related at the same place
>> and later on move somewhere else alongside the request restrictor.
> 
> You can keep everything you need in BlockBackend (and that's also where
> your code is that really postpones request).
> 
> Kevin
> 

-- 
Best,
Denis


Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2018-12-14 Thread Denis Plotnikov


On 13.12.2018 15:20, Kevin Wolf wrote:
> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>> BlockBackend level without extending the layering violation that
>>>>> aio_disable_external() is.
>>>>>
>>>>> BlockBackends get notified when their root node is drained, so hooking
>>>>> things up there should be as easy, if not even easier than in
>>>>> AioContext.
>>>>
>>>> Just want to make sure that I understood correctly what you meant by
>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>>>> child's role callback blk_root_drained_end by calling
>>>> bdrv_parent_drained_end?
>>>
>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
>>> their adjustments to blk->quiesce_counter that are already there, and in
>>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
>>> we can resume the queued requests.
>> Sounds it should be so, but it doesn't work that way and that's why:
>> when doing mirror we may resume postponed coroutines too early when the
>> underlying bs is protected from writing at and thus we encounter the
>> assert on a write request execution at bdrv_co_write_req_prepare when
>> resuming the postponed coroutines.
>>
>> The thing is that the bs is protected for writing before execution of
>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls
>> bdrv_replace_child_noperm which, in turn, calls child->role->drained_end
>> where one of the callbacks is blk_root_drained_end which check
>> if(--blk->quiesce_counter == 0) and runs the postponed requests
>> (coroutines) if the coundition is true.
> 
> Hm, so something is messed up with the drain sections in the mirror
> driver. We have:
> 
>  bdrv_drained_begin(target_bs);
>  bdrv_replace_node(to_replace, target_bs, &local_err);
>  bdrv_drained_end(target_bs);
> 
> Obviously, the intention was to keep the BlockBackend drained during
> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
> inside bdrv_replace_node() when target_bs is drained?
> 
> Looking at bdrv_replace_child_noperm(), it seems that the function has
> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
> for the parent reaches 0 for a moment because we call .drained_end for
> the old child first and .drained_begin for the new one later.
> 
> So it seems the fix would be to reverse the order and first call
> .drained_begin for the new child and then .drained_end for the old
> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too.
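
In code, that reordering might look roughly like this (a heavily simplified
sketch: the real bdrv_replace_child_noperm() also handles the parents list
and the attach/detach callbacks, which are omitted here):

static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs)
{
    BlockDriverState *old_bs = child->bs;
    int i;

    /* First quiesce the parent for the new child... */
    if (new_bs && new_bs->quiesce_counter && child->role->drained_begin) {
        for (i = 0; i < new_bs->quiesce_counter; i++) {
            child->role->drained_begin(child);
        }
    }

    child->bs = new_bs;

    /* ...and only then unquiesce it for the old child, so the parent's
     * quiesce counter never drops to zero in between. */
    if (old_bs && old_bs->quiesce_counter && child->role->drained_end) {
        for (i = 0; i < old_bs->quiesce_counter; i++) {
            child->role->drained_end(child);
        }
    }
}
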
Yes, it's true, but it's not enough...
In mirror_exit_common() we actively manipulate block driver states.
When we have replaced a node in the snippet you showed, we can't allow the
postponed coroutines to run because the block tree isn't ready to
receive the requests yet.
To be ready, we need to insert a proper block driver state into the block
backend, which is done here:

 blk_remove_bs(bjob->blk);
 blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
 blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << <<

 bs_opaque->job = NULL;

 bdrv_drained_end(src);

If the tree isn't ready and we resume the coroutines, we'll end up with
the request landing in a wrong block driver state.

So, we explicitly have to stop all activities on all the driver states
and their parents and allow the activities again when everything is ready to go.

Why explicitly? Because the block driver states may belong to different
block backends at the moment the manipulation begins.

So, it seems we need to disable all their contexts until the 
manipulation ends.

Please, correct me if I'm wrong.

> 
>> In seems that if the external requests disabled on the context we can't
>> rely on anything or should check where the underlying bs and its
>> underlying nodes are ready to receive requests which sounds quite
>> complicated.
>> Please correct me if still don't understand something in that routine.
> 
> I think the reason why reyling on aio_disable_external() works is simply
> because src is also drained, which keeps external events in the
> AioContext disabled despite the bug in draining the target node.
> 
> The bug would become apparent even with aio_disable_external() if we
> didn't drain src, or even if we just supported src and target being in
> different AioContexts.

Why don't we disable all those contexts involved until the end of the 
block device tree reconstruction?

Thanks!

Denis
> 
> Kevin
> 

-- 
Best,
Denis


Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2018-12-18 Thread Denis Plotnikov
ping ping

On 14.12.2018 14:54, Denis Plotnikov wrote:
> 
> 
> On 13.12.2018 15:20, Kevin Wolf wrote:
>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>>> BlockBackend level without extending the layering violation that
>>>>>> aio_disable_external() is.
>>>>>>
>>>>>> BlockBackends get notified when their root node is drained, so 
>>>>>> hooking
>>>>>> things up there should be as easy, if not even easier than in
>>>>>> AioContext.
>>>>>
>>>>> Just want to make sure that I understood correctly what you meant by
>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>>>>> child's role callback blk_root_drained_end by calling
>>>>> bdrv_parent_drained_end?
>>>>
>>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
>>>> their adjustments to blk->quiesce_counter that are already there, 
>>>> and in
>>>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
>>>> we can resume the queued requests.
>>> Sounds it should be so, but it doesn't work that way and that's why:
>>> when doing mirror we may resume postponed coroutines too early when the
>>> underlying bs is protected from writing at and thus we encounter the
>>> assert on a write request execution at bdrv_co_write_req_prepare when
>>> resuming the postponed coroutines.
>>>
>>> The thing is that the bs is protected for writing before execution of
>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls
>>> bdrv_replace_child_noperm which, in turn, calls child->role->drained_end
>>> where one of the callbacks is blk_root_drained_end which check
>>> if(--blk->quiesce_counter == 0) and runs the postponed requests
>>> (coroutines) if the coundition is true.
>>
>> Hm, so something is messed up with the drain sections in the mirror
>> driver. We have:
>>
>>  bdrv_drained_begin(target_bs);
>>  bdrv_replace_node(to_replace, target_bs, &local_err);
>>  bdrv_drained_end(target_bs);
>>
>> Obviously, the intention was to keep the BlockBackend drained during
>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
>> inside bdrv_replace_node() when target_bs is drained?
>>
>> Looking at bdrv_replace_child_noperm(), it seems that the function has
>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
>> for the parent reaches 0 for a moment because we call .drained_end for
>> the old child first and .drained_begin for the new one later.
>>
>> So it seems the fix would be to reverse the order and first call
>> .drained_begin for the new child and then .drained_end for the old
>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too.
> Yes, it's true, but it's not enough...
> In mirror_exit_common() we actively manipulate with block driver states.
> When we replaced a node in the snippet you showed we can't allow the 
> postponed coroutines to run because the block tree isn't ready to 
> receive the requests yet.
> To be ready, we need to insert a proper block driver state to the block 
> backend which is done here
> 
>      blk_remove_bs(bjob->blk);
>      blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
>      blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << <<
> 
>      bs_opaque->job = NULL;
> 
>      bdrv_drained_end(src);
> 
> If the tree isn't ready and we resume the coroutines, we'll end up with 
> the request landed in a wrong block driver state.
> 
> So, we explicitly should stop all activities on all the driver states
> and its parents and allow the activities when everything is ready to go.
> 
> Why explicitly, because the block driver states may belong to different 
> block backends at the moment of the manipulation beginning.
> 
> So, it seems we need to disable all their contexts until the 
> manipulation ends.
> 
> Please, correct me if I'm wrong.
> 
>>
>>> In seems that if the external requests disabled on the context we can't
>>> rely on anything or should check where the underlying bs and its
>>> underlying nodes are ready to receive requests which sounds quite
>>> complicated.
>>> Please correct me if still don't understand something in that routine.
>>
>> I think the reason why reyling on aio_disable_external() works is simply
>> because src is also drained, which keeps external events in the
>> AioContext disabled despite the bug in draining the target node.
>>
>> The bug would become apparent even with aio_disable_external() if we
>> didn't drain src, or even if we just supported src and target being in
>> different AioContexts.
> 
> Why don't we disable all those contexts involved until the end of the 
> block device tree reconstruction?
> 
> Thanks!
> 
> Denis
>>
>> Kevin
>>
> 

-- 
Best,
Denis


Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2019-01-14 Thread Denis Plotnikov
ping ping ping ping

On 09.01.2019 11:18, Denis Plotnikov wrote:
> ping ping!!!
> 
> On 18.12.2018 11:53, Denis Plotnikov wrote:
>> ping ping
>>
>> On 14.12.2018 14:54, Denis Plotnikov wrote:
>>>
>>>
>>> On 13.12.2018 15:20, Kevin Wolf wrote:
>>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>>>>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>>>>> BlockBackend level without extending the layering violation that
>>>>>>>> aio_disable_external() is.
>>>>>>>>
>>>>>>>> BlockBackends get notified when their root node is drained, so 
>>>>>>>> hooking
>>>>>>>> things up there should be as easy, if not even easier than in
>>>>>>>> AioContext.
>>>>>>>
>>>>>>> Just want to make sure that I understood correctly what you meant by
>>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>>>>>>> child's role callback blk_root_drained_end by calling
>>>>>>> bdrv_parent_drained_end?
>>>>>>
>>>>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
>>>>>> their adjustments to blk->quiesce_counter that are already there, 
>>>>>> and in
>>>>>> the 'if (--blk->quiesce_counter == 0)' block of 
>>>>>> blk_root_drained_end()
>>>>>> we can resume the queued requests.
>>>>> Sounds it should be so, but it doesn't work that way and that's why:
>>>>> when doing mirror we may resume postponed coroutines too early when 
>>>>> the
>>>>> underlying bs is protected from writing at and thus we encounter the
>>>>> assert on a write request execution at bdrv_co_write_req_prepare when
>>>>> resuming the postponed coroutines.
>>>>>
>>>>> The thing is that the bs is protected for writing before execution of
>>>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls
>>>>> bdrv_replace_child_noperm which, in turn, calls 
>>>>> child->role->drained_end
>>>>> where one of the callbacks is blk_root_drained_end which check
>>>>> if(--blk->quiesce_counter == 0) and runs the postponed requests
>>>>> (coroutines) if the coundition is true.
>>>>
>>>> Hm, so something is messed up with the drain sections in the mirror
>>>> driver. We have:
>>>>
>>>>  bdrv_drained_begin(target_bs);
>>>>  bdrv_replace_node(to_replace, target_bs, &local_err);
>>>>  bdrv_drained_end(target_bs);
>>>>
>>>> Obviously, the intention was to keep the BlockBackend drained during
>>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
>>>> inside bdrv_replace_node() when target_bs is drained?
>>>>
>>>> Looking at bdrv_replace_child_noperm(), it seems that the function has
>>>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
>>>> for the parent reaches 0 for a moment because we call .drained_end for
>>>> the old child first and .drained_begin for the new one later.
>>>>
>>>> So it seems the fix would be to reverse the order and first call
>>>> .drained_begin for the new child and then .drained_end for the old
>>>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, 
>>>> too.
>>> Yes, it's true, but it's not enough...
>>> In mirror_exit_common() we actively manipulate with block driver states.
>>> When we replaced a node in the snippet you showed we can't allow the 
>>> postponed coroutines to run because the block tree isn't ready to 
>>> receive the requests yet.
>>> To be ready, we need to insert a proper block driver state to the 
>>> block backend which is done here
>>>
>>>  blk_remove_bs(bjob->blk);
>>>  blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
>>>  blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << <<
>>>
>>>  bs_opaque->job = NULL;
>>>
>>>  bdr

[Qemu-devel] PING: [PATCH] blk: postpone request execution on a context protected with "drained section"

2019-01-17 Thread Denis Plotnikov
Kevin,

could you please take a look at my last comments?

Thanks!

Denis

On 15.01.2019 10:22, Denis Plotnikov wrote:
> ping ping ping ping
> 
> On 09.01.2019 11:18, Denis Plotnikov wrote:
>> ping ping!!!
>>
>> On 18.12.2018 11:53, Denis Plotnikov wrote:
>>> ping ping
>>>
>>> On 14.12.2018 14:54, Denis Plotnikov wrote:
>>>>
>>>>
>>>> On 13.12.2018 15:20, Kevin Wolf wrote:
>>>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>>>>>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>>>>>> BlockBackend level without extending the layering violation that
>>>>>>>>> aio_disable_external() is.
>>>>>>>>>
>>>>>>>>> BlockBackends get notified when their root node is drained, so 
>>>>>>>>> hooking
>>>>>>>>> things up there should be as easy, if not even easier than in
>>>>>>>>> AioContext.
>>>>>>>>
>>>>>>>> Just want to make sure that I understood correctly what you 
>>>>>>>> meant by
>>>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end 
>>>>>>>> calls
>>>>>>>> child's role callback blk_root_drained_end by calling
>>>>>>>> bdrv_parent_drained_end?
>>>>>>>
>>>>>>> Yes, blk_root_drained_begin/end calls are all you need. 
>>>>>>> Specifically,
>>>>>>> their adjustments to blk->quiesce_counter that are already there, 
>>>>>>> and in
>>>>>>> the 'if (--blk->quiesce_counter == 0)' block of 
>>>>>>> blk_root_drained_end()
>>>>>>> we can resume the queued requests.
>>>>>> Sounds it should be so, but it doesn't work that way and that's why:
>>>>>> when doing mirror we may resume postponed coroutines too early 
>>>>>> when the
>>>>>> underlying bs is protected from writing at and thus we encounter the
>>>>>> assert on a write request execution at bdrv_co_write_req_prepare when
>>>>>> resuming the postponed coroutines.
>>>>>>
>>>>>> The thing is that the bs is protected for writing before execution of
>>>>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls
>>>>>> bdrv_replace_child_noperm which, in turn, calls 
>>>>>> child->role->drained_end
>>>>>> where one of the callbacks is blk_root_drained_end which check
>>>>>> if(--blk->quiesce_counter == 0) and runs the postponed requests
>>>>>> (coroutines) if the coundition is true.
>>>>>
>>>>> Hm, so something is messed up with the drain sections in the mirror
>>>>> driver. We have:
>>>>>
>>>>>  bdrv_drained_begin(target_bs);
>>>>>  bdrv_replace_node(to_replace, target_bs, &local_err);
>>>>>  bdrv_drained_end(target_bs);
>>>>>
>>>>> Obviously, the intention was to keep the BlockBackend drained during
>>>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
>>>>> inside bdrv_replace_node() when target_bs is drained?
>>>>>
>>>>> Looking at bdrv_replace_child_noperm(), it seems that the function has
>>>>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
>>>>> for the parent reaches 0 for a moment because we call .drained_end for
>>>>> the old child first and .drained_begin for the new one later.
>>>>>
>>>>> So it seems the fix would be to reverse the order and first call
>>>>> .drained_begin for the new child and then .drained_end for the old
>>>>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, 
>>>>> too.
>>>> Yes, it's true, but it's not enough...
>>>> In mirror_exit_common() we actively manipulate with block driver 
>>>> states.
>>>> When we replaced a node in the snippet you showed we can't allow the 
>>>> postponed coroutines to run because the block tree isn't

Re: [Qemu-devel] PING: [PATCH] blk: postpone request execution on a context protected with "drained section"

2019-01-17 Thread Denis Plotnikov


On 17.01.2019 17:23, Kevin Wolf wrote:
> Am 17.01.2019 um 13:57 hat Denis Plotnikov geschrieben:
>> Kevin,
>>
>> could you please take a look at my last comments?
> 
> I read it, and what it told me is essentially that I need to work on it
> myself to fully understand the problem and possible acceptable solutions
> because you can't seem to find one yourself. I will, but I can't
> guarantee when I can find the time for it.
> 
> Kevin
ok. Thanks!

Denis
> 
>> On 15.01.2019 10:22, Denis Plotnikov wrote:
>>> ping ping ping ping
>>>
>>> On 09.01.2019 11:18, Denis Plotnikov wrote:
>>>> ping ping!!!
>>>>
>>>> On 18.12.2018 11:53, Denis Plotnikov wrote:
>>>>> ping ping
>>>>>
>>>>> On 14.12.2018 14:54, Denis Plotnikov wrote:
>>>>>>
>>>>>>
>>>>>> On 13.12.2018 15:20, Kevin Wolf wrote:
>>>>>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>>>>>>>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>>>>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>>>>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>>>>>>>> BlockBackend level without extending the layering violation that
>>>>>>>>>>> aio_disable_external() is.
>>>>>>>>>>>
>>>>>>>>>>> BlockBackends get notified when their root node is drained, so
>>>>>>>>>>> hooking
>>>>>>>>>>> things up there should be as easy, if not even easier than in
>>>>>>>>>>> AioContext.
>>>>>>>>>>
>>>>>>>>>> Just want to make sure that I understood correctly what you
>>>>>>>>>> meant by
>>>>>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end
>>>>>>>>>> calls
>>>>>>>>>> child's role callback blk_root_drained_end by calling
>>>>>>>>>> bdrv_parent_drained_end?
>>>>>>>>>
>>>>>>>>> Yes, blk_root_drained_begin/end calls are all you need.
>>>>>>>>> Specifically,
>>>>>>>>> their adjustments to blk->quiesce_counter that are already there,
>>>>>>>>> and in
>>>>>>>>> the 'if (--blk->quiesce_counter == 0)' block of
>>>>>>>>> blk_root_drained_end()
>>>>>>>>> we can resume the queued requests.
>>>>>>>> Sounds it should be so, but it doesn't work that way and that's why:
>>>>>>>> when doing mirror we may resume postponed coroutines too early
>>>>>>>> when the
>>>>>>>> underlying bs is protected from writing at and thus we encounter the
>>>>>>>> assert on a write request execution at bdrv_co_write_req_prepare when
>>>>>>>> resuming the postponed coroutines.
>>>>>>>>
>>>>>>>> The thing is that the bs is protected for writing before execution of
>>>>>>>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls
>>>>>>>> bdrv_replace_child_noperm which, in turn, calls
>>>>>>>> child->role->drained_end
>>>>>>>> where one of the callbacks is blk_root_drained_end which check
>>>>>>>> if(--blk->quiesce_counter == 0) and runs the postponed requests
>>>>>>>> (coroutines) if the coundition is true.
>>>>>>>
>>>>>>> Hm, so something is messed up with the drain sections in the mirror
>>>>>>> driver. We have:
>>>>>>>
>>>>>>>   bdrv_drained_begin(target_bs);
>>>>>>>   bdrv_replace_node(to_replace, target_bs, &local_err);
>>>>>>>   bdrv_drained_end(target_bs);
>>>>>>>
>>>>>>> Obviously, the intention was to keep the BlockBackend drained during
>>>>>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
>>>>>>> inside bdrv_replace_node() when target_bs is drained?
>>>>>>>
>>>>>>> Looking at bdrv_replace

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

2019-01-09 Thread Denis Plotnikov
ping ping!!!

On 18.12.2018 11:53, Denis Plotnikov wrote:
> ping ping
> 
> On 14.12.2018 14:54, Denis Plotnikov wrote:
>>
>>
>> On 13.12.2018 15:20, Kevin Wolf wrote:
>>> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>>>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>>>> BlockBackend level without extending the layering violation that
>>>>>>> aio_disable_external() is.
>>>>>>>
>>>>>>> BlockBackends get notified when their root node is drained, so 
>>>>>>> hooking
>>>>>>> things up there should be as easy, if not even easier than in
>>>>>>> AioContext.
>>>>>>
>>>>>> Just want to make sure that I understood correctly what you meant by
>>>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>>>>>> child's role callback blk_root_drained_end by calling
>>>>>> bdrv_parent_drained_end?
>>>>>
>>>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
>>>>> their adjustments to blk->quiesce_counter that are already there, 
>>>>> and in
>>>>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
>>>>> we can resume the queued requests.
>>>> It sounds like it should be so, but it doesn't work that way, and here is why:
>>>> when doing mirror we may resume the postponed coroutines too early, while the
>>>> underlying bs is still protected from writing, and thus we hit the
>>>> assert on a write request execution at bdrv_co_write_req_prepare when
>>>> resuming the postponed coroutines.
>>>>
>>>> The thing is that the bs is write-protected before the execution of
>>>> bdrv_replace_node at mirror_exit_common, and bdrv_replace_node calls
>>>> bdrv_replace_child_noperm which, in turn, calls
>>>> child->role->drained_end,
>>>> where one of the callbacks is blk_root_drained_end, which checks
>>>> if (--blk->quiesce_counter == 0) and runs the postponed requests
>>>> (coroutines) if the condition is true.
>>>
>>> Hm, so something is messed up with the drain sections in the mirror
>>> driver. We have:
>>>
>>>  bdrv_drained_begin(target_bs);
>>>  bdrv_replace_node(to_replace, target_bs, &local_err);
>>>  bdrv_drained_end(target_bs);
>>>
>>> Obviously, the intention was to keep the BlockBackend drained during
>>> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
>>> inside bdrv_replace_node() when target_bs is drained?
>>>
>>> Looking at bdrv_replace_child_noperm(), it seems that the function has
>>> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
>>> for the parent reaches 0 for a moment because we call .drained_end for
>>> the old child first and .drained_begin for the new one later.
>>>
>>> So it seems the fix would be to reverse the order and first call
>>> .drained_begin for the new child and then .drained_end for the old
>>> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too.
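For illustration, the reordering suggested above could look roughly like this (a hypothetical sketch only; the real bdrv_replace_child_noperm() also handles parent lists, permissions and more):

    /* Sketch only: keep the parent quiesced across the child replacement by
     * beginning the drain for the new child before ending it for the old one. */
    static void bdrv_replace_child_noperm_sketch(BdrvChild *child,
                                                 BlockDriverState *new_bs)
    {
        BlockDriverState *old_bs = child->bs;

        /* new child first: bump the parent's quiesce state... */
        if (new_bs && new_bs->quiesce_counter && child->role->drained_begin) {
            child->role->drained_begin(child);
        }

        child->bs = new_bs;

        /* ...and only then drop the drain held for the old child, so
         * blk->quiesce_counter never reaches 0 in between. */
        if (old_bs && old_bs->quiesce_counter && child->role->drained_end) {
            child->role->drained_end(child);
        }
    }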
>> Yes, it's true, but it's not enough...
>> In mirror_exit_common() we actively manipulate the block driver states.
>> When we have replaced a node in the snippet you showed, we can't allow the
>> postponed coroutines to run because the block tree isn't ready to
>> receive the requests yet.
>> To be ready, we need to insert the proper block driver state into the
>> block backend, which is done here
>>
>>  blk_remove_bs(bjob->blk);
>>  blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
>>  blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << <<
>>
>>  bs_opaque->job = NULL;
>>
>>  bdrv_drained_end(src);
>>
>> If the tree isn't ready and we resume the coroutines, we'll end up
>> with the requests landing in the wrong block driver state.
>>
>> So, we should explicitly stop all activity on all the driver states
>> and their parents, and allow it again only when everything is ready to go.
>>
>> Why explicitly, because the block driver states may belong to 
>> diffe

Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions

2018-08-27 Thread Denis Plotnikov

PING! PING!

On 14.08.2018 10:08, Denis Plotnikov wrote:



On 13.08.2018 19:30, Kevin Wolf wrote:

Am 13.08.2018 um 10:32 hat Denis Plotnikov geschrieben:

Ping ping!

On 16.07.2018 21:59, John Snow wrote:



On 07/16/2018 11:01 AM, Denis Plotnikov wrote:

Ping!



I never saw a reply to Stefan's question on July 2nd, did you reply
off-list?

--js

Yes, I did. I talked to Stefan about why the patch set appeared.


The rest of us still don't know the answer. I had the same question.

Kevin

Yes, that's my fault. I should have posted it earlier.

I reviewed the problem once again and came up with the following
explanation.
Indeed, if the global lock has been taken by the main thread, the vCPU
threads won't be able to execute MMIO for IDE.

But if the main thread releases the lock, then nothing will prevent the
vCPU threads from executing what they want, e.g. writing to the block
device.


This is possible while mirroring is running. Let's take a look
at the following snippet of mirror_run. This is a part of the mirroring
completion path.


     bdrv_drained_begin(bs);
     cnt = bdrv_get_dirty_count(s->dirty_bitmap);
 >>>>>>  if (cnt > 0 || mirror_flush(s) < 0) {
     bdrv_drained_end(bs);
     continue;
     }

(X) >>>>    assert(QLIST_EMPTY(&bs->tracked_requests));

mirror_flush here can yield the current coroutine, so nothing more can be
executed.
We could end up in a situation where the main loop has to iterate again to
poll for another timer/BH to process. While iterating it releases the
global lock. If a vCPU (or any other) thread is waiting for the global
lock, the waiting thread will get the lock and do what it intends.


This is something that I can observe:

mirror_flush yields the coroutine, the main loop iterates and blocks
because a vCPU was waiting for the lock. Now the vCPU thread owns the
lock and the main thread waits for the lock to be released.

The vCPU thread does cmd_write_dma and releases the lock. Then the main
thread gets the lock and continues to run, eventually proceeding with the
coroutine yielded earlier.
If the vCPU requests aren't completed by that moment, we will assert at
(X). If the vCPU requests are completed, we won't even notice that we had
some writes while in the drained section.


Denis



On 29.06.2018 15:40, Denis Plotnikov wrote:

There are cases when a request to a block driver state shouldn't have
appeared producing dangerous race conditions.
This misbehaviour usually happens with storage devices emulated
without eventfd for guest-to-host notifications, like IDE.

The issue arises when the context is in the "drained" section
and doesn't expect a request to come, but the request comes from a
device that doesn't use an iothread and whose context is processed by the
main loop.

The main loop, unlike the iothread event loop, isn't blocked by the
"drained" section.
A request coming in and being processed while in the "drained" section can
spoil the block driver state consistency.

This behavior can be observed in the following KVM-based case:

1. Setup a VM with an IDE disk.
2. Inside a VM start a disk writing load for the IDE device
     e.g: dd if= of= bs=X count=Y oflag=direct
3. On the host create a mirroring block job for the IDE device
     e.g: drive_mirror  
4. On the host finish the block job
     e.g: block_job_complete 
    Having done the 4th action, you could get an assert:
assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run.
On my setup, the assert is 1/3 reproducible.

The patch series introduces the mechanism to postpone the requests
until the BDS leaves "drained" section for the devices not using
iothreads.
Also, it modifies the asynchronous block backend infrastructure to use
that mechanism to release the assert bug for IDE devices.

Denis Plotnikov (2):
     async: add infrastructure for postponed actions
     block: postpone the coroutine executing if the BDS's is drained

    block/block-backend.c | 58 ++-
    include/block/aio.h   | 63 +++
    util/async.c  | 33 +++
    3 files changed, 142 insertions(+), 12 deletions(-)





--
Best,
Denis




--
Best,
Denis



Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions

2018-08-28 Thread Denis Plotnikov




On 27.08.2018 19:05, John Snow wrote:



On 08/27/2018 03:05 AM, Denis Plotnikov wrote:

PING! PING!



Sorry, Kevin and Stefan are both on PTO right now, I think. I can't
promise I have the time to look soon, but you at least deserve an answer
for the radio silence the last week.

--js

Thanks for the response!
I'll be waiting for some comments!

Denis



On 14.08.2018 10:08, Denis Plotnikov wrote:



On 13.08.2018 19:30, Kevin Wolf wrote:

Am 13.08.2018 um 10:32 hat Denis Plotnikov geschrieben:

Ping ping!

On 16.07.2018 21:59, John Snow wrote:



On 07/16/2018 11:01 AM, Denis Plotnikov wrote:

Ping!



I never saw a reply to Stefan's question on July 2nd, did you reply
off-list?

--js

Yes, I did. I talked to Stefan about why the patch set appeared.


The rest of us still don't know the answer. I had the same question.

Kevin

Yes, that's my fault. I should have posted it earlier.

I reviewed the problem once again and came up with the following
explanation.
Indeed, if the global lock has been taken by the main thread, the vCPU
threads won't be able to execute MMIO for IDE.
But if the main thread releases the lock, then nothing will prevent the
vCPU threads from executing what they want, e.g. writing to the block
device.

This is possible while mirroring is running. Let's take a look
at the following snippet of mirror_run. This is a part of the mirroring
completion path.

  bdrv_drained_begin(bs);
  cnt = bdrv_get_dirty_count(s->dirty_bitmap);
  >>>>>>  if (cnt > 0 || mirror_flush(s) < 0) {
  bdrv_drained_end(bs);
  continue;
  }

(X) >>>>    assert(QLIST_EMPTY(&bs->tracked_requests));

mirror_flush here can yield the current coroutine, so nothing more can
be executed.
We could end up in a situation where the main loop has to iterate again
to poll for another timer/BH to process. While iterating it releases
the global lock. If a vCPU (or any other) thread is waiting for the
global lock, the waiting thread will get the lock and do what it
intends.

This is something that I can observe:

mirror_flush yields the coroutine, the main loop iterates and blocks
because a vCPU was waiting for the lock. Now the vCPU thread owns the
lock and the main thread waits for the lock to be released.
The vCPU thread does cmd_write_dma and releases the lock. Then the main
thread gets the lock and continues to run, eventually proceeding with
the coroutine yielded earlier.
If the vCPU requests aren't completed by that moment, we will assert at
(X). If the vCPU requests are completed, we won't even notice that we
had some writes while in the drained section.

Denis



On 29.06.2018 15:40, Denis Plotnikov wrote:

There are cases when a request to a block driver state shouldn't
have
appeared producing dangerous race conditions.
This misbehaviour usually happens with storage devices emulated
without eventfd for guest-to-host notifications, like IDE.

The issue arises when the context is in the "drained" section
and doesn't expect a request to come, but the request comes from a
device that doesn't use an iothread and whose context is processed by the
main loop.

The main loop, unlike the iothread event loop, isn't blocked by the
"drained" section.
A request coming in and being processed while in the "drained" section can
spoil the block driver state consistency.

This behavior can be observed in the following KVM-based case:

1. Setup a VM with an IDE disk.
2. Inside a VM start a disk writing load for the IDE device
      e.g: dd if= of= bs=X count=Y oflag=direct
3. On the host create a mirroring block job for the IDE device
      e.g: drive_mirror  
4. On the host finish the block job
      e.g: block_job_complete 
     Having done the 4th action, you could get an assert:
assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run.
On my setup, the assert is 1/3 reproducible.

The patch series introduces the mechanism to postpone the requests
until the BDS leaves "drained" section for the devices not using
iothreads.
Also, it modifies the asynchronous block backend infrastructure
to use
that mechanism to release the assert bug for IDE devices.

Denis Plotnikov (2):
      async: add infrastructure for postponed actions
      block: postpone the coroutine executing if the BDS's is drained

     block/block-backend.c | 58 ++-
     include/block/aio.h   | 63 +++
     util/async.c  | 33 +++
     3 files changed, 142 insertions(+), 12 deletions(-)





--
Best,
Denis








--
Best,
Denis



Re: [Qemu-devel] [PATCH v1 00/17] Background snapshots

2018-09-04 Thread Denis Plotnikov

Hi Peter

I moved the code to the repository 
https://github.com/denis-plotnikov/qemu/tree/background-snapshot-kvm.
The current version includes fixes with respect to your comments for
version 1.
I moved the KVM-related patches to the end of the branch (formerly the
patch series).
Since the KVM patches and the other parts to modify (vhost and others)
are not needed in favor of the upcoming userfaultfd,
I would ask you to review the general framework, which is able to work
with TCG.


Thanks in advance!

Denis

On 20.07.2018 12:27, Peter Xu wrote:

On Wed, Jul 18, 2018 at 06:41:43PM +0300, Denis Plotnikov wrote:

The workflow to make a snapshot is the following:
1. Pause the vm
2. Make a snapshot of block devices using the scheme of your choice
3. Turn on background-snapshot migration capability
4. Start the migration using the destination (migration stream) of your choice.
The migration will resume the vm execution by itself
when it has the devices' states saved  and is ready to start ram writing
to the migration stream.
5. Listen to the migration finish event
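For illustration only, steps 3-5 could be driven over QMP roughly like this, assuming the capability is exposed under the name used above and using a hypothetical file destination:

    { "execute": "migrate-set-capabilities",
      "arguments": { "capabilities": [
          { "capability": "background-snapshot", "state": true } ] } }
    { "execute": "migrate",
      "arguments": { "uri": "exec:cat > /tmp/vmstate.img" } }
    { "execute": "query-migrate" }

Completion can then be observed via query-migrate or the MIGRATION event.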

The background snapshot works with the support of a KVM patch:
"x86: mmu: report failed memory access to the userspace"
(not applied to the mainstream yet; it's on the kvm mailing list)


Hello, Denis,

Do you mind pushing your tree to an online repository to make
review easier?

Thanks,



--
Best,
Denis



Re: [Qemu-devel] [PATCH v1 00/17] Background snapshots

2018-09-05 Thread Denis Plotnikov

Hi Peter,

Thanks for the reply.

OK, I understand about TCG.
So my only option is to wait for userfaultfd-wp.
Do you know if anyone is currently working on this? And if so, are there
any estimates of when userfaultfd-wp will be ready?


Denis


On 05.09.2018 06:32, Peter Xu wrote:

On Tue, Sep 04, 2018 at 04:00:31PM +0300, Denis Plotnikov wrote:

Hi Peter


Hi, Denis,



I moved the code to the repository
https://github.com/denis-plotnikov/qemu/tree/background-snapshot-kvm.
the current version includes fixes with respect to your comments for version
1.
I moved the KVM-related patches to the end of the branch (formerly the
patch series).
Since the KVM patches and the other parts to modify (vhost and others) are
not needed in favor of the upcoming userfaultfd,
I would ask you to review the general framework, which is able to work with
TCG.

Thanks in advance!


Thank you for pushing the tree.

I might have made a mistake before in that I thought this work was at
least working for TCG, but I think I was wrong.  The problem is (I'm
trying to repeat Dave's question that you seem to not have answered yet):
even for TCG there could be use cases where the process might access
guest memory from the kernel space (e.g., vhost, or any system call
with a guest memory buffer passed in).  I'm afraid mprotect() and
the whole signal-based mechanism cannot address these page
faults, so we'll encounter ad-hoc errors and we'll need to fix all
these places up.  Userfaultfd-wp should not have this problem.

I think the general idea of the work is good, but I'm not sure whether
we can merge the work if we don't settle these issues.

Regards,



--
Best,
Denis



Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions

2018-09-10 Thread Denis Plotnikov

PING PING!

On 28.08.2018 13:23, Denis Plotnikov wrote:



On 27.08.2018 19:05, John Snow wrote:



On 08/27/2018 03:05 AM, Denis Plotnikov wrote:

PING! PING!



Sorry, Kevin and Stefan are both on PTO right now, I think. I can't
promise I have the time to look soon, but you at least deserve an answer
for the radio silence the last week.

--js

Thanks for the response!
I'll be waiting for some comments!

Denis



On 14.08.2018 10:08, Denis Plotnikov wrote:



On 13.08.2018 19:30, Kevin Wolf wrote:

Am 13.08.2018 um 10:32 hat Denis Plotnikov geschrieben:

Ping ping!

On 16.07.2018 21:59, John Snow wrote:



On 07/16/2018 11:01 AM, Denis Plotnikov wrote:

Ping!



I never saw a reply to Stefan's question on July 2nd, did you reply
off-list?

--js

Yes, I did. I talked to Stefan about why the patch set appeared.


The rest of us still don't know the answer. I had the same question.

Kevin

Yes, that's my fault. I should have posted it earlier.

I reviewed the problem once again and came up with the following
explanation.
Indeed, if the global lock has been taken by the main thread, the vCPU
threads won't be able to execute MMIO for IDE.
But if the main thread releases the lock, then nothing will prevent the
vCPU threads from executing what they want, e.g. writing to the block
device.

This is possible while mirroring is running. Let's take a look
at the following snippet of mirror_run. This is a part of the mirroring
completion path.

  bdrv_drained_begin(bs);
  cnt = bdrv_get_dirty_count(s->dirty_bitmap);
  >>>>>>  if (cnt > 0 || mirror_flush(s) < 0) {
  bdrv_drained_end(bs);
  continue;
  }

(X) >>>>    assert(QLIST_EMPTY(&bs->tracked_requests));

mirror_flush here can yield the current coroutine, so nothing more can
be executed.
We could end up in a situation where the main loop has to iterate again
to poll for another timer/BH to process. While iterating it releases
the global lock. If a vCPU (or any other) thread is waiting for the
global lock, the waiting thread will get the lock and do what it
intends.

This is something that I can observe:

mirror_flush yields the coroutine, the main loop iterates and blocks
because a vCPU was waiting for the lock. Now the vCPU thread owns the
lock and the main thread waits for the lock to be released.
The vCPU thread does cmd_write_dma and releases the lock. Then the main
thread gets the lock and continues to run, eventually proceeding with
the coroutine yielded earlier.
If the vCPU requests aren't completed by that moment, we will assert at
(X). If the vCPU requests are completed, we won't even notice that we
had some writes while in the drained section.

Denis



On 29.06.2018 15:40, Denis Plotnikov wrote:

There are cases when a request to a block driver state shouldn't
have
appeared producing dangerous race conditions.
This misbehaviour usually happens with storage devices emulated
without eventfd for guest-to-host notifications, like IDE.

The issue arises when the context is in the "drained" section
and doesn't expect a request to come, but the request comes from a
device that doesn't use an iothread and whose context is processed by the
main loop.

The main loop, unlike the iothread event loop, isn't blocked by the
"drained" section.
A request coming in and being processed while in the "drained" section can
spoil the block driver state consistency.

This behavior can be observed in the following KVM-based case:

1. Setup a VM with an IDE disk.
2. Inside a VM start a disk writing load for the IDE device
      e.g: dd if= of= bs=X count=Y oflag=direct
3. On the host create a mirroring block job for the IDE device
      e.g: drive_mirror  
4. On the host finish the block job
      e.g: block_job_complete 
     Having done the 4th action, you could get an assert:
assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run.
On my setup, the assert is 1/3 reproducible.

The patch series introduces the mechanism to postpone the requests
until the BDS leaves "drained" section for the devices not using
iothreads.
Also, it modifies the asynchronous block backend infrastructure
to use
that mechanism to release the assert bug for IDE devices.

Denis Plotnikov (2):
      async: add infrastructure for postponed actions
      block: postpone the coroutine executing if the BDS's is drained


     block/block-backend.c | 58 ++-
     include/block/aio.h   | 63 +++
     util/async.c  | 33 +++
     3 files changed, 142 insertions(+), 12 deletions(-)





--
Best,
Denis










--
Best,
Denis



Re: [Qemu-devel] [PATCH v0 2/2] block: postpone the coroutine executing if the BDS's is drained

2018-09-12 Thread Denis Plotnikov




On 10.09.2018 15:41, Kevin Wolf wrote:

Am 29.06.2018 um 14:40 hat Denis Plotnikov geschrieben:

Fixes the problem of ide request appearing when the BDS is in
the "drained section".

Without the patch the request can come and be processed by the main
event loop, as the ide requests are processed by the main event loop
and the main event loop doesn't stop when its context is in the
"drained section".
The request execution is postponed until the end of "drained section".

The patch doesn't modify ide specific code, as well as any other
device code. Instead, it modifies the infrastructure of asynchronous
Block Backend requests, in favor of postponing the requests arisen
when in "drained section" to remove the possibility of request appearing
for all the infrastructure clients.

This approach doesn't make vCPU processing the request wait untill
the end of request processing.

Signed-off-by: Denis Plotnikov 


I generally agree with the idea that requests should be queued during a
drained section. However, I think there are a few fundamental problems
with the implementation in this series:

1) aio_disable_external() is already a layering violation and we'd like
to get rid of it (by replacing it with a BlockDevOps callback from
BlockBackend to the devices), so adding more functionality there
feels like a step in the wrong direction.

2) Only blk_aio_* are fixed, while we also have synchronous public
interfaces (blk_pread/pwrite) as well as coroutine-based ones
(blk_co_*). They need to be postponed as well.

Good point! Thanks!


blk_co_preadv/pwritev() are the common point in the call chain for
all of these variants, so this is where the fix needs to live.
Using the common point might be a good idea, but in case aio requests we 
also have to mane completions which out of the scope of 
blk_co_p(read|write)v:


static void blk_aio_write_entry(void *opaque) {
...
rwco->ret = blk_co_pwritev(...);

blk_aio_complete(acb);
...
}

This makes the difference.
I would suggest making the synchronous read/write path wait in blk_prw
until "drained_end" is done.
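A minimal sketch of that suggestion (hypothetical helper names; the real blk_prw() submits a coroutine and polls for its completion):

    /* Sketch only: synchronous requests don't get queued at all; they simply
     * wait, outside of coroutine context, until the drained section ends. */
    static void blk_prw_wait_for_drained_end_sketch(BlockBackend *blk)
    {
        while (blk_in_drained_section(blk)) {          /* hypothetical predicate */
            aio_poll(blk_get_aio_context(blk), true);  /* lets drained_end() run */
        }
    }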



3) Within a drained section, you want requests from other users to be
blocked, but not your own ones (essentially you want exclusive
access). We don't have blk_drained_begin/end() yet, so this is not
something to implement right now, but let's keep this requirement in
mind and choose a design that allows this.
There is an idea to distinguish the requests that should be done without 
respect to "drained section" by using a flag in BdrvRequestFlags. The 
requests with a flag set should be processed anyway.


I believe the whole logic should be kept local to BlockBackend, and
blk_root_drained_begin/end() should be the functions that start queuing
requests or let queued requests resume.

As we are already in coroutine context in blk_co_preadv/pwritev(), after
checking that blk->quiesce_counter > 0, we can enter the coroutine
object into a list and yield. blk_root_drained_end() calls aio_co_wake()
for each of the queued coroutines. This should be all that we need to
manage.
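A minimal sketch of that design (hypothetical field and helper names, error handling omitted; the real QEMU coroutine primitives are used as described):

    /* Sketch only: park coroutines arriving during a drained section and wake
     * them from blk_root_drained_end(). */
    typedef struct BlockBackendSketch {
        int quiesce_counter;
        unsigned int in_flight;                  /* in-flight request count, as in BlockBackend */
        CoQueue queued_requests;                 /* hypothetical extra field */
    } BlockBackendSketch;

    static void coroutine_fn blk_wait_while_drained_sketch(BlockBackendSketch *blk)
    {
        while (blk->quiesce_counter) {
            /* called from blk_co_preadv/pwritev() before doing any I/O */
            qemu_co_queue_wait(&blk->queued_requests, NULL);
        }
    }

    static void blk_root_drained_end_sketch(BlockBackendSketch *blk)
    {
        if (--blk->quiesce_counter == 0) {
            /* resume everything queued during the drained section */
            qemu_co_queue_restart_all(&blk->queued_requests);
        }
    }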
In my understanding, by using bdrv_drained_begin/end we want to protect a
certain BlockDriverState from external access, but not the whole
BlockBackend, which may involve a number of BlockDriverState-s.
I think so because we could possibly change a backing file for some
BlockDriverState. And for the time of the change we need to prevent
external access to it but keep the I/O going.
By using blk_root_drained_begin/end() we put into the "drained section" all
the BlockDriverState-s linked to that root.

Does it have to be so?

Denis



Kevin



--
Best,
Denis



Re: [Qemu-devel] [PATCH v0 2/2] block: postpone the coroutine executing if the BDS's is drained

2018-09-12 Thread Denis Plotnikov




On 12.09.2018 16:15, Kevin Wolf wrote:

Am 12.09.2018 um 14:03 hat Denis Plotnikov geschrieben:

On 10.09.2018 15:41, Kevin Wolf wrote:

Am 29.06.2018 um 14:40 hat Denis Plotnikov geschrieben:

Fixes the problem of ide request appearing when the BDS is in
the "drained section".

Without the patch the request can come and be processed by the main
event loop, as the ide requests are processed by the main event loop
and the main event loop doesn't stop when its context is in the
"drained section".
The request execution is postponed until the end of "drained section".

The patch doesn't modify ide specific code, as well as any other
device code. Instead, it modifies the infrastructure of asynchronous
Block Backend requests, in favor of postponing the requests arisen
when in "drained section" to remove the possibility of request appearing
for all the infrastructure clients.

This approach doesn't make vCPU processing the request wait untill
the end of request processing.

Signed-off-by: Denis Plotnikov 


I generally agree with the idea that requests should be queued during a
drained section. However, I think there are a few fundamental problems
with the implementation in this series:

1) aio_disable_external() is already a layering violation and we'd like
 to get rid of it (by replacing it with a BlockDevOps callback from
 BlockBackend to the devices), so adding more functionality there
 feels like a step in the wrong direction.

2) Only blk_aio_* are fixed, while we also have synchronous public
 interfaces (blk_pread/pwrite) as well as coroutine-based ones
 (blk_co_*). They need to be postponed as well.

Good point! Thanks!


 blk_co_preadv/pwritev() are the common point in the call chain for
 all of these variants, so this is where the fix needs to live.

Using the common point might be a good idea, but in case aio requests we
also have to mane completions which out of the scope of
blk_co_p(read|write)v:


I don't understand what you mean here (possibly because I fail to
understand the word "mane") and what completions have to do with

mane = make

queueing of requests.

Just to clarify, we are talking about the following situation, right?
bdrv_drain_all_begin() has returned, so all the old requests have
already been drained and their completion callback has already been
called. For any new requests that come in, we need to queue them until
the drained section ends. In other words, they won't reach the point
where they could possibly complete before .drained_end.

Yes

To make it clear: I'm trying to defend the idea that putting the
postponing routine in blk_co_preadv/pwritev is not the best choice, and
here is why:


If I understood your idea correctly, if we do the postponing inside
blk_co_p(write|read)v we don't know whether we are doing a synchronous or
an asynchronous request.
We need to know this because if we postpone an async request, then later,
when processing the postponed requests, we must make "a
completion" for that request stating that it's finally "done".


Furthermore, if we postpone sync requests, we must block the clients that
issued them until the postponed requests have been processed on leaving
the drained section. This would require an additional notification
mechanism. Instead, we can just check whether we can proceed in
blk_p(write|read) and, if not (we're drained), wait there.


We avoid the issues above if we postpone in blk_aio_prwv and wait
in blk_prw without postponing.


What do you think?




static void blk_aio_write_entry(void *opaque) {
 ...
 rwco->ret = blk_co_pwritev(...);

 blk_aio_complete(acb);
 ...
}

This makes the difference.
I would suggest making the synchronous read/write path wait in blk_prw
until "drained_end" is done.


It is possible, but then the management becomes a bit more complicated
because you have more than just a list of Coroutines that you need to
wake up.

One thing that could be problematic in blk_co_preadv/pwritev is that
blk->in_flight would count even requests that are queued if we're not
careful. Then a nested drain would deadlock because the BlockBackend
would never say that it's quiesced.
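In other words (sketch only, reusing the hypothetical helpers from the sketch above), the wait would have to happen before the request is accounted as in flight:

    /* Sketch only: wait out the drained section *before* bumping the in-flight
     * counter, otherwise a nested drain would wait forever for a request that
     * is queued but already counted. */
    static void coroutine_fn blk_co_request_start_sketch(BlockBackendSketch *blk)
    {
        blk_wait_while_drained_sketch(blk);   /* park while quiesce_counter > 0 */
        blk->in_flight++;                     /* only count once we may proceed */
    }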



3) Within a drained section, you want requests from other users to be
 blocked, but not your own ones (essentially you want exclusive
 access). We don't have blk_drained_begin/end() yet, so this is not
 something to implement right now, but let's keep this requirement in
 mind and choose a design that allows this.

There is an idea to distinguish the requests that should be done without
respect to "drained section" by using a flag in BdrvRequestFlags. The
requests with a flag set should be processed anyway.


I don't think that would work because the accesses can be nested qui

Re: [Qemu-devel] [PATCH v0 0/7] Background snapshots

2018-07-16 Thread Denis Plotnikov




On 13.07.2018 08:20, Peter Xu wrote:

On Fri, Jun 29, 2018 at 11:03:13AM +0300, Denis Plotnikov wrote:

The patch set adds the ability to make external snapshots while VM is running.

The workflow to make a snapshot is the following:
1. Pause the vm
2. Make a snapshot of block devices using the scheme of your choice
3. Turn on background-snapshot migration capability
4. Start the migration using the destination (migration stream) of your choice.
The migration will resume the vm execution by itself
when it has the devices' states saved  and is ready to start ram writing
to the migration stream.
5. Listen to the migration finish event

The feature relies on a KVM ability (not yet applied upstream) to report the faulting address.
Please find below the KVM patch snippet needed to make the patchset work:

+++ b/arch/x86/kvm/vmx.c
@@ -,X +,XX @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
  
 	vcpu->arch.exit_qualification = exit_qualification;
 
-	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+	r = kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+	if (r == -EFAULT) {
+		unsigned long hva = kvm_vcpu_gfn_to_hva(vcpu, gpa >> PAGE_SHIFT);
+
+		vcpu->run->exit_reason = KVM_EXIT_FAIL_MEM_ACCESS;
+		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+		vcpu->run->fail_mem_access.hva = hva | (gpa & (PAGE_SIZE-1));
+		r = 0;
+	}
+	return r;

The patch to KVM can be sent if the patch set is approved


Hi, Denis,

If the work will definitely require KVM to cooperate, AFAIU the thing
we normally do is that we first propose the kernel counterpart on kvm
list, then it'll be easier to review the QEMU counterpart (or, propose
both kvm/qemu changes at the same time, always the QEMU changes can be
RFC, as a reference to prove the kvm change is valid and useful).  Not
sure whether you should do this as well for this live snapshot work.

Since we might have two backends in the future, my major question for
that counterpart series would be whether we need to support both in
the future (mprotect, and userfaultfd), and the differences between
the two methods from kernel's point of view.  I would vaguely guess
that we can at least firstly have mprotect work then userfaultfd then
we can automatically choose the backend when both are provided, but I
guess that discussion might still better happen on the kvm list.  Also
I would also guess that in that work you'd better consider no-ept case
as well for Intel, even for AMD.  But not sure we can at least start a
RFC with the simplest scenario and prove its validity.

Regards,


Hi, Peter,
I think it is a good idea to go through the KVM path first.
When the discussion comes to some conclusion, further steps may become
clearer.
I'll send the patch there shortly to start the discussion.

Thanks!

Best, Denis



Re: [Qemu-devel] [PATCH v0 0/2] Postponed actions

2018-07-16 Thread Denis Plotnikov

Ping!

On 29.06.2018 15:40, Denis Plotnikov wrote:

There are cases when a request to a block driver state shouldn't have
appeared producing dangerous race conditions.
This misbehaviour usually happens with storage devices emulated
without eventfd for guest-to-host notifications, like IDE.

The issue arises when the context is in the "drained" section
and doesn't expect a request to come, but the request comes from a
device that doesn't use an iothread and whose context is processed by the
main loop.

The main loop, unlike the iothread event loop, isn't blocked by the
"drained" section.
A request coming in and being processed while in the "drained" section can
spoil the block driver state consistency.

This behavior can be observed in the following KVM-based case:

1. Setup a VM with an IDE disk.
2. Inside a VM start a disk writing load for the IDE device
   e.g: dd if= of= bs=X count=Y oflag=direct
3. On the host create a mirroring block job for the IDE device
   e.g: drive_mirror  
4. On the host finish the block job
   e.g: block_job_complete 
  
Having done the 4th action, you could get an assert:

assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run.
On my setup, the assert is 1/3 reproducible.
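For illustration only, a concrete form of steps 2-4 might look like this (all device, drive and file names here are hypothetical, not taken from the report above):

    # inside the guest
    dd if=/dev/zero of=/mnt/test.dat bs=1M count=1024 oflag=direct

    # on the host, in the HMP monitor
    (qemu) drive_mirror ide0-hd0 /var/tmp/mirror.qcow2
    (qemu) block_job_complete ide0-hd0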

The patch series introduces the mechanism to postpone the requests
until the BDS leaves "drained" section for the devices not using iothreads.
Also, it modifies the asynchronous block backend infrastructure to use
that mechanism to release the assert bug for IDE devices.

Denis Plotnikov (2):
   async: add infrastructure for postponed actions
   block: postpone the coroutine executing if the BDS's is drained

  block/block-backend.c | 58 ++-
  include/block/aio.h   | 63 +++
  util/async.c  | 33 +++
  3 files changed, 142 insertions(+), 12 deletions(-)



--
Best,
Denis



Re: [Qemu-devel] [Qemu-block] [PATCH v0 0/2] Postponed actions

2018-07-18 Thread Denis Plotnikov




On 16.07.2018 21:59, John Snow wrote:



On 07/16/2018 11:01 AM, Denis Plotnikov wrote:

Ping!



I never saw a reply to Stefan's question on July 2nd, did you reply
off-list?
For some reason, there are no replies from Stefan on my server. I found it
on the web. Will respond to it shortly.


Thanks!

Denis


--js


On 29.06.2018 15:40, Denis Plotnikov wrote:

There are cases when a request to a block driver state shouldn't have
appeared producing dangerous race conditions.
This misbehaviour usually happens with storage devices emulated
without eventfd for guest-to-host notifications, like IDE.

The issue arises when the context is in the "drained" section
and doesn't expect a request to come, but the request comes from a
device that doesn't use an iothread and whose context is processed by the
main loop.

The main loop, unlike the iothread event loop, isn't blocked by the
"drained" section.
A request coming in and being processed while in the "drained" section can
spoil the block driver state consistency.

This behavior can be observed in the following KVM-based case:

1. Setup a VM with an IDE disk.
2. Inside a VM start a disk writing load for the IDE device
    e.g: dd if= of= bs=X count=Y oflag=direct
3. On the host create a mirroring block job for the IDE device
    e.g: drive_mirror  
4. On the host finish the block job
    e.g: block_job_complete 
   Having done the 4th action, you could get an assert:
assert(QLIST_EMPTY(&bs->tracked_requests)) from mirror_run.
On my setup, the assert is 1/3 reproducible.

The patch series introduces the mechanism to postpone the requests
until the BDS leaves "drained" section for the devices not using
iothreads.
Also, it modifies the asynchronous block backend infrastructure to use
that mechanism to release the assert bug for IDE devices.

Denis Plotnikov (2):
    async: add infrastructure for postponed actions
    block: postpone the coroutine executing if the BDS's is drained

   block/block-backend.c | 58 ++-
   include/block/aio.h   | 63 +++
   util/async.c  | 33 +++
   3 files changed, 142 insertions(+), 12 deletions(-)





--
Best,
Denis



[Qemu-devel] [PATCH v1 09/17] background snapshot: extend RAM request for holding a page copy pointer

2018-07-18 Thread Denis Plotnikov
This pointer is going to be used to transfer a memory page.
Once the memory page is copied, the content the snapshot is interested in
is saved for writing and we can make the page writable again.

Signed-off-by: Denis Plotnikov 
---
 migration/ram.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index ce3dead932..dc7dfe0726 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -188,6 +188,7 @@ struct RAMSrcPageRequest {
     RAMBlock *rb;
     hwaddr offset;
     hwaddr len;
+    void *page_copy;
 
     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 };
@@ -265,6 +266,8 @@ struct PageSearchStatus {
     unsigned long page;
     /* Set once we wrap around */
     bool complete_round;
+    /* Pointer to the cached page */
+    void *page_copy;
 };
 typedef struct PageSearchStatus PageSearchStatus;
 
-- 
2.17.0
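For illustration, a hypothetical sketch of how such a request could be filled in later in the series (the helper below is not part of this patch; names mirror the struct above):

    /* Sketch only: attach a private copy of the page content to the request,
     * so the original page can be made writable again right away. */
    static void ram_fill_page_request_sketch(struct RAMSrcPageRequest *req,
                                             RAMBlock *rb, hwaddr offset,
                                             hwaddr len, void *host_page)
    {
        req->rb = rb;
        req->offset = offset;
        req->len = len;
        req->page_copy = g_memdup(host_page, len);  /* content frozen for the snapshot */
    }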




[Qemu-devel] [PATCH v1 15/17] kvm: add vCPU failed memory access processing

2018-07-18 Thread Denis Plotnikov
This is done with the support of the KVM patch returning the faulting address.

Signed-off-by: Denis Plotnikov 
---
 target/i386/kvm.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 3ac5302bc5..55b8860d1a 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -45,6 +45,8 @@
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "migration/blocker.h"
+#include "migration/savevm.h"
+#include "migration/ram.h"
 #include "exec/memattrs.h"
 #include "trace.h"
 
@@ -3130,6 +3132,18 @@ static bool host_supports_vmx(void)
     return ecx & CPUID_EXT_VMX;
 }
 
+static int kvm_handle_fail_mem_access(CPUState *cpu)
+{
+    struct kvm_run *run = cpu->kvm_run;
+    int ret = ram_process_page_fault((void *)run->fail_mem_access.hva);
+
+    if (ret >= 0) {
+        cpu_resume(cpu);
+    }
+
+    return ret;
+}
+
 #define VMX_INVALID_GUEST_STATE 0x80000021
 
 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
@@ -3188,6 +3202,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
         ioapic_eoi_broadcast(run->eoi.vector);
         ret = 0;
         break;
+    case KVM_EXIT_FAIL_MEM_ACCESS:
+        ret = kvm_handle_fail_mem_access(cs);
+        break;
     default:
         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
         ret = -1;
-- 
2.17.0
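Since the KVM counterpart is not in mainline, the uapi definitions this handler relies on would look roughly as follows (a purely hypothetical sketch inferred from the code above; the exact exit-reason number is made up here):

    /* Hypothetical sketch of the uapi side assumed by this patch. */
    #define KVM_EXIT_FAIL_MEM_ACCESS 28        /* value chosen arbitrarily here */

    /* ...added to the exit-reason union in struct kvm_run: */
    struct {
            __u64 hva;                         /* faulting host virtual address */
    } fail_mem_access;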




[Qemu-devel] [PATCH v1 05/17] ram: extend the data structures for background snapshotting

2018-07-18 Thread Denis Plotnikov
Signed-off-by: Denis Plotnikov 
---
 include/exec/ram_addr.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 6cbc02aa0f..5b403d537d 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -36,6 +36,8 @@ struct RAMBlock {
     char idstr[256];
     /* RCU-enabled, writes protected by the ramlist lock */
     QLIST_ENTRY(RAMBlock) next;
+    /* blocks used for background snapshot */
+    QLIST_ENTRY(RAMBlock) bgs_next;
     QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
     int fd;
     size_t page_size;
@@ -49,6 +51,11 @@ struct RAMBlock {
     unsigned long *unsentmap;
     /* bitmap of already received pages in postcopy */
     unsigned long *receivedmap;
+    /* The following 2 are for background snapshot */
+    /* Pages currently being copied */
+    unsigned long *touched_map;
+    /* Pages that have been copied already */
+    unsigned long *copied_map;
 };
 
 static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
-- 
2.17.0
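For illustration, the two bitmaps would presumably be used roughly like this in later patches of the series (a hedged sketch with a hypothetical helper name):

    /* Sketch only: decide whether a write-protection fault on page_nr still
     * requires copying the page for the snapshot. */
    static bool page_needs_copy_sketch(RAMBlock *rb, unsigned long page_nr)
    {
        if (test_bit(page_nr, rb->copied_map)) {
            return false;                      /* already saved for the snapshot */
        }
        /* true only for the first thread that touches the page */
        return !test_and_set_bit(page_nr, rb->touched_map);
    }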



