[QEMU][PATCH 1/5] MAINTAINERS: Update maintainer's email for Xilinx CAN

2022-09-10 Thread Vikram Garhwal
Signed-off-by: Vikram Garhwal 
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1729c0901c..1d45e92271 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1751,8 +1751,8 @@ F: tests/qtest/intel-hda-test.c
 F: tests/qtest/fuzz-sb16-test.c
 
 Xilinx CAN
-M: Vikram Garhwal 
-M: Francisco Iglesias 
+M: Vikram Garhwal 
+M: Francisco Iglesias 
 S: Maintained
 F: hw/net/can/xlnx-*
 F: include/hw/net/xlnx-*
-- 
2.17.1




[QEMU][PATCH 2/5] hw/net/can: Introduce Xilinx Versal CANFD controller

2022-09-10 Thread Vikram Garhwal
The Xilinx Versal CANFD controller is developed based on SocketCAN, QEMU CAN bus
implementation. Bus connection and socketCAN connection for each CAN module
can be set through command lines.

Example for using connecting CANFD0 and CANFD1 to same bus:
-machine xlnx-versal-virt
-object can-bus,id=canbus
-machine canbus0=canbus
-machine canbus1=canbus

To create virtual CAN on the host machine, please check the QEMU CAN docs:
https://github.com/qemu/qemu/blob/master/docs/can.txt

Signed-off-by: Vikram Garhwal 
---
 hw/net/can/meson.build |1 +
 hw/net/can/trace-events|7 +
 hw/net/can/xlnx-versal-canfd.c | 2157 
 include/hw/net/xlnx-versal-canfd.h |   92 ++
 4 files changed, 2257 insertions(+)
 create mode 100644 hw/net/can/xlnx-versal-canfd.c
 create mode 100644 include/hw/net/xlnx-versal-canfd.h

diff --git a/hw/net/can/meson.build b/hw/net/can/meson.build
index 8fabbd9ee6..8d85201cb0 100644
--- a/hw/net/can/meson.build
+++ b/hw/net/can/meson.build
@@ -5,3 +5,4 @@ softmmu_ss.add(when: 'CONFIG_CAN_PCI', if_true: 
files('can_mioe3680_pci.c'))
 softmmu_ss.add(when: 'CONFIG_CAN_CTUCANFD', if_true: files('ctucan_core.c'))
 softmmu_ss.add(when: 'CONFIG_CAN_CTUCANFD_PCI', if_true: files('ctucan_pci.c'))
 softmmu_ss.add(when: 'CONFIG_XLNX_ZYNQMP', if_true: files('xlnx-zynqmp-can.c'))
+softmmu_ss.add(when: 'CONFIG_XLNX_VERSAL', if_true: 
files('xlnx-versal-canfd.c'))
diff --git a/hw/net/can/trace-events b/hw/net/can/trace-events
index 8346a98ab5..de64ac1b31 100644
--- a/hw/net/can/trace-events
+++ b/hw/net/can/trace-events
@@ -7,3 +7,10 @@ xlnx_can_filter_mask_pre_write(uint8_t filter_num, uint32_t 
value) "Filter%d MAS
 xlnx_can_tx_data(uint32_t id, uint8_t dlc, uint8_t db0, uint8_t db1, uint8_t 
db2, uint8_t db3, uint8_t db4, uint8_t db5, uint8_t db6, uint8_t db7) "Frame: 
ID: 0x%08x DLC: 0x%02x DATA: 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 
0x%02x"
 xlnx_can_rx_data(uint32_t id, uint32_t dlc, uint8_t db0, uint8_t db1, uint8_t 
db2, uint8_t db3, uint8_t db4, uint8_t db5, uint8_t db6, uint8_t db7) "Frame: 
ID: 0x%08x DLC: 0x%02x DATA: 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 
0x%02x"
 xlnx_can_rx_discard(uint32_t status) "Controller is not enabled for bus 
communication. Status Register: 0x%08x"
+
+# xlnx-versal-canfd.c
+xlnx_canfd_update_irq(char *path, uint32_t isr, uint32_t ier, uint32_t irq) 
"%s: ISR: 0x%08x IER: 0x%08x IRQ: 0x%08x"
+xlnx_canfd_rx_fifo_filter_reject(char *path, uint32_t id, uint8_t dlc) "%s: 
Frame: ID: 0x%08x DLC: 0x%02x"
+xlnx_canfd_rx_data(char *path, uint32_t id, uint8_t dlc, uint8_t flags) "%s: 
Frame: ID: 0x%08x DLC: 0x%02x CANFD Flag: 0x%02x"
+xlnx_canfd_tx_data(char *path, uint32_t id, uint8_t dlc, uint8_t flgas) "%s: 
Frame: ID: 0x%08x DLC: 0x%02x CANFD Flag: 0x%02x"
+xlnx_canfd_reset(char *path, uint32_t val) "%s: Resetting controller with 
value = 0x%08x"
diff --git a/hw/net/can/xlnx-versal-canfd.c b/hw/net/can/xlnx-versal-canfd.c
new file mode 100644
index 00..7cfc8fee12
--- /dev/null
+++ b/hw/net/can/xlnx-versal-canfd.c
@@ -0,0 +1,2157 @@
+/*
+ * QEMU model of the Xilinx Versal CANFD device.
+ *
+ * This implementation is based on the following datasheet:
+ * https://docs.xilinx.com/v/u/2.0-English/pg223-canfd
+ *
+ * Copyright (c) 2022 AMD Inc.
+ *
+ * Written-by: Vikram Garhwal
+ *
+ * Based on QEMU CANFD Device emulation implemented by Jin Yang, Deniz Eren and
+ * Pavel Pisa
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/sysbus.h"
+#include "hw/irq.h"
+#include "hw/register.h"
+#include "qapi/error.h"
+#include "qemu/bitops.h"
+#include "qemu/log.h"
+#include "qemu/cutils.h"
+#include "qemu/event_notifier.h"
+#include "hw/qdev-properties.h"
+#include "qom/object_interfaces.h"
+#include "migration/vmstate.h"
+#include "hw/net/xlnx-versal-canfd.h"
+#include "trace.h"
+
+/* To avoid the build issues on Win

[QEMU][PATCH 3/5] xlnx-zynqmp: Connect Xilinx VERSAL CANFD controllers

2022-09-10 Thread Vikram Garhwal
Connect CANFD0 and CANFD1 on the Versal-virt machine.

Signed-off-by: Vikram Garhwal 
---
 hw/arm/xlnx-versal-virt.c| 45 
 hw/arm/xlnx-versal.c | 37 +
 include/hw/arm/xlnx-versal.h | 12 ++
 3 files changed, 94 insertions(+)

diff --git a/hw/arm/xlnx-versal-virt.c b/hw/arm/xlnx-versal-virt.c
index 37fc9b919c..062e6f2a95 100644
--- a/hw/arm/xlnx-versal-virt.c
+++ b/hw/arm/xlnx-versal-virt.c
@@ -40,9 +40,11 @@ struct VersalVirt {
 uint32_t clk_25Mhz;
 uint32_t usb;
 uint32_t dwc;
+uint32_t canfd[2];
 } phandle;
 struct arm_boot_info binfo;
 
+CanBusState *canbus[XLNX_VERSAL_NR_CANFD];
 struct {
 bool secure;
 } cfg;
@@ -235,6 +237,34 @@ static void fdt_add_uart_nodes(VersalVirt *s)
 }
 }
 
+static void fdt_add_canfd_nodes(VersalVirt *s)
+{
+uint64_t addrs[] = { MM_CANFD0, MM_CANFD1 };
+uint32_t size[] = {MM_CANFD0_SIZE, MM_CANFD1_SIZE};
+unsigned int irqs[] = { VERSAL_CANFD0_IRQ_0, VERSAL_CANFD1_IRQ_0 };
+const char compat[] = "xlnx,versal-canfd";
+int i;
+
+/* Create and connect CANFD0 and CANFD1 nodes to canbus0. */
+for (i = 0; i < ARRAY_SIZE(addrs); i++) {
+char *name = g_strdup_printf("/canfd@%" PRIx64, addrs[i]);
+qemu_fdt_add_subnode(s->fdt, name);
+qemu_fdt_setprop_cell(s->fdt, name, "rx-fifo0", 0x40);
+qemu_fdt_setprop_cell(s->fdt, name, "enable-rx-fifo1", 0x1);
+qemu_fdt_setprop_cell(s->fdt, name, "rx-fifo1", 0x40);
+
+qemu_fdt_setprop_cells(s->fdt, name, "interrupts",
+   GIC_FDT_IRQ_TYPE_SPI, irqs[i],
+   GIC_FDT_IRQ_FLAGS_LEVEL_HI);
+qemu_fdt_setprop_sized_cells(s->fdt, name, "reg",
+ 2, addrs[i], 2, size[i]);
+qemu_fdt_setprop(s->fdt, name, "compatible",
+ compat, sizeof(compat));
+
+g_free(name);
+}
+}
+
 static void fdt_add_fixed_link_nodes(VersalVirt *s, char *gemname,
  uint32_t phandle)
 {
@@ -639,12 +669,17 @@ static void versal_virt_init(MachineState *machine)
 TYPE_XLNX_VERSAL);
 object_property_set_link(OBJECT(&s->soc), "ddr", OBJECT(machine->ram),
  &error_abort);
+object_property_set_link(OBJECT(&s->soc), "canbus0", OBJECT(s->canbus[0]),
+ &error_abort);
+object_property_set_link(OBJECT(&s->soc), "canbus1", OBJECT(s->canbus[1]),
+ &error_abort);
 sysbus_realize(SYS_BUS_DEVICE(&s->soc), &error_fatal);
 
 fdt_create(s);
 create_virtio_regions(s);
 fdt_add_gem_nodes(s);
 fdt_add_uart_nodes(s);
+fdt_add_canfd_nodes(s);
 fdt_add_gic_nodes(s);
 fdt_add_timer_nodes(s);
 fdt_add_zdma_nodes(s);
@@ -712,6 +747,16 @@ static void versal_virt_init(MachineState *machine)
 
 static void versal_virt_machine_instance_init(Object *obj)
 {
+VersalVirt *s = XLNX_VERSAL_VIRT_MACHINE(obj);
+
+object_property_add_link(obj, "canbus0", TYPE_CAN_BUS,
+ (Object **)&s->canbus[0],
+ object_property_allow_set_link,
+ 0);
+object_property_add_link(obj, "canbus1", TYPE_CAN_BUS,
+ (Object **)&s->canbus[1],
+ object_property_allow_set_link,
+ 0);
 }
 
 static void versal_virt_machine_class_init(ObjectClass *oc, void *data)
diff --git a/hw/arm/xlnx-versal.c b/hw/arm/xlnx-versal.c
index 57276e1506..452f433eb4 100644
--- a/hw/arm/xlnx-versal.c
+++ b/hw/arm/xlnx-versal.c
@@ -185,6 +185,38 @@ static void versal_create_uarts(Versal *s, qemu_irq *pic)
 }
 }
 
+static void versal_create_canfds(Versal *s, qemu_irq *pic)
+{
+int i;
+
+for (i = 0; i < ARRAY_SIZE(s->lpd.iou.canfd); i++) {
+static const int irqs[] = { VERSAL_CANFD0_IRQ_0, VERSAL_CANFD1_IRQ_0};
+static const uint64_t addrs[] = { MM_CANFD0, MM_CANFD1 };
+char *name = g_strdup_printf("canfd%d", i);
+DeviceState *dev;
+MemoryRegion *mr;
+
+object_initialize_child(OBJECT(s), name, &s->lpd.iou.canfd[i],
+TYPE_XILINX_CANFD);
+dev = DEVICE(&s->lpd.iou.canfd[i]);
+
+object_property_set_int(OBJECT(&s->lpd.iou.canfd[i]), "ext_clk_freq",
+XLNX_VERSAL_CANFD_REF_CLK , &error_abort);
+
+object_property_set_link(OBJECT(dev), "canfdbus",
+ OBJECT(s->lpd.iou.canbus[i]),
+ &error_abort);
+
+sysbus_realize(SYS_BUS_DEVICE(dev), &error_fatal);
+
+mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0);
+memory_region_add_subregion(&s->mr_ps, addrs[i], mr);
+
+sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, pic[irqs[i]]);
+   

[QEMU][PATCH 4/5] tests/qtest: Introduce tests for Xilinx VERSAL CANFD controller

2022-09-10 Thread Vikram Garhwal
The QTests perform three tests on the Xilinx VERSAL CANFD controller:
Tests the CANFD controllers in loopback.
Tests the CANFD controllers in normal mode with CAN frame.
Tests the CANFD controllers in normal mode with CANFD frame.

Signed-off-by: Vikram Garhwal 
---
 tests/qtest/meson.build   |   1 +
 tests/qtest/xlnx-canfd-test.c | 421 ++
 2 files changed, 422 insertions(+)
 create mode 100644 tests/qtest/xlnx-canfd-test.c

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index e910cb32ca..c3802fd788 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -217,6 +217,7 @@ qtests_aarch64 = \
   (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? 
['tpm-tis-device-test'] : []) +\
   (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? 
['tpm-tis-device-swtpm-test'] : []) +  \
   (config_all_devices.has_key('CONFIG_XLNX_ZYNQMP_ARM') ? ['xlnx-can-test', 
'fuzz-xlnx-dp-test'] : []) + \
+  (config_all_devices.has_key('CONFIG_XLNX_VERSAL') ? ['xlnx-canfd-test'] : 
[]) + \
   ['arm-cpu-features',
'numa-test',
'boot-serial-test',
diff --git a/tests/qtest/xlnx-canfd-test.c b/tests/qtest/xlnx-canfd-test.c
new file mode 100644
index 00..15dc03c98c
--- /dev/null
+++ b/tests/qtest/xlnx-canfd-test.c
@@ -0,0 +1,421 @@
+/*
+ * QTests for the Xilinx Versal CANFD controller.
+ *
+ * Copyright (c) 2022 AMD Inc.
+ *
+ * Written-by: Vikram Garhwal
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest.h"
+
+/* Base address. */
+#define CANFD0_BASE_ADDR0xFF06
+#define CANFD1_BASE_ADDR0xFF07
+
+/* Register addresses. */
+#define R_SRR_OFFSET0x00
+#define R_MSR_OFFSET0x04
+#define R_FILTER_CONTROL_REGISTER   0xe0
+#define R_SR_OFFSET 0x18
+#define R_ISR_OFFSET0x1C
+#define R_IER_OFFSET0x20
+#define R_ICR_OFFSET0x24
+#define R_TX_READY_REQ_REGISTER 0x90
+#define RX_FIFO_STATUS_REGISTER 0xE8
+#define R_TXID_OFFSET   0x100
+#define R_TXDLC_OFFSET  0x104
+#define R_TXDATA1_OFFSET0x108
+#define R_TXDATA2_OFFSET0x10C
+#define R_AFMR_REGISTER00xa00
+#define R_AFIR_REGISTER00xa04
+#define R_RX0_ID_OFFSET 0x2100
+#define R_RX0_DLC_OFFSET0x2104
+#define R_RX0_DATA1_OFFSET  0x2108
+#define R_RX0_DATA2_OFFSET  0x210C
+
+/* CANFD modes. */
+#define SRR_CONFIG_MODE 0x00
+#define MSR_NORMAL_MODE 0x00
+#define MSR_LOOPBACK_MODE   (1 << 1)
+#define ENABLE_CANFD(1 << 1)
+
+/* CANFD status. */
+#define STATUS_CONFIG_MODE  (1 << 0)
+#define STATUS_NORMAL_MODE  (1 << 3)
+#define STATUS_LOOPBACK_MODE(1 << 1)
+#define ISR_TXOK(1 << 1)
+#define ISR_RXOK(1 << 4)
+
+#define ENABLE_ALL_FILTERS  0x
+#define ENABLE_ALL_INTERRUPTS   0x
+
+/* We are sending one canfd message. */
+#define TX_READY_REG_VAL0x1
+
+#define FIRST_RX_STORE_INDEX0x1
+#define STATUS_REG_MASK 0xF
+#define DLC_FD_BIT_SHIFT0x1B
+#define DLC_FD_BIT_MASK 0xF800
+#define FIFO_STATUS_READ_INDEX_MASK 0x3F
+#define FIFO_STATUS_FILL_LEVEL_MASK 0x7F00
+#define FILL_LEVEL_SHIFT0x8
+
+/* CANFD frame size ID, DLC and 16 DATA word. */
+#define CANFD_FRAME_SIZE18
+/* CAN frame size ID, DLC and 2 DATA word. */
+#define CAN_FRAME_SIZE  4
+
+/* Set the filters for CANFD controller. */
+static void enable_filters(QTestState *qts)
+{
+ const uint32_t arr_afmr[32] = { 0xb423deaa, 0xa2a40bdc, 0x1b64f486,
+ 

[QEMU][PATCH 5/5] MAINTAINERS: Include canfd tests under Xilinx CAN

2022-09-10 Thread Vikram Garhwal
Signed-off-by: Vikram Garhwal 
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1d45e92271..4bc811c237 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1756,7 +1756,7 @@ M: Francisco Iglesias 
 S: Maintained
 F: hw/net/can/xlnx-*
 F: include/hw/net/xlnx-*
-F: tests/qtest/xlnx-can-test*
+F: tests/qtest/xlnx-can*-test*
 
 EDU
 M: Jiri Slaby 
-- 
2.17.1




[QEMU][PATCH 0/5] Introduce Xilinx Versal CANFD

2022-09-10 Thread Vikram Garhwal
Hi,
This patch implements CANFD controller for xlnx-versal-virt machine. There are
two controllers CANFD0@0xFF06_ and CANFD1@0xFF07_ are connected to the
machine.

Also, added basic qtests for data exchange between both the controllers in
various supported configs.

Regards,
Vikram

Vikram Garhwal (5):
  MAINTAINERS: Update maintainer's email for Xilinx CAN
  hw/net/can: Introduce Xilinx Versal CANFD controller
  xlnx-zynqmp: Connect Xilinx VERSAL CANFD controllers
  tests/qtest: Introduce tests for Xilinx VERSAL CANFD controller
  MAINTAINERS: Include canfd tests under Xilinx CAN

 MAINTAINERS|6 +-
 hw/arm/xlnx-versal-virt.c  |   45 +
 hw/arm/xlnx-versal.c   |   37 +
 hw/net/can/meson.build |1 +
 hw/net/can/trace-events|7 +
 hw/net/can/xlnx-versal-canfd.c | 2157 
 include/hw/arm/xlnx-versal.h   |   12 +
 include/hw/net/xlnx-versal-canfd.h |   92 ++
 tests/qtest/meson.build|1 +
 tests/qtest/xlnx-canfd-test.c  |  421 ++
 10 files changed, 2776 insertions(+), 3 deletions(-)
 create mode 100644 hw/net/can/xlnx-versal-canfd.c
 create mode 100644 include/hw/net/xlnx-versal-canfd.h
 create mode 100644 tests/qtest/xlnx-canfd-test.c

-- 
2.17.1




Re: [PATCH] bugfix:migrate with block-dirty-bitmap (disk size is big enough) can't be finished

2022-09-10 Thread Vladimir Sementsov-Ogievskiy

On 9/10/22 09:35, liuhaiwei wrote:

From: liuhaiwei 

bug description as  https://gitlab.com/qemu-project/qemu/-/issues/1203
Usually,we use the precopy or postcopy mode to migrate block dirty bitmap.
but if block-dirty-bitmap size more than threshold size,we cannot entry the 
migration_completion in migration_iteration_run function
To solve this problem, we can setting  the pending size to a fake 
value(threshold-1 or 0) to tell  migration_iteration_run function to entry the 
migration_completion,if pending size > threshold size




Actually, bitmaps migrate in postcopy. So, you should start postcopy for it to work (qmp 
command migrate-start-postcopy). This command simply set the boolean variable, so that in 
migration_iteration_run() we'll move to postcopy when needed. So, you can start this 
command immediately after migrate command, or even before it, but after setting the 
"dirty-bitmaps" capability.

Fake pending is a wrong thing to do, it means that you will make downtime to be 
larger than expected.

--
Best regards,
Vladimir



Re: [PATCH 1/4] qom: add devault value

2022-09-10 Thread Vladimir Sementsov-Ogievskiy

On 9/8/22 13:36, Maksim Davydov wrote:

qmp_qom_list_properties can print default values if they are available
as qmp_device_list_properties does, because both of them use the
ObjectPropertyInfo structure with default_value field. This can be useful
when working with "not device" types.

Signed-off-by: Maksim Davydov



Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir



Re: [PATCH] bugfix:migrate with block-dirty-bitmap (disk size is big enough) can't be finished

2022-09-10 Thread Vladimir Sementsov-Ogievskiy

Hi!

On 9/10/22 13:47, Seaway Liu(刘海伟) wrote:

hi,i have a question
if failed in migration using post-copy mode,is there some way to restore the 
memory data back to soucre VM?




As far as I understand, no, there is not.

Postcopy started actually means: target has started. So, RAM is touched by 
target VM process, no way to rollback.

Still, things are not so bad: when you enable dirty-bitmaps capability, but not 
postcopy-ram capability, RAM is migrated in precopy as usual. So, when target 
started, the only thing that is not yet migrated is dirty bitmap. So, in worst 
case (migration failure after postcopy started) you'll loose your dirty bitmap. 
VM is migrated and normally running on target. Unfinished bitmaps on target are 
automatically released (see cancel_incoming_locked()). So, in worst case you'll 
have to start your incremental backup chain from a new full-backup.




发自我的小米
在 Vladimir Sementsov-Ogievskiy ,2022年9月10日 下午6:18写道:

On 9/10/22 09:35, liuhaiwei wrote:

From: liuhaiwei 

bug description as  https://gitlab.com/qemu-project/qemu/-/issues/1203
Usually,we use the precopy or postcopy mode to migrate block dirty bitmap.
but if block-dirty-bitmap size more than threshold size,we cannot entry the 
migration_completion in migration_iteration_run function
To solve this problem, we can setting  the pending size to a fake 
value(threshold-1 or 0) to tell  migration_iteration_run function to entry the 
migration_completion,if pending size > threshold size




Actually, bitmaps migrate in postcopy. So, you should start postcopy for it to work (qmp 
command migrate-start-postcopy). This command simply set the boolean variable, so that in 
migration_iteration_run() we'll move to postcopy when needed. So, you can start this 
command immediately after migrate command, or even before it, but after setting the 
"dirty-bitmaps" capability.

Fake pending is a wrong thing to do, it means that you will make downtime to be 
larger than expected.

--
Best regards,
Vladimir



--
Best regards,
Vladimir



Re: [PATCH 2/4] qmp: add dump machine type compatible properties

2022-09-10 Thread Vladimir Sementsov-Ogievskiy

On 9/8/22 13:36, Maksim Davydov wrote:

To control that creating new machine type doesn't affect the previous
types (their compat_props) and to check complex compat_props inheritance
we need qmp command to print machine type compatible properties.
This patch adds the ability to get list of all the compat_props of the
corresponding supported machines for their comparison via new optional
argument of "query-machines" command.

Signed-off-by: Maksim Davydov



Reviewed-by: Vladimir Sementsov-Ogievskiy 


--
Best regards,
Vladimir



Re: [PATCH 3/4] python/qmp: increase read buffer size

2022-09-10 Thread Vladimir Sementsov-Ogievskiy

On 9/8/22 13:36, Maksim Davydov wrote:

After modification of "query-machines" command the buffer size should be
more than 452kB to contain output with compat-props.

Signed-off-by: Maksim Davydov 
---
  python/qemu/qmp/qmp_client.py | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/qemu/qmp/qmp_client.py b/python/qemu/qmp/qmp_client.py
index 5dcda04a75..659fe4d98c 100644
--- a/python/qemu/qmp/qmp_client.py
+++ b/python/qemu/qmp/qmp_client.py
@@ -197,8 +197,8 @@ async def run(self, address='/tmp/qemu.socket'):
  #: Logger object used for debugging messages.
  logger = logging.getLogger(__name__)
  
-# Read buffer limit; large enough to accept query-qmp-schema

-_limit = (256 * 1024)
+# Read buffer limit; large enough to accept query-machines
+_limit = (512 * 1024)
  
  # Type alias for pending execute() result items

  _PendingT = Union[Message, ExecInterruptedError]



Reviewed-by: Vladimir Sementsov-Ogievskiy 

This patch should better go before 02, to never break things in the history.

--
Best regards,
Vladimir



Re: [PATCH 4/4] scripts: add script to compare compatible properties

2022-09-10 Thread Vladimir Sementsov-Ogievskiy

On 9/8/22 13:36, Maksim Davydov wrote:

This script run QEMU to obtain compat_props of machines and default
values of different types and produce appropriate table. This table
can be used to compare machine types to choose the most suitable
machine. Also table in json or csv format should be used to check that
new machine doesn't affect previous ones via comparisin tables with and
without new machine.
Default values of properties are needed to fill "holes" in the table (one
machine has these properties and another not).

Notes:
* some init values from the devices can't be available like properties
from virtio-9p when configure has --disable-virtfs. This situations will
be seen in the table as "unavailable driver".
* Default values can be get can be obtained in an unobvious way, like
x86 features. If the script doesn't know how to get property default value
to compare one machine with another it fills "holes" with "unavailable
method". This is done because script uses whitelist model to get default
values of different types. It means that the method that can't be applied
to a new type that can crash this script. It is better to get an
"unavailable driver" when creating a new machine with new compatible
properties than to break this script. So it turns out a more stable and
generic script.
* If the default value can't be obtained because this property doesn't
exist or because this property can't have default value, appropriate
"hole" will be filled by "unknown property" or "no default value"
* If the property is applied to the abstract class, the script collects
default values from all child classes (set of default values)

Example:

scripts/compare_mt.py --MT pc-q35-3.1 pc-q35-2.12

╒╤═══╤═══╕
││  pc-q35-2.12  │  pc-q35-3.1   │
╞╪═══╪═══╡
│EPYC-IBPB-x86_64-cpu-xlevel │  0x800a   │  2147483678   │
├┼───┼───┤
│   EPYC-x86_64-cpu-xlevel   │  0x800a   │  2147483678   │
├┼───┼───┤
│ Skylake-Server-IBRS-x86_64-cpu-pku │ False │ True  │
├┼───┼───┤
│   Skylake-Server-x86_64-cpu-pku│ False │ True  │
├┼───┼───┤
│ VGA-global-vmstate │ True  │ False │
├┼───┼───┤
│ cirrus-vga-global-vmstate  │ True  │ False │
├┼───┼───┤
│hda-audio-use-timer │ False │ True  │
├┼───┼───┤
│  migration-decompress-error-check  │ False │ True  │
├┼───┼───┤
│   qxl-vga-global-vmstate   │ True  │ False │
├┼───┼───┤
│ vmware-svga-global-vmstate │ True  │ False │
├┼───┼───┤
│  x86_64-cpu-legacy-cache   │ True  │ [True, False] │
├┼───┼───┤
│ x86_64-cpu-topoext │ False │ [True, False] │
├┼───┼───┤
│   x86_64-cpu-x-hv-synic-kvm-only   │ True  │ False │
╘╧═══╧═══╛

Signed-off-by: Maksim Davydov 
---
  scripts/compare_mt.py | 370 ++
  1 file changed, 370 insertions(+)
  create mode 100755 scripts/compare_mt.py

diff --git a/scripts/compare_mt.py b/scripts/compare_mt.py
new file mode 100755
index 00..a063c79682
--- /dev/null
+++ b/scripts/compare_mt.py
@@ -0,0 +1,370 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) Yandex Technologies LLC, 2022
+#
+# Script to compare machine type compatible properties
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see .
+
+from tabulate import tabulate
+import json


json import unused


+import sys
+from os

Re: [PATCH] bugfix:migrate with block-dirty-bitmap (disk size is big enough) can't be finished

2022-09-10 Thread 刘海伟
hi,i have a question
if failed in migration using post-copy mode,is there some way to restore the 
memory data back to soucre VM?




发自我的小米
在 Vladimir Sementsov-Ogievskiy ,2022年9月10日 下午6:18写道:

On 9/10/22 09:35, liuhaiwei wrote:
> From: liuhaiwei 
>
> bug description as  https://gitlab.com/qemu-project/qemu/-/issues/1203
> Usually,we use the precopy or postcopy mode to migrate block dirty bitmap.
> but if block-dirty-bitmap size more than threshold size,we cannot entry the 
> migration_completion in migration_iteration_run function
> To solve this problem, we can setting  the pending size to a fake 
> value(threshold-1 or 0) to tell  migration_iteration_run function to entry 
> the migration_completion,if pending size > threshold size
>


Actually, bitmaps migrate in postcopy. So, you should start postcopy for it to 
work (qmp command migrate-start-postcopy). This command simply set the boolean 
variable, so that in migration_iteration_run() we'll move to postcopy when 
needed. So, you can start this command immediately after migrate command, or 
even before it, but after setting the "dirty-bitmaps" capability.

Fake pending is a wrong thing to do, it means that you will make downtime to be 
larger than expected.

--
Best regards,
Vladimir


[PATCH v2] hw/virtio/vhost-shadow-virtqueue: Silence GCC error "maybe-uninitialized"

2022-09-10 Thread Bernhard Beschow
GCC issues a false positive warning, resulting in build failure with -Werror:

  In file included from /usr/include/glib-2.0/glib.h:114,
   from src/include/glib-compat.h:32,
   from src/include/qemu/osdep.h:144,
   from ../src/hw/virtio/vhost-shadow-virtqueue.c:10:
  In function ‘g_autoptr_cleanup_generic_gfree’,
  inlined from ‘vhost_handle_guest_kick’ at 
../src/hw/virtio/vhost-shadow-virtqueue.c:292:42:
  /usr/include/glib-2.0/glib/glib-autocleanups.h:28:3: error: ‘elem’ may be 
used uninitialized [-Werror=maybe-uninitialized]
 28 |   g_free (*pp);
|   ^~~~
  ../src/hw/virtio/vhost-shadow-virtqueue.c: In function 
‘vhost_handle_guest_kick’:
  ../src/hw/virtio/vhost-shadow-virtqueue.c:292:42: note: ‘elem’ was declared 
here
292 | g_autofree VirtQueueElement *elem;
|  ^~~~
  cc1: all warnings being treated as errors

There is actually no problem since "elem" is initialized in both branches.
Silence the warning by initializig it with "NULL".

$ gcc --version
gcc (GCC) 12.2.0

Fixes: 9c2ab2f1ec333be8614cc12272d4b91960704dbe ("vhost: stop transfer elem 
ownership in vhost_handle_guest_kick")
Signed-off-by: Bernhard Beschow 
---
 hw/virtio/vhost-shadow-virtqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index e8e5bbc368..596d4434d2 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -289,7 +289,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue 
*svq)
 virtio_queue_set_notification(svq->vq, false);
 
 while (true) {
-g_autofree VirtQueueElement *elem;
+g_autofree VirtQueueElement *elem = NULL;
 int r;
 
 if (svq->next_guest_avail_elem) {
-- 
2.37.3




[PATCH] arm/monitor: add register name resolution

2022-09-10 Thread Nikola Brkovic
This patch allows the monitor to resolve the
stack pointer, instruction pointer,
system status register and FPU status register
on ARM targets.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1145

Signed-off-by: Nikola Brkovic 
---
 target/arm/monitor.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/target/arm/monitor.c b/target/arm/monitor.c
index 80c64fa355..143c95bca4 100644
--- a/target/arm/monitor.c
+++ b/target/arm/monitor.c
@@ -31,6 +31,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qdict.h"
 #include "qom/qom-qobject.h"
+#include "monitor/hmp-target.h"
 
 static GICCapability *gic_cap_new(int version)
 {
@@ -228,3 +229,31 @@ CpuModelExpansionInfo 
*qmp_query_cpu_model_expansion(CpuModelExpansionType type,
 
 return expansion_info;
 }
+
+static target_long monitor_get_cpsr(Monitor *mon, const struct MonitorDef *md,
+int val)
+{
+CPUArchState *env = mon_get_cpu_env(mon);
+return cpsr_read(env);
+}
+
+const MonitorDef monitor_defs[] = {
+{ "sp|r13", offsetof(CPUARMState, regs[13])},
+{ "lr|r14", offsetof(CPUARMState, regs[14])},
+#ifndef TARGET_AARCH64
+{ "pc|r15|ip", offsetof(CPUARMState, regs[15]) },
+#else
+{ "pc|ip", offsetof(CPUARMState, pc) },
+#endif
+
+/* State registers */
+{ "cpsr", 0, &monitor_get_cpsr},
+{ "fpscr", offsetof(CPUARMState, vfp.fp_status)},
+
+{ NULL }
+};
+
+const MonitorDef *target_monitor_defs(void)
+{
+return monitor_defs;
+}
-- 
2.37.3




Re: [PATCH v2] hw/virtio/vhost-shadow-virtqueue: Silence GCC error "maybe-uninitialized"

2022-09-10 Thread Richard Henderson

On 9/10/22 16:11, Bernhard Beschow wrote:

GCC issues a false positive warning, resulting in build failure with -Werror:

   In file included from /usr/include/glib-2.0/glib.h:114,
from src/include/glib-compat.h:32,
from src/include/qemu/osdep.h:144,
from ../src/hw/virtio/vhost-shadow-virtqueue.c:10:
   In function ‘g_autoptr_cleanup_generic_gfree’,
   inlined from ‘vhost_handle_guest_kick’ at 
../src/hw/virtio/vhost-shadow-virtqueue.c:292:42:
   /usr/include/glib-2.0/glib/glib-autocleanups.h:28:3: error: ‘elem’ may be 
used uninitialized [-Werror=maybe-uninitialized]
  28 |   g_free (*pp);
 |   ^~~~
   ../src/hw/virtio/vhost-shadow-virtqueue.c: In function 
‘vhost_handle_guest_kick’:
   ../src/hw/virtio/vhost-shadow-virtqueue.c:292:42: note: ‘elem’ was declared 
here
 292 | g_autofree VirtQueueElement *elem;
 |  ^~~~
   cc1: all warnings being treated as errors

There is actually no problem since "elem" is initialized in both branches.
Silence the warning by initializig it with "NULL".

$ gcc --version
gcc (GCC) 12.2.0

Fixes: 9c2ab2f1ec333be8614cc12272d4b91960704dbe ("vhost: stop transfer elem 
ownership in vhost_handle_guest_kick")
Signed-off-by: Bernhard Beschow
---
  hw/virtio/vhost-shadow-virtqueue.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


It's not a false positive, but the fix is correct.

Reviewed-by: Richard Henderson 


r~



[PATCH] tests/9p: split virtio-9p-test.c into tests and 9p client part

2022-09-10 Thread Christian Schoenebeck
This patch is pure refactoring, it does not change behaviour.

virtio-9p-test.c grew to 1657 lines. Let's split this file up between
actual 9p test cases vs. 9p test client, to make it easier to
concentrate on the actual 9p tests.

Move the 9p test client code to a new unit virtio-9p-client.c, which
are basically all functions and types prefixed with v9fs_* already.

Note that some client wrapper functions (do_*) are preserved in
virtio-9p-test.c, simply because these wrapper functions are going to
be wiped with subsequent patches anyway.

As the global QGuestAllocator variable is moved to virtio-9p-client.c,
add a new function v9fs_set_allocator() to be used by virtio-9p-test.c
instead of fiddling with a global variable across units and libraries.

Signed-off-by: Christian Schoenebeck 
---

As I am working on extending the previously sent RFC [1] (which will be
using function calls with named function arguments), I realized that it
makes sense to first split the client code out to a new file, and then
make the upcoming patches based on this patch here. Because that way
I don't have to touch the order of the client functions and the upcoming
patches will therefore become better readable.

[1] https://lore.kernel.org/all/e1odqqv-0003d4...@lizzy.crudebyte.com/

 tests/qtest/libqos/meson.build|   1 +
 tests/qtest/libqos/virtio-9p-client.c | 683 +++
 tests/qtest/libqos/virtio-9p-client.h | 139 +
 tests/qtest/virtio-9p-test.c  | 770 +-
 4 files changed, 849 insertions(+), 744 deletions(-)
 create mode 100644 tests/qtest/libqos/virtio-9p-client.c
 create mode 100644 tests/qtest/libqos/virtio-9p-client.h

diff --git a/tests/qtest/libqos/meson.build b/tests/qtest/libqos/meson.build
index cff83c86d9..ca99c9ab7d 100644
--- a/tests/qtest/libqos/meson.build
+++ b/tests/qtest/libqos/meson.build
@@ -34,6 +34,7 @@ libqos_srcs = files(
 'tpci200.c',
 'virtio.c',
 'virtio-9p.c',
+'virtio-9p-client.c',
 'virtio-balloon.c',
 'virtio-blk.c',
 'vhost-user-blk.c',
diff --git a/tests/qtest/libqos/virtio-9p-client.c 
b/tests/qtest/libqos/virtio-9p-client.c
new file mode 100644
index 00..c1d2478d02
--- /dev/null
+++ b/tests/qtest/libqos/virtio-9p-client.c
@@ -0,0 +1,683 @@
+/*
+ * 9P network client for VirtIO 9P test cases (based on QTest)
+ *
+ * Copyright (c) 2014 SUSE LINUX Products GmbH
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * Not so fast! You might want to read the 9p developer docs first:
+ * https://wiki.qemu.org/Documentation/9p
+ */
+
+#include "virtio-9p-client.h"
+
+#define QVIRTIO_9P_TIMEOUT_US (10 * 1000 * 1000)
+static QGuestAllocator *alloc;
+
+void v9fs_set_allocator(QGuestAllocator *t_alloc)
+{
+alloc = t_alloc;
+}
+
+void v9fs_memwrite(P9Req *req, const void *addr, size_t len)
+{
+qtest_memwrite(req->qts, req->t_msg + req->t_off, addr, len);
+req->t_off += len;
+}
+
+void v9fs_memskip(P9Req *req, size_t len)
+{
+req->r_off += len;
+}
+
+void v9fs_memread(P9Req *req, void *addr, size_t len)
+{
+qtest_memread(req->qts, req->r_msg + req->r_off, addr, len);
+req->r_off += len;
+}
+
+void v9fs_uint8_read(P9Req *req, uint8_t *val)
+{
+v9fs_memread(req, val, 1);
+}
+
+void v9fs_uint16_write(P9Req *req, uint16_t val)
+{
+uint16_t le_val = cpu_to_le16(val);
+
+v9fs_memwrite(req, &le_val, 2);
+}
+
+void v9fs_uint16_read(P9Req *req, uint16_t *val)
+{
+v9fs_memread(req, val, 2);
+le16_to_cpus(val);
+}
+
+void v9fs_uint32_write(P9Req *req, uint32_t val)
+{
+uint32_t le_val = cpu_to_le32(val);
+
+v9fs_memwrite(req, &le_val, 4);
+}
+
+void v9fs_uint64_write(P9Req *req, uint64_t val)
+{
+uint64_t le_val = cpu_to_le64(val);
+
+v9fs_memwrite(req, &le_val, 8);
+}
+
+void v9fs_uint32_read(P9Req *req, uint32_t *val)
+{
+v9fs_memread(req, val, 4);
+le32_to_cpus(val);
+}
+
+void v9fs_uint64_read(P9Req *req, uint64_t *val)
+{
+v9fs_memread(req, val, 8);
+le64_to_cpus(val);
+}
+
+/* len[2] string[len] */
+uint16_t v9fs_string_size(const char *string)
+{
+size_t len = strlen(string);
+
+g_assert_cmpint(len, <=, UINT16_MAX - 2);
+
+return 2 + len;
+}
+
+void v9fs_string_write(P9Req *req, const char *string)
+{
+int len = strlen(string);
+
+g_assert_cmpint(len, <=, UINT16_MAX);
+
+v9fs_uint16_write(req, (uint16_t) len);
+v9fs_memwrite(req, string, len);
+}
+
+void v9fs_string_read(P9Req *req, uint16_t *len, char **string)
+{
+uint16_t local_len;
+
+v9fs_uint16_read(req, &local_len);
+if (len) {
+*len = local_len;
+}
+if (string) {
+*string = g_malloc(local_len + 1);
+v9fs_memread(req, *string, local_len);
+(*string)[local_len] = 0;
+} else {
+v9fs_memskip(req, local_len);
+}
+}
+
+typedef struct {
+uint32_t size;
+uint8_t id;
+

Re: [PATCH v2] hw/virtio/vhost-shadow-virtqueue: Silence GCC error "maybe-uninitialized"

2022-09-10 Thread Bernhard Beschow
Am 10. September 2022 15:11:17 UTC schrieb Bernhard Beschow :
>GCC issues a false positive warning, resulting in build failure with -Werror:
>
>  In file included from /usr/include/glib-2.0/glib.h:114,
>   from src/include/glib-compat.h:32,
>   from src/include/qemu/osdep.h:144,
>   from ../src/hw/virtio/vhost-shadow-virtqueue.c:10:
>  In function ‘g_autoptr_cleanup_generic_gfree’,
>  inlined from ‘vhost_handle_guest_kick’ at 
> ../src/hw/virtio/vhost-shadow-virtqueue.c:292:42:
>  /usr/include/glib-2.0/glib/glib-autocleanups.h:28:3: error: ‘elem’ may be 
> used uninitialized [-Werror=maybe-uninitialized]
> 28 |   g_free (*pp);
>|   ^~~~
>  ../src/hw/virtio/vhost-shadow-virtqueue.c: In function 
> ‘vhost_handle_guest_kick’:
>  ../src/hw/virtio/vhost-shadow-virtqueue.c:292:42: note: ‘elem’ was declared 
> here
>292 | g_autofree VirtQueueElement *elem;
>|  ^~~~
>  cc1: all warnings being treated as errors
>
>There is actually no problem since "elem" is initialized in both branches.
>Silence the warning by initializig it with "NULL".
>
>$ gcc --version
>gcc (GCC) 12.2.0
>
>Fixes: 9c2ab2f1ec333be8614cc12272d4b91960704dbe ("vhost: stop transfer elem 
>ownership in vhost_handle_guest_kick")
>Signed-off-by: Bernhard Beschow 

Forgot to add from v1:

Reviewed-by: Philippe Mathieu-Daudé 

>---
> hw/virtio/vhost-shadow-virtqueue.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
>b/hw/virtio/vhost-shadow-virtqueue.c
>index e8e5bbc368..596d4434d2 100644
>--- a/hw/virtio/vhost-shadow-virtqueue.c
>+++ b/hw/virtio/vhost-shadow-virtqueue.c
>@@ -289,7 +289,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue 
>*svq)
> virtio_queue_set_notification(svq->vq, false);
> 
> while (true) {
>-g_autofree VirtQueueElement *elem;
>+g_autofree VirtQueueElement *elem = NULL;
> int r;
> 
> if (svq->next_guest_avail_elem) {




Re:Re: [PATCH] bugfix:migrate with block-dirty-bitmap (disk size is big enough) can't be finished

2022-09-10 Thread liuhaiwei9699



sometimes ,post-copy mode is not the best choice. For instance, Supposing 
migrate process will take ten minutes,but network may be interruptted In this 
process .
If it does happenthe , memory data of VM will be splitted into two parts, and 
will not be rollback.This is a bad situation


so  migrate-start-postcopy will not be setted in conservative scenario. In this 
case, the migration with block dirty bitmap may not be finished.




The migration of block dirty bitmap should not dependent on post-copy or 
pre-copy mode.


At 2022-09-10 18:58:12, "Vladimir Sementsov-Ogievskiy" 
 wrote:
>Hi!
>
>On 9/10/22 13:47, Seaway Liu(刘海伟) wrote:
>> hi,i have a question
>> if failed in migration using post-copy mode,is there some way to restore the 
>> memory data back to soucre VM?
>> 
>
>
>As far as I understand, no, there is not.
>
>Postcopy started actually means: target has started. So, RAM is touched by 
>target VM process, no way to rollback.
>
>Still, things are not so bad: when you enable dirty-bitmaps capability, but 
>not postcopy-ram capability, RAM is migrated in precopy as usual. So, when 
>target started, the only thing that is not yet migrated is dirty bitmap. So, 
>in worst case (migration failure after postcopy started) you'll loose your 
>dirty bitmap. VM is migrated and normally running on target. Unfinished 
>bitmaps on target are automatically released (see cancel_incoming_locked()). 
>So, in worst case you'll have to start your incremental backup chain from a 
>new full-backup.
>
>> 
>> 
>> 发自我的小米
>> 在 Vladimir Sementsov-Ogievskiy ,2022年9月10日 
>> 下午6:18写道:
>> 
>> On 9/10/22 09:35, liuhaiwei wrote:
>>> From: liuhaiwei 
>>>
>>> bug description as  https://gitlab.com/qemu-project/qemu/-/issues/1203
>>> Usually,we use the precopy or postcopy mode to migrate block dirty bitmap.
>>> but if block-dirty-bitmap size more than threshold size,we cannot entry the 
>>> migration_completion in migration_iteration_run function
>>> To solve this problem, we can setting  the pending size to a fake 
>>> value(threshold-1 or 0) to tell  migration_iteration_run function to entry 
>>> the migration_completion,if pending size > threshold size
>>>
>> 
>> 
>> Actually, bitmaps migrate in postcopy. So, you should start postcopy for it 
>> to work (qmp command migrate-start-postcopy). This command simply set the 
>> boolean variable, so that in migration_iteration_run() we'll move to 
>> postcopy when needed. So, you can start this command immediately after 
>> migrate command, or even before it, but after setting the "dirty-bitmaps" 
>> capability.
>> 
>> Fake pending is a wrong thing to do, it means that you will make downtime to 
>> be larger than expected.
>> 
>> --
>> Best regards,
>> Vladimir
>
>
>-- 
>Best regards,
>Vladimir


Re: [PATCH v9 2/7] file-posix: introduce helper functions for sysfs attributes

2022-09-10 Thread Damien Le Moal
On 2022/09/10 14:27, Sam Li wrote:
> Use get_sysfs_str_val() to get the string value of device
> zoned model. Then get_sysfs_zoned_model() can convert it to
> BlockZoneModel type of QEMU.
> 
> Use get_sysfs_long_val() to get the long value of zoned device
> information.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> Reviewed-by: Stefan Hajnoczi 

Looks good to me.

Reviewed-by: Damien Le Moal 

> ---
>  block/file-posix.c   | 121 ++-
>  include/block/block_int-common.h |   3 +
>  2 files changed, 88 insertions(+), 36 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..0a8b4b426e 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1210,66 +1210,109 @@ static int hdev_get_max_hw_transfer(int fd, struct 
> stat *st)
>  #endif
>  }
>  
> -static int hdev_get_max_segments(int fd, struct stat *st)
> -{
> +/*
> + * Get a sysfs attribute value as character string.
> + */
> +static int get_sysfs_str_val(struct stat *st, const char *attribute,
> + char **val) {
>  #ifdef CONFIG_LINUX
> -char buf[32];
> -const char *end;
> -char *sysfspath = NULL;
> +g_autofree char *sysfspath = NULL;
>  int ret;
> -int sysfd = -1;
> -long max_segments;
> +size_t len;
>  
> -if (S_ISCHR(st->st_mode)) {
> -if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> -return ret;
> -}
> +if (!S_ISBLK(st->st_mode)) {
>  return -ENOTSUP;
>  }
>  
> -if (!S_ISBLK(st->st_mode)) {
> -return -ENOTSUP;
> +sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +major(st->st_rdev), minor(st->st_rdev),
> +attribute);
> +ret = g_file_get_contents(sysfspath, val, &len, NULL);
> +if (ret == -1) {
> +return -ENOENT;
>  }
>  
> -sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -major(st->st_rdev), minor(st->st_rdev));
> -sysfd = open(sysfspath, O_RDONLY);
> -if (sysfd == -1) {
> -ret = -errno;
> -goto out;
> +/* The file is ended with '\n' */
> +char *p;
> +p = *val;
> +if (*(p + len - 1) == '\n') {
> +*(p + len - 1) = '\0';
>  }
> -do {
> -ret = read(sysfd, buf, sizeof(buf) - 1);
> -} while (ret == -1 && errno == EINTR);
> +return ret;
> +#else
> +return -ENOTSUP;
> +#endif
> +}
> +
> +static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned) {
> +g_autofree char *val = NULL;
> +int ret;
> +
> +ret = get_sysfs_str_val(st, "zoned", &val);
>  if (ret < 0) {
> -ret = -errno;
> -goto out;
> -} else if (ret == 0) {
> -ret = -EIO;
> -goto out;
> +return ret;
>  }
> -buf[ret] = 0;
> -/* The file is ended with '\n', pass 'end' to accept that. */
> -ret = qemu_strtol(buf, &end, 10, &max_segments);
> -if (ret == 0 && end && *end == '\n') {
> -ret = max_segments;
> +
> +if (strcmp(val, "host-managed") == 0) {
> +*zoned = BLK_Z_HM;
> +} else if (strcmp(val, "host-aware") == 0) {
> +*zoned = BLK_Z_HA;
> +} else if (strcmp(val, "none") == 0) {
> +*zoned = BLK_Z_NONE;
> +} else {
> +return -ENOTSUP;
>  }
> +return 0;
> +}
>  
> -out:
> -if (sysfd != -1) {
> -close(sysfd);
> +/*
> + * Get a sysfs attribute value as a long integer.
> + */
> +static long get_sysfs_long_val(struct stat *st, const char *attribute) {
> +#ifdef CONFIG_LINUX
> +g_autofree char *str = NULL;
> +const char *end;
> +long val;
> +int ret;
> +
> +ret = get_sysfs_str_val(st, attribute, &str);
> +if (ret < 0) {
> +return ret;
> +}
> +
> +/* The file is ended with '\n', pass 'end' to accept that. */
> +ret = qemu_strtol(str, &end, 10, &val);
> +if (ret == 0 && end && *end == '\0') {
> +ret = val;
>  }
> -g_free(sysfspath);
>  return ret;
>  #else
>  return -ENOTSUP;
>  #endif
>  }
>  
> +static int hdev_get_max_segments(int fd, struct stat *st) {
> +#ifdef CONFIG_LINUX
> +int ret;
> +
> +if (S_ISCHR(st->st_mode)) {
> +if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> +return ret;
> +}
> +return -ENOTSUP;
> +}
> +return get_sysfs_long_val(st, "max_segments");
> +#else
> +return -ENOTSUP;
> +#endif
> +}
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
>  struct stat st;
> +int ret;
> +BlockZoneModel zoned;
>  
>  s->needs_alignment = raw_needs_alignment(bs);
>  raw_probe_alignment(bs, s->fd, errp);
> @@ -1307,6 +1350,12 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  bs->bl.max_hw_iov = ret;
>  }
>  }
> +
> +ret = get_sysf

Re: [PATCH v9 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-09-10 Thread Damien Le Moal
On 2022/09/10 14:27, Sam Li wrote:
[...]
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +unsigned int *nr_zones,
> +BlockZoneDescriptor *zones)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk); /* increase before waiting */
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * op is the zone operation;
> + * offset is the byte offset from the start of the zoned device;
> + * len is the maximum number of bytes the command should operate on. It
> + * should be aligned with the zone sector size.

This should read:

* offset is the byte offset of the start of the first zone to operate on;
* len is the maximum number of bytes the command should operate on. It
* should be aligned with the device zone size.

No ?

> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +int64_t offset, int64_t len)
> +{
> +int ret;
> +IO_CODE();
> +
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +
> +ret = blk_check_byte_request(blk, offset, len);
> +if (ret < 0) {
> +return ret;
> +}
> +
> +ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 0a8b4b426e..4edfa25d04 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,9 @@
>  #include 
>  #include 
>  #include 
> +#if defined(CONFIG_BLKZONED)
> +#include 
> +#endif
>  #include 
>  #include 
>  #include 
> @@ -216,6 +219,15 @@ typedef struct RawPosixAIOData {
>  PreallocMode prealloc;
>  Error **errp;
>  } truncate;
> +struct {
> +unsigned int *nr_zones;
> +BlockZoneDescriptor *zones;
> +} zone_report;
> +struct {
> +unsigned long zone_op;
> +const char *zone_op_name;
> +bool all;
> +} zone_mgmt;
>  };
>  } RawPosixAIOData;
>  
> @@ -1339,7 +1351,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  #endif
>  
>  if (bs->sg || S_ISBLK(st.st_mode)) {
> -int ret = hdev_get_max_hw_transfer(s->fd, &st);
> +ret = hdev_get_max_hw_transfer(s->fd, &st);
>  
>  if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>  bs->bl.max_hw_transfer = ret;
> @@ -1356,6 +1368,27 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  zoned = BLK_Z_NONE;
>  }
>  bs->bl.zoned = zoned;
> +if (zoned != BLK_Z_NONE) {
> +ret = get_sysfs_long_val(&st, "chunk_sectors");
> +if (ret > 0) {
> +bs->bl.zone_sectors = ret;
> +}

It may be good to check that we are getting a valid zone size here. So may be
change the check to something like this ?

if (ret <= 0) {
*** print some error message mentioning the invalid zone size ***
bs->bl.zoned = BLK_Z_NONE;
return;
}
bs->bl.zone_sectors = ret;

> +
> +ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
> +if (ret > 0) {
> +bs->bl.max_append_sectors = ret / 512;
> +}
> +
> +ret = get_sysfs_long_val(&st, "max_open_zones");
> +if (ret >= 0) {
> +bs->bl.max_open_zones = ret;
> +}
> +
> +ret = get_sysfs_long_val(&st, "max_active_zones");
> +if (ret >= 0) {
> +bs->bl.max_active_zones = ret;
> +}
> +}
>  }
>  
>  static int check_for_dasd(int fd)
> @@ -1850,6 +1883,145 @@ static off_t copy_file_range(int in_fd, off_t 
> *in_off, int out_fd,
>  }
>  #endif
>  
> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +#if defined(CONFIG_BLKZONED)
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +  const struct blk_zone *blkz) {
> +zone->start = blkz->start;
> +zone->length = blkz->len;
> +zone->cap = blkz->capacity;
> +zone->wp = blkz->wp;
> +
> +switch (blkz->type) {
> +case BLK_ZONE_TYPE_SEQWRITE_REQ:
> +zone->type = BLK_ZT_SWR;
> +break;
> +case BLK_ZONE_TYPE_SEQWRITE_PREF:
> +zone->type = BLK_ZT_SWP;
> +break;
> +case BLK_ZONE_TYPE_CONVENTIONAL:
> +zone->type = BLK_ZT_CONV;
> +break;
> +default:
> +g_

Re: [PATCH v9 4/7] raw-format: add zone operations to pass through requests

2022-09-10 Thread Damien Le Moal
On 2022/09/10 14:27, Sam Li wrote:
> raw-format driver usually sits on top of file-posix driver. It needs to
> pass through requests of zone commands.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 

Reviewed-by: Damien Le Moal 

> ---
>  block/raw-format.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 69fd650eaf..6b20bd22ef 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -314,6 +314,17 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState 
> *bs,
>  return bdrv_co_pdiscard(bs->file, offset, bytes);
>  }
>  
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t 
> offset,
> +   unsigned int *nr_zones,
> +   BlockZoneDescriptor *zones) {
> +return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
> +}
> +
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp 
> op,
> + int64_t offset, int64_t len) {
> +return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> +}
> +
>  static int64_t raw_getlength(BlockDriverState *bs)
>  {
>  int64_t len;
> @@ -614,6 +625,8 @@ BlockDriver bdrv_raw = {
>  .bdrv_co_pwritev  = &raw_co_pwritev,
>  .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
>  .bdrv_co_pdiscard = &raw_co_pdiscard,
> +.bdrv_co_zone_report  = &raw_co_zone_report,
> +.bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
>  .bdrv_co_block_status = &raw_co_block_status,
>  .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>  .bdrv_co_copy_range_to  = &raw_co_copy_range_to,

-- 
Damien Le Moal
Western Digital Research




Re: [PATCH v9 7/7] docs/zoned-storage: add zoned device documentation

2022-09-10 Thread Damien Le Moal
On 2022/09/10 14:27, Sam Li wrote:
> Add the documentation about the zoned device support to virtio-blk
> emulation.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  docs/devel/zoned-storage.rst   | 41 ++
>  docs/system/qemu-block-drivers.rst.inc |  6 
>  2 files changed, 47 insertions(+)
>  create mode 100644 docs/devel/zoned-storage.rst
> 
> diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
> new file mode 100644
> index 00..ead2d149cc
> --- /dev/null
> +++ b/docs/devel/zoned-storage.rst
> @@ -0,0 +1,41 @@
> +=
> +zoned-storage
> +=
> +
> +Zoned Block Devices (ZBDs) devide the LBA space into block regions called 
> zones
> +that are larger than the LBA size. It can only allow sequential writes, which

s/It/They

> +reduces write amplification in SSDs, leading to higher throughput and 
> increased
> +capacity. More details about ZBDs can be found at:

I would rephrase this like this, to be less assertive about the potential
benefits (as they depend on the vendor implementation):

..., which can reduce write amplification in SSDs, and potentially lead to
higher throughput and increased device capacity.

> +
> +https://zonedstorage.io/docs/introduction/zoned-storage
> +
> +1. Block layer APIs for zoned storage
> +-
> +QEMU block layer has three zoned storage model:
> +- BLK_Z_HM: This model only allows sequential writes access. It supports a 
> set
> +of ZBD-specific I/O request that used by the host to manage device zones.
> +- BLK_Z_HA: It deals with both sequential writes and random writes access.
> +- BLK_Z_NONE: Regular block devices and drive-managed ZBDs are treated as
> +non-zoned devices.
> +
> +The block device information resides inside BlockDriverState. QEMU uses
> +BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
> +block layer while processing I/O requests. A BlockBackend has a root pointer 
> to
> +a BlockDriverState graph(for example, raw format on top of file-posix). The
> +zoned storage information can be propagated from the leaf BlockDriverState 
> all
> +the way up to the BlockBackend. If the zoned storage model in file-posix is
> +set to BLK_Z_HM, then block drivers will declare support for zoned host 
> device.
> +
> +The block layer APIs support commands needed for zoned storage devices,
> +including report zones, four zone operations, and zone append.
> +
> +2. Emulating zoned storage controllers
> +--
> +When the BlockBackend's BlockLimits model reports a zoned storage device, 
> users
> +like the virtio-blk emulation or the qemu-io-cmds.c utility can use block 
> layer
> +APIs for zoned storage emulation or testing.
> +
> +For example, the command line for zone report testing a null_blk device of
> +qemu-io-cmds.c is:
> +$ path/to/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0 
> -c
> +"zrp offset nr_zones"
> diff --git a/docs/system/qemu-block-drivers.rst.inc 
> b/docs/system/qemu-block-drivers.rst.inc
> index dfe5d2293d..0b97227fd9 100644
> --- a/docs/system/qemu-block-drivers.rst.inc
> +++ b/docs/system/qemu-block-drivers.rst.inc
> @@ -430,6 +430,12 @@ Hard disks
>you may corrupt your host data (use the ``-snapshot`` command
>line option or modify the device permissions accordingly).
>  
> +Zoned block devices
> +  Zoned block devices can be passed through to the guest if the emulated 
> storage
> +  controller supports zoned storage. Use ``--blockdev zoned_host_device,
> +  node-name=drive0,filename=/dev/nullb0`` to pass through ``/dev/nullb0``
> +  as ``drive0``.
> +
>  Windows
>  ^^^
>  

-- 
Damien Le Moal
Western Digital Research




Re: [PATCH v9 5/7] config: add check to block layer

2022-09-10 Thread Damien Le Moal
On 2022/09/10 14:27, Sam Li wrote:
> Putting zoned/non-zoned BlockDrivers on top of each other is not
> allowed.
> 
> Signed-off-by: Sam Li 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  block.c  | 14 ++
>  block/file-posix.c   | 14 ++
>  block/raw-format.c   |  1 +
>  include/block/block_int-common.h |  5 +
>  4 files changed, 34 insertions(+)
> 
> diff --git a/block.c b/block.c
> index bc85f46eed..dad2ed3959 100644
> --- a/block.c
> +++ b/block.c
> @@ -7947,6 +7947,20 @@ void bdrv_add_child(BlockDriverState *parent_bs, 
> BlockDriverState *child_bs,
>  return;
>  }
>  
> +/*
> + * Non-zoned block drivers do not follow zoned storage constraints
> + * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
> + * drivers in a graph.
> + */
> +if (!parent_bs->drv->supports_zoned_children &&
> +child_bs->bl.zoned == BLK_Z_HM) {

Shouldn't this be "child_bs->bl.zoned != BLK_Z_NONE" ?

> +error_setg(errp, "Cannot add a %s child to a %s parent",
> +   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
> +   parent_bs->drv->supports_zoned_children ?
> +   "support zoned children" : "not support zoned children");
> +return;
> +}
> +
>  if (!QLIST_EMPTY(&child_bs->parents)) {
>  error_setg(errp, "The node %s already has a parent",
> child_bs->node_name);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 4edfa25d04..354de22860 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -779,6 +779,20 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> *options,
>  goto fail;
>  }
>  }
> +#ifdef CONFIG_BLKZONED
> +/*
> + * The kernel page chache does not reliably work for writes to SWR zones
> + * of zoned block device because it can not guarantee the order of 
> writes.
> + */
> +if (strcmp(bs->drv->format_name, "zoned_host_device") == 0) {
> +if (!(s->open_flags & O_DIRECT)) {
> +error_setg(errp, "driver=zoned_host_device was specified, but it 
> "
> + "requires cache.direct=on, which was not 
> specified.");
> +ret = -EINVAL;

This line is not needed. Simply "return -EINVAL;".

> +return ret; /* No host kernel page cache */
> +}
> +}
> +#endif
>  
>  if (S_ISBLK(st.st_mode)) {
>  #ifdef BLKDISCARDZEROES
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 6b20bd22ef..9441536819 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -614,6 +614,7 @@ static void raw_child_perm(BlockDriverState *bs, 
> BdrvChild *c,
>  BlockDriver bdrv_raw = {
>  .format_name  = "raw",
>  .instance_size= sizeof(BDRVRawState),
> +.supports_zoned_children = true,
>  .bdrv_probe   = &raw_probe,
>  .bdrv_reopen_prepare  = &raw_reopen_prepare,
>  .bdrv_reopen_commit   = &raw_reopen_commit,
> diff --git a/include/block/block_int-common.h 
> b/include/block/block_int-common.h
> index 078ddd7e67..043aa161a0 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -127,6 +127,11 @@ struct BlockDriver {
>   */
>  bool is_format;
>  
> +/*
> + * Set to true if the BlockDriver supports zoned children.
> + */
> +bool supports_zoned_children;
> +
>  /*
>   * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
>   * this field set to true, except ones that are defined only by their

-- 
Damien Le Moal
Western Digital Research




Re: [PATCH] block: introduce zone append write for zoned devices

2022-09-10 Thread Damien Le Moal
On 2022/09/10 15:38, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of the write is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been placed in the zone.

s/placed/written

You need to explain more about what this patch does:

Since Linux does not provide a user API to issue zone append operations to zoned
devices from user space, the file-posix driver is modified to add zone append
emulation using regular write operations. To do this, the file-posix driver
tracks the wp location of all zones of the device Blah.

> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c  |  65 +++
>  block/file-posix.c | 169 -
>  block/io.c |  21 
>  block/raw-format.c |   7 ++
>  include/block/block-common.h   |   2 +
>  include/block/block-io.h   |   3 +
>  include/block/block_int-common.h   |   9 ++
>  include/block/raw-aio.h|   4 +-
>  include/sysemu/block-backend-io.h  |   9 ++
>  qemu-io-cmds.c |  62 +++
>  tests/qemu-iotests/tests/zoned.out |   7 ++
>  tests/qemu-iotests/tests/zoned.sh  |   9 ++
>  12 files changed, 360 insertions(+), 7 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ebe8d7bdf3..b77a1cb24b 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  BlockZoneOp op;
>  } zone_mgmt;
> +struct {
> +int64_t *append_sector;
> +} zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void blk_aio_zone_append_entry(void *opaque) {
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +.zone_append = {
> +.append_sector = offset,
> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1920,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +
> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 354de22860..65500e43f4 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -173,6 +173,7 @@ typedef struct BDRVRawState {
>  } stats;
>  
>  PRManager *pr_mgr;
> +CoRwlock zones_lock;
>  } BDRVRawState;
>  
>  typedef struct BDRVRawReopenState {
> @@ -206,6 +207,8 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +int64_t *append_sector;
> +BlockZoneDescriptor *zone;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1333,6 +1336,9 @@ static int hdev_get_max_segments(int fd, struct stat 
> *st) {
>  #endif
>  }
>  
> +static inline void parse_zone(struct

Re: [PATCH v2 3/5] msmouse: Use fifo8 instead of array

2022-09-10 Thread Volker Rümelin

Am 08.09.22 um 19:31 schrieb Arwed Meyer:


@@ -54,21 +60,15 @@ DECLARE_INSTANCE_CHECKER(MouseChardev, MOUSE_CHARDEV,
  static void msmouse_chr_accept_input(Chardev *chr)
  {
  MouseChardev *mouse = MOUSE_CHARDEV(chr);
-int len;
+uint32_t len_out, len;

-len = qemu_chr_be_can_write(chr);
-if (len > mouse->outlen) {
-len = mouse->outlen;
-}
-if (!len) {
+len_out = qemu_chr_be_can_write(chr);
+if (!len_out || fifo8_is_empty(&mouse->outbuf)) {
  return;
  }
-
-qemu_chr_be_write(chr, mouse->outbuf, len);
-mouse->outlen -= len;
-if (mouse->outlen) {
-memmove(mouse->outbuf, mouse->outbuf + len, mouse->outlen);
-}
+len = MIN(fifo8_num_used(&mouse->outbuf), len_out);
+qemu_chr_be_write(chr, fifo8_pop_buf(&mouse->outbuf, len, &len_out),
+len_out);


Hi Arwed,

I think C function arguments are not evaluated in a defined order. It's 
not defined if the third argument of function qemu_chr_be_write() is 
len_out before or after the call to fifo8_pop_buf().


The fifo_pop_buf() function uses a ringbuffer. When the buffer wraps 
around at the end and the ringbuffer contains more than one byte you may 
need two fifo8_pop_buf() and qemu_chr_be_write() calls to write all 
bytes. The code you replace doesn't have that problem.


Some chardev frontends don't return the total number of bytes to write 
in qemu_chr_be_can_write(). They return the number of bytes that can be 
written with one qemu_chr_be_write() call. You need another 
qemu_chr_be_can_write() call after the qemu_chr_be_write() call to see 
if more bytes can be written.


The code in function gd_vc_send_chars() in ui/gtk.c could be used as a 
template to avoid the three issues above.


With best regards,
Volker


  }

  static void msmouse_queue_event(MouseChardev *mouse)





Re: [PATCH v9 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-09-10 Thread Sam Li
Damien Le Moal  于2022年9月11日周日 13:31写道:
>
> On 2022/09/10 14:27, Sam Li wrote:
> [...]
> > +/*
> > + * Send a zone_report command.
> > + * offset is a byte offset from the start of the device. No alignment
> > + * required for offset.
> > + * nr_zones represents IN maximum and OUT actual.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +unsigned int *nr_zones,
> > +BlockZoneDescriptor *zones)
> > +{
> > +int ret;
> > +IO_CODE();
> > +
> > +blk_inc_in_flight(blk); /* increase before waiting */
> > +blk_wait_while_drained(blk);
> > +if (!blk_is_available(blk)) {
> > +blk_dec_in_flight(blk);
> > +return -ENOMEDIUM;
> > +}
> > +ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
> > +blk_dec_in_flight(blk);
> > +return ret;
> > +}
> > +
> > +/*
> > + * Send a zone_management command.
> > + * op is the zone operation;
> > + * offset is the byte offset from the start of the zoned device;
> > + * len is the maximum number of bytes the command should operate on. It
> > + * should be aligned with the zone sector size.
>
> This should read:
>
> * offset is the byte offset of the start of the first zone to operate on;
> * len is the maximum number of bytes the command should operate on. It
> * should be aligned with the device zone size.
>
> No ?

Right. The zone sector size here is meant for the zone size whose unit
is a 512-byte sector.

>
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > +int64_t offset, int64_t len)
> > +{
> > +int ret;
> > +IO_CODE();
> > +
> > +
> > +blk_inc_in_flight(blk);
> > +blk_wait_while_drained(blk);
> > +
> > +ret = blk_check_byte_request(blk, offset, len);
> > +if (ret < 0) {
> > +return ret;
> > +}
> > +
> > +ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
> > +blk_dec_in_flight(blk);
> > +return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >  BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 0a8b4b426e..4edfa25d04 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,9 @@
> >  #include 
> >  #include 
> >  #include 
> > +#if defined(CONFIG_BLKZONED)
> > +#include 
> > +#endif
> >  #include 
> >  #include 
> >  #include 
> > @@ -216,6 +219,15 @@ typedef struct RawPosixAIOData {
> >  PreallocMode prealloc;
> >  Error **errp;
> >  } truncate;
> > +struct {
> > +unsigned int *nr_zones;
> > +BlockZoneDescriptor *zones;
> > +} zone_report;
> > +struct {
> > +unsigned long zone_op;
> > +const char *zone_op_name;
> > +bool all;
> > +} zone_mgmt;
> >  };
> >  } RawPosixAIOData;
> >
> > @@ -1339,7 +1351,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> > Error **errp)
> >  #endif
> >
> >  if (bs->sg || S_ISBLK(st.st_mode)) {
> > -int ret = hdev_get_max_hw_transfer(s->fd, &st);
> > +ret = hdev_get_max_hw_transfer(s->fd, &st);
> >
> >  if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> >  bs->bl.max_hw_transfer = ret;
> > @@ -1356,6 +1368,27 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> > Error **errp)
> >  zoned = BLK_Z_NONE;
> >  }
> >  bs->bl.zoned = zoned;
> > +if (zoned != BLK_Z_NONE) {
> > +ret = get_sysfs_long_val(&st, "chunk_sectors");
> > +if (ret > 0) {
> > +bs->bl.zone_sectors = ret;
> > +}
>
> It may be good to check that we are getting a valid zone size here. So may be
> change the check to something like this ?
>
> if (ret <= 0) {
> *** print some error message mentioning the invalid zone size ***
> bs->bl.zoned = BLK_Z_NONE;
> return;
> }
> bs->bl.zone_sectors = ret;
>

Ok, thanks!

> > +
> > +ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
> > +if (ret > 0) {
> > +bs->bl.max_append_sectors = ret / 512;
> > +}
> > +
> > +ret = get_sysfs_long_val(&st, "max_open_zones");
> > +if (ret >= 0) {
> > +bs->bl.max_open_zones = ret;
> > +}
> > +
> > +ret = get_sysfs_long_val(&st, "max_active_zones");
> > +if (ret >= 0) {
> > +bs->bl.max_active_zones = ret;
> > +}
> > +}
> >  }
> >
> >  static int check_for_dasd(int fd)
> > @@ -1850,6 +1883,145 @@ static off_t copy_file_range(int in_fd, off_t 
> > *in_off, int out_fd,
> >  }
> >  #endif
> >
> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +#if defined(CONFIG_BLKZONED)
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +  const struct blk_zone *blkz) {
> > +zone->start = blkz->st

Re: [PATCH v9 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-09-10 Thread Damien Le Moal
On 2022/09/11 15:33, Sam Li wrote:
> Damien Le Moal  于2022年9月11日周日 13:31写道:
[...]
>>> +/*
>>> + * zone management operations - Execute an operation on a zone
>>> + */
>>> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp 
>>> op,
>>> +int64_t offset, int64_t len) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +BDRVRawState *s = bs->opaque;
>>> +RawPosixAIOData acb;
>>> +int64_t zone_sector, zone_sector_mask;
>>> +const char *zone_op_name;
>>> +unsigned long zone_op;
>>> +bool is_all = false;
>>> +
>>> +zone_sector = bs->bl.zone_sectors;
>>> +zone_sector_mask = zone_sector - 1;
>>> +if (offset & zone_sector_mask) {
>>> +error_report("sector offset %" PRId64 " is not aligned to zone 
>>> size "
>>> + "%" PRId64 "", offset, zone_sector);
>>> +return -EINVAL;
>>> +}
>>> +
>>> +if (len & zone_sector_mask) {
>>
>> Linux allows SMR drives to have a smaller last zone. So this needs to be
>> accounted for here. Otherwise, a zone operation that includes the last 
>> smaller
>> zone would always fail. Something like this would work:
>>
>> if (((offset + len) < capacity &&
>> len & zone_sector_mask) ||
>> offset + len > capacity) {
>>
> 
> I see. I think the offset can be removed, like:
> if (((len < capacity && len & zone_sector_mask) || len > capacity) {
> Then if we use the previous zone's len for the last smaller zone, it
> will be greater than its capacity.

Nope, you cannot remove the offset since the zone operation may be for that last
zone only, that is, offset == last zone start and len == last zone smaller size.
In that case, len is alwats smaller than capacity.

> 
> I will also include "opening the last zone" as a test case later.

Note that you can create such smaller last zone on the host with null_blk by
specifying a device capacity that is *not* a multiple of the zone size.

> 
>>> +error_report("number of sectors %" PRId64 " is not aligned to zone 
>>> size"
>>> +  " %" PRId64 "", len, zone_sector);
>>> +return -EINVAL;
>>> +}
>>> +
>>> +switch (op) {
>>> +case BLK_ZO_OPEN:
>>> +zone_op_name = "BLKOPENZONE";
>>> +zone_op = BLKOPENZONE;
>>> +break;
>>> +case BLK_ZO_CLOSE:
>>> +zone_op_name = "BLKCLOSEZONE";
>>> +zone_op = BLKCLOSEZONE;
>>> +break;
>>> +case BLK_ZO_FINISH:
>>> +zone_op_name = "BLKFINISHZONE";
>>> +zone_op = BLKFINISHZONE;
>>> +break;
>>> +case BLK_ZO_RESET:
>>> +zone_op_name = "BLKRESETZONE";
>>> +zone_op = BLKRESETZONE;
>>> +break;
>>> +default:
>>> +g_assert_not_reached();
>>> +}
>>> +
>>> +acb = (RawPosixAIOData) {
>>> +.bs = bs,
>>> +.aio_fildes = s->fd,
>>> +.aio_type   = QEMU_AIO_ZONE_MGMT,
>>> +.aio_offset = offset,
>>> +.aio_nbytes = len,
>>> +.zone_mgmt  = {
>>> +.zone_op = zone_op,
>>> +.zone_op_name = zone_op_name,
>>> +.all = is_all,
>>> +},
>>> +};
>>> +
>>> +return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>>> +#else
>>> +return -ENOTSUP;
>>> +#endif
>>> +}

-- 
Damien Le Moal
Western Digital Research




Re: [PATCH v9 5/7] config: add check to block layer

2022-09-10 Thread Sam Li
Damien Le Moal  于2022年9月11日周日 13:34写道:
>
> On 2022/09/10 14:27, Sam Li wrote:
> > Putting zoned/non-zoned BlockDrivers on top of each other is not
> > allowed.
> >
> > Signed-off-by: Sam Li 
> > Reviewed-by: Stefan Hajnoczi 
> > ---
> >  block.c  | 14 ++
> >  block/file-posix.c   | 14 ++
> >  block/raw-format.c   |  1 +
> >  include/block/block_int-common.h |  5 +
> >  4 files changed, 34 insertions(+)
> >
> > diff --git a/block.c b/block.c
> > index bc85f46eed..dad2ed3959 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -7947,6 +7947,20 @@ void bdrv_add_child(BlockDriverState *parent_bs, 
> > BlockDriverState *child_bs,
> >  return;
> >  }
> >
> > +/*
> > + * Non-zoned block drivers do not follow zoned storage constraints
> > + * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
> > + * drivers in a graph.
> > + */
> > +if (!parent_bs->drv->supports_zoned_children &&
> > +child_bs->bl.zoned == BLK_Z_HM) {
>
> Shouldn't this be "child_bs->bl.zoned != BLK_Z_NONE" ?

The host-aware model allows zoned storage constraints(sequentially
write) and random write. Is mixing HA and non-zoned drivers allowed?
What's the difference?

>
> > +error_setg(errp, "Cannot add a %s child to a %s parent",
> > +   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
> > +   parent_bs->drv->supports_zoned_children ?
> > +   "support zoned children" : "not support zoned 
> > children");
> > +return;
> > +}
> > +
> >  if (!QLIST_EMPTY(&child_bs->parents)) {
> >  error_setg(errp, "The node %s already has a parent",
> > child_bs->node_name);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 4edfa25d04..354de22860 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -779,6 +779,20 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> > *options,
> >  goto fail;
> >  }
> >  }
> > +#ifdef CONFIG_BLKZONED
> > +/*
> > + * The kernel page chache does not reliably work for writes to SWR 
> > zones
> > + * of zoned block device because it can not guarantee the order of 
> > writes.
> > + */
> > +if (strcmp(bs->drv->format_name, "zoned_host_device") == 0) {
> > +if (!(s->open_flags & O_DIRECT)) {
> > +error_setg(errp, "driver=zoned_host_device was specified, but 
> > it "
> > + "requires cache.direct=on, which was not 
> > specified.");
> > +ret = -EINVAL;
>
> This line is not needed. Simply "return -EINVAL;".
>
> > +return ret; /* No host kernel page cache */
> > +}
> > +}
> > +#endif
> >
> >  if (S_ISBLK(st.st_mode)) {
> >  #ifdef BLKDISCARDZEROES
> > diff --git a/block/raw-format.c b/block/raw-format.c
> > index 6b20bd22ef..9441536819 100644
> > --- a/block/raw-format.c
> > +++ b/block/raw-format.c
> > @@ -614,6 +614,7 @@ static void raw_child_perm(BlockDriverState *bs, 
> > BdrvChild *c,
> >  BlockDriver bdrv_raw = {
> >  .format_name  = "raw",
> >  .instance_size= sizeof(BDRVRawState),
> > +.supports_zoned_children = true,
> >  .bdrv_probe   = &raw_probe,
> >  .bdrv_reopen_prepare  = &raw_reopen_prepare,
> >  .bdrv_reopen_commit   = &raw_reopen_commit,
> > diff --git a/include/block/block_int-common.h 
> > b/include/block/block_int-common.h
> > index 078ddd7e67..043aa161a0 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -127,6 +127,11 @@ struct BlockDriver {
> >   */
> >  bool is_format;
> >
> > +/*
> > + * Set to true if the BlockDriver supports zoned children.
> > + */
> > +bool supports_zoned_children;
> > +
> >  /*
> >   * Drivers not implementing bdrv_parse_filename nor bdrv_open should 
> > have
> >   * this field set to true, except ones that are defined only by their
>
> --
> Damien Le Moal
> Western Digital Research
>



Re: [PATCH] block: introduce zone append write for zoned devices

2022-09-10 Thread Damien Le Moal
On 2022/09/11 15:06, Damien Le Moal wrote:
> On 2022/09/10 15:38, Sam Li wrote:
>> A zone append command is a write operation that specifies the first
>> logical block of a zone as the write position. When writing to a zoned
>> block device using zone append, the byte offset of the write is pointing
>> to the write pointer of that zone. Upon completion the device will
>> respond with the position the data has been placed in the zone.
> 
> s/placed/written
> 
> You need to explain more about what this patch does:
> 
> Since Linux does not provide a user API to issue zone append operations to 
> zoned
> devices from user space, the file-posix driver is modified to add zone append
> emulation using regular write operations. To do this, the file-posix driver
> tracks the wp location of all zones of the device Blah.

Thinking more about this, I think you should split this patch in 2:
1) first patch adding the tracking of the zones wp.
2) second patch adding zone append emulation

That will make the review far easier.

> 
>>
>> Signed-off-by: Sam Li 
>> ---
>>  block/block-backend.c  |  65 +++
>>  block/file-posix.c | 169 -
>>  block/io.c |  21 
>>  block/raw-format.c |   7 ++
>>  include/block/block-common.h   |   2 +
>>  include/block/block-io.h   |   3 +
>>  include/block/block_int-common.h   |   9 ++
>>  include/block/raw-aio.h|   4 +-
>>  include/sysemu/block-backend-io.h  |   9 ++
>>  qemu-io-cmds.c |  62 +++
>>  tests/qemu-iotests/tests/zoned.out |   7 ++
>>  tests/qemu-iotests/tests/zoned.sh  |   9 ++
>>  12 files changed, 360 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/block-backend.c b/block/block-backend.c
>> index ebe8d7bdf3..b77a1cb24b 100644
>> --- a/block/block-backend.c
>> +++ b/block/block-backend.c
>> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>>  struct {
>>  BlockZoneOp op;
>>  } zone_mgmt;
>> +struct {
>> +int64_t *append_sector;
>> +} zone_append;
>>  };
>>  } BlkRwCo;
>>  
>> @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
>> BlockZoneOp op,
>>  return &acb->common;
>>  }
>>  
>> +static void blk_aio_zone_append_entry(void *opaque) {
>> +BlkAioEmAIOCB *acb = opaque;
>> +BlkRwCo *rwco = &acb->rwco;
>> +
>> +rwco->ret = blk_co_zone_append(rwco->blk, 
>> rwco->zone_append.append_sector,
>> +   rwco->iobuf, rwco->flags);
>> +blk_aio_complete(acb);
>> +}
>> +
>> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
>> +QEMUIOVector *qiov, BdrvRequestFlags flags,
>> +BlockCompletionFunc *cb, void *opaque) {
>> +BlkAioEmAIOCB *acb;
>> +Coroutine *co;
>> +IO_CODE();
>> +
>> +blk_inc_in_flight(blk);
>> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>> +acb->rwco = (BlkRwCo) {
>> +.blk= blk,
>> +.ret= NOT_DONE,
>> +.flags  = flags,
>> +.iobuf  = qiov,
>> +.zone_append = {
>> +.append_sector = offset,
>> +},
>> +};
>> +acb->has_returned = false;
>> +
>> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
>> +bdrv_coroutine_enter(blk_bs(blk), co);
>> +
>> +acb->has_returned = true;
>> +if (acb->rwco.ret != NOT_DONE) {
>> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>> + blk_aio_complete_bh, acb);
>> +}
>> +
>> +return &acb->common;
>> +}
>> +
>>  /*
>>   * Send a zone_report command.
>>   * offset is a byte offset from the start of the device. No alignment
>> @@ -1920,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
>> BlockZoneOp op,
>>  return ret;
>>  }
>>  
>> +/*
>> + * Send a zone_append command.
>> + */
>> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
>> +QEMUIOVector *qiov, BdrvRequestFlags flags)
>> +{
>> +int ret;
>> +IO_CODE();
>> +
>> +blk_inc_in_flight(blk);
>> +blk_wait_while_drained(blk);
>> +if (!blk_is_available(blk)) {
>> +blk_dec_in_flight(blk);
>> +return -ENOMEDIUM;
>> +}
>> +
>> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
>> +blk_dec_in_flight(blk);
>> +return ret;
>> +}
>> +
>>  void blk_drain(BlockBackend *blk)
>>  {
>>  BlockDriverState *bs = blk_bs(blk);
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 354de22860..65500e43f4 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -173,6 +173,7 @@ typedef struct BDRVRawState {
>>  } stats;
>>  
>>  PRManager *pr_mgr;
>> +CoRwlock zones_lock;
>>  } BDRVRawState;
>>  
>>  typedef struct BDRVRawReopenState {
>> @@ -206,6 +207,8 @@ type