[PULL 0/1] loongarch-to-apply queue

2022-12-15 Thread Song Gao
The following changes since commit 5204b499a6cae4dfd9fe762d5e6e82224892383b:

  mailmap: Fix Stefan Weil author email (2022-12-13 15:56:57 -0500)

are available in the Git repository at:

  https://gitlab.com/gaosong/qemu.git tags/pull-loongarch-20221215

for you to fetch changes up to 288431a1fb9334d5d57ad7d5854d8475b23e7c42:

  hw/loongarch/virt: Add cfi01 pflash device (2022-12-15 15:46:12 +0800)


Add cfi01 pflash device


Xiaojuan Yang (1):
  hw/loongarch/virt: Add cfi01 pflash device

 hw/loongarch/Kconfig        |  1 +
 hw/loongarch/acpi-build.c   | 18 +++
 hw/loongarch/virt.c         | 62 +++++++++
 include/hw/loongarch/virt.h |  5 +++
 4 files changed, 86 insertions(+)




[PULL 1/1] hw/loongarch/virt: Add cfi01 pflash device

2022-12-15 Thread Song Gao
From: Xiaojuan Yang 

Add cfi01 pflash device for LoongArch virt machine

Signed-off-by: Xiaojuan Yang 
Reviewed-by: Philippe Mathieu-Daudé 
Message-Id: <20221130100647.398565-1-yangxiaoj...@loongson.cn>
Signed-off-by: Song Gao 
---
 hw/loongarch/Kconfig        |  1 +
 hw/loongarch/acpi-build.c   | 18 +++
 hw/loongarch/virt.c         | 62 +++++++++
 include/hw/loongarch/virt.h |  5 +++
 4 files changed, 86 insertions(+)

diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig
index 17d15b6c90..eb112af990 100644
--- a/hw/loongarch/Kconfig
+++ b/hw/loongarch/Kconfig
@@ -20,3 +20,4 @@ config LOONGARCH_VIRT
 select ACPI_HW_REDUCED
 select FW_CFG_DMA
 select DIMM
+select PFLASH_CFI01
diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c
index 7d5f5a757d..c2b237736d 100644
--- a/hw/loongarch/acpi-build.c
+++ b/hw/loongarch/acpi-build.c
@@ -279,6 +279,23 @@ static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams)
 acpi_dsdt_add_gpex(scope, &cfg);
 }
 
+static void build_flash_aml(Aml *scope, LoongArchMachineState *lams)
+{
+Aml *dev, *crs;
+
+hwaddr flash_base = VIRT_FLASH_BASE;
+hwaddr flash_size = VIRT_FLASH_SIZE;
+
+dev = aml_device("FLS0");
+aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015")));
+aml_append(dev, aml_name_decl("_UID", aml_int(0)));
+
+crs = aml_resource_template();
+aml_append(crs, aml_memory32_fixed(flash_base, flash_size,
+                                   AML_READ_WRITE));
+aml_append(dev, aml_name_decl("_CRS", crs));
+aml_append(scope, dev);
+}
+
 #ifdef CONFIG_TPM
 static void acpi_dsdt_add_tpm(Aml *scope, LoongArchMachineState *vms)
 {
@@ -328,6 +345,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 build_uart_device_aml(dsdt);
 build_pci_device_aml(dsdt, lams);
 build_la_ged_aml(dsdt, machine);
+build_flash_aml(dsdt, lams);
 #ifdef CONFIG_TPM
 acpi_dsdt_add_tpm(dsdt, lams);
 #endif
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 958be74fa1..c8a495ea30 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -42,6 +42,63 @@
 #include "hw/display/ramfb.h"
 #include "hw/mem/pc-dimm.h"
 #include "sysemu/tpm.h"
+#include "sysemu/block-backend.h"
+#include "hw/block/flash.h"
+
+static void virt_flash_create(LoongArchMachineState *lams)
+{
+DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01);
+
+qdev_prop_set_uint64(dev, "sector-length", VIRT_FLASH_SECTOR_SIZE);
+qdev_prop_set_uint8(dev, "width", 4);
+qdev_prop_set_uint8(dev, "device-width", 2);
+qdev_prop_set_bit(dev, "big-endian", false);
+qdev_prop_set_uint16(dev, "id0", 0x89);
+qdev_prop_set_uint16(dev, "id1", 0x18);
+qdev_prop_set_uint16(dev, "id2", 0x00);
+qdev_prop_set_uint16(dev, "id3", 0x00);
+qdev_prop_set_string(dev, "name", "virt.flash");
+object_property_add_child(OBJECT(lams), "virt.flash", OBJECT(dev));
+object_property_add_alias(OBJECT(lams), "pflash",
+  OBJECT(dev), "drive");
+
+lams->flash = PFLASH_CFI01(dev);
+}
+
+static void virt_flash_map(LoongArchMachineState *lams,
+   MemoryRegion *sysmem)
+{
+PFlashCFI01 *flash = lams->flash;
+DeviceState *dev = DEVICE(flash);
+hwaddr base = VIRT_FLASH_BASE;
+hwaddr size = VIRT_FLASH_SIZE;
+
+assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE));
+assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX);
+
+qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
+memory_region_add_subregion(sysmem, base,
+                            sysbus_mmio_get_region(SYS_BUS_DEVICE(dev),
+                                                   0));
+
+}
+
+static void fdt_add_flash_node(LoongArchMachineState *lams)
+{
+MachineState *ms = MACHINE(lams);
+char *nodename;
+
+hwaddr flash_base = VIRT_FLASH_BASE;
+hwaddr flash_size = VIRT_FLASH_SIZE;
+
+nodename = g_strdup_printf("/flash@%" PRIx64, flash_base);
+qemu_fdt_add_subnode(ms->fdt, nodename);
+qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cfi-flash");
+qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg",
+ 2, flash_base, 2, flash_size);
+qemu_fdt_setprop_cell(ms->fdt, nodename, "bank-width", 4);
+g_free(nodename);
+}
 
 static void fdt_add_rtc_node(LoongArchMachineState *lams)
 {
@@ -596,6 +653,9 @@ static void loongarch_firmware_init(LoongArchMachineState *lams)
 int bios_size;
 
 lams->bios_loaded = false;
+
+virt_flash_map(lams, get_system_memory());
+
 if (filename) {
 bios_name = qemu_find_file(QEMU_FILE_TYPE_BIOS, filename);
 if (!bios_name) {
@@ -779,6 +839,7 @@ static void loongarch_init(MachineState *machine)
 loongarch_direct_kernel_boot(lams);
 }
 }
+fdt_add_flash_node(lams);
 /* register reset function */
 for (i = 0; i < machine->smp.cpus; i++) {
   

Re: [PULL for 7.2-rc4 0/1] loongarch for 7.2-rc4 patch

2022-12-15 Thread gaosong



On 2022/12/15 at 15:29, Philippe Mathieu-Daudé wrote:

Hi,

On 2/12/22 11:25, Song Gao wrote:
The following changes since commit c4ffd91aba1c3d878e99a3e7ba8aad4826728ece:


   Update VERSION for v7.2.0-rc3 (2022-11-29 18:15:26 -0500)

are available in the Git repository at:

   https://gitlab.com/gaosong/qemu.git tags/pull-loongarch-20221202

for you to fetch changes up to 14dccc8ea6ece7ee63273144fb55e4770a05e0fd:

   hw/loongarch/virt: Add cfi01 pflash device (2022-12-02 18:03:05 +0800)





Now that the 8.0 cycle has started, can you respin this pull request
(preferably rebased on v7.2.0 or later)?


Done.

Thanks.
Song Gao




Re: [PATCH] linux-user: Add translation for argument of msync()

2022-12-15 Thread Helge Deller

On 12/15/22 08:58, Philippe Mathieu-Daudé wrote:

On 15/12/22 08:27, Helge Deller wrote:

msync() uses the flags MS_ASYNC, MS_INVALIDATE and MS_SYNC, which differ
between platforms, specifically on alpha and hppa.

Add a target to host translation for those and wire up a nicer strace
output.

This fixes the testsuite of the macaulay2 Debian package with a hppa-linux
guest on an x86-64 host.
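
As a minimal, self-contained sketch of the translation being added (the
target values are the hppa ones from this patch; the host values are the
generic Linux ones on x86-64; the main() driver is purely illustrative):

#include <stdio.h>

/* Guest (hppa) values, as defined in linux-user/hppa/target_mman.h below */
#define TARGET_MS_SYNC       1
#define TARGET_MS_ASYNC      2
#define TARGET_MS_INVALIDATE 4

/* Host (x86-64) values, as in <sys/mman.h> */
#define HOST_MS_ASYNC      1
#define HOST_MS_INVALIDATE 2
#define HOST_MS_SYNC       4

/* Same logic as target_to_host_msync_arg() in the hunk below */
static int target_to_host_msync_arg(int arg)
{
    return ((arg & TARGET_MS_ASYNC) ? HOST_MS_ASYNC : 0) |
           ((arg & TARGET_MS_INVALIDATE) ? HOST_MS_INVALIDATE : 0) |
           ((arg & TARGET_MS_SYNC) ? HOST_MS_SYNC : 0) |
           (arg & ~(TARGET_MS_ASYNC | TARGET_MS_INVALIDATE | TARGET_MS_SYNC));
}

int main(void)
{
    /* A hppa guest passing MS_SYNC (1) must reach the host as 4 */
    printf("%d\n", target_to_host_msync_arg(TARGET_MS_SYNC)); /* prints 4 */
    return 0;
}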

Signed-off-by: Helge Deller 

diff --git a/linux-user/alpha/target_mman.h b/linux-user/alpha/target_mman.h
index cd6e3d70a6..051544f5ab 100644
--- a/linux-user/alpha/target_mman.h
+++ b/linux-user/alpha/target_mman.h
@@ -3,6 +3,10 @@

  #define TARGET_MADV_DONTNEED 6

+#define TARGET_MS_ASYNC 1
+#define TARGET_MS_SYNC 2
+#define TARGET_MS_INVALIDATE 4
+
  #include "../generic/target_mman.h"

  #endif
diff --git a/linux-user/generic/target_mman.h b/linux-user/generic/target_mman.h
index 1436a3c543..32bf1a52d0 100644
--- a/linux-user/generic/target_mman.h
+++ b/linux-user/generic/target_mman.h
@@ -89,4 +89,17 @@
  #define TARGET_MADV_DONTNEED_LOCKED 24
  #endif

+
+#ifndef TARGET_MS_ASYNC
+#define TARGET_MS_ASYNC 1


Hmm don't we want to keep the host flag instead?

    #define TARGET_MS_ASYNC MS_ASYNC


Yes, that would be possible, but the value is the same.
In the *.h files you usually want to have numerical values,
which makes it easier to search for conversion bugs.

I'd prefer to keep it as is; it's done like that for the other
files/values.

Helge





+#endif
+
+#ifndef TARGET_MS_INVALIDATE
+#define TARGET_MS_INVALIDATE 2


Ditto,


+#endif
+
+#ifndef TARGET_MS_SYNC
+#define TARGET_MS_SYNC 4


ditto.

LGTM otherwise.


+#endif
+
  #endif
diff --git a/linux-user/hppa/target_mman.h b/linux-user/hppa/target_mman.h
index 66dd9f7941..f9b6b97032 100644
--- a/linux-user/hppa/target_mman.h
+++ b/linux-user/hppa/target_mman.h
@@ -10,6 +10,10 @@
  #define TARGET_MADV_WIPEONFORK 71
  #define TARGET_MADV_KEEPONFORK 72

+#define TARGET_MS_SYNC 1
+#define TARGET_MS_ASYNC 2
+#define TARGET_MS_INVALIDATE 4
+
  #include "../generic/target_mman.h"

  #endif
diff --git a/linux-user/strace.list b/linux-user/strace.list
index a75101fca1..ac8f872371 100644
--- a/linux-user/strace.list
+++ b/linux-user/strace.list
@@ -650,7 +650,7 @@
  { TARGET_NR_msgsnd, "msgsnd" , NULL, NULL, NULL },
  #endif
  #ifdef TARGET_NR_msync
-{ TARGET_NR_msync, "msync" , NULL, NULL, NULL },
+{ TARGET_NR_msync, "msync" , "%s(%p,%u,%d)", NULL, NULL },
  #endif
  #ifdef TARGET_NR_multiplexer
  { TARGET_NR_multiplexer, "multiplexer" , NULL, NULL, NULL },
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index d58e9b8d10..e541fbe09a 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -22,6 +22,7 @@
  #include "qemu/path.h"
  #include "qemu/memfd.h"
  #include "qemu/queue.h"
+#include "target_mman.h"
  #include 
  #include 
  #include 
@@ -7667,6 +7668,14 @@ static inline int target_to_host_mlockall_arg(int arg)
  }
  #endif

+static inline int target_to_host_msync_arg(abi_long arg)
+{
+    return ((arg & TARGET_MS_ASYNC) ? MS_ASYNC : 0) |
+   ((arg & TARGET_MS_INVALIDATE) ? MS_INVALIDATE : 0) |
+   ((arg & TARGET_MS_SYNC) ? MS_SYNC : 0) |
+   (arg & ~(TARGET_MS_ASYNC | TARGET_MS_INVALIDATE | TARGET_MS_SYNC));
+}
+
  #if (defined(TARGET_NR_stat64) || defined(TARGET_NR_lstat64) || \
   defined(TARGET_NR_fstat64) || defined(TARGET_NR_fstatat64) ||  \
   defined(TARGET_NR_newfstatat))
@@ -10163,7 +10172,8 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int 
num, abi_long arg1,
  /* ??? msync/mlock/munlock are broken for softmmu.  */
  #ifdef TARGET_NR_msync
  case TARGET_NR_msync:
-    return get_errno(msync(g2h(cpu, arg1), arg2, arg3));
+    return get_errno(msync(g2h(cpu, arg1), arg2,
+   target_to_host_msync_arg(arg3)));
  #endif
  #ifdef TARGET_NR_mlock
  case TARGET_NR_mlock:








Re: [PATCH v1 1/2] hw/intc/loongarch_pch_msi: add irq number property

2022-12-15 Thread Tianrui Zhao




On 2022-12-15 at 15:40, Philippe Mathieu-Daudé wrote:

On 15/12/22 07:50, Tianrui Zhao wrote:

This patch adds an irq number property for the loongarch msi interrupt
controller and removes the hard-coded irq number macro.

Signed-off-by: Tianrui Zhao 
---
  hw/intc/loongarch_pch_msi.c | 22 +++---
  hw/loongarch/virt.c | 11 +++
  include/hw/intc/loongarch_pch_msi.h |  3 ++-
  include/hw/pci-host/ls7a.h  |  1 -
  4 files changed, 28 insertions(+), 9 deletions(-)



@@ -49,6 +49,22 @@ static void pch_msi_irq_handler(void *opaque, int irq, int level)

  qemu_set_irq(s->pch_msi_irq[irq], level);
  }
  +static void loongarch_pch_msi_realize(DeviceState *dev, Error **errp)
+{
+LoongArchPCHMSI *s = LOONGARCH_PCH_MSI(dev);
+
+assert(s->irq_num > 0);


   if (!s->irq_num || s->irq_num > PCH_MSI_IRQ_NUM) {
   error_setg(errp, "Invalid 'msi_irq_num'");
   return;
   }


+s->pch_msi_irq = g_malloc(sizeof(qemu_irq) * s->irq_num);


   s->pch_msi_irq = g_new(qemu_irq, s->irq_num);


+if (!s->pch_msi_irq) {
+error_report("loongarch_pch_msi: fail to alloc memory");
+exit(1);
+}
+
+qdev_init_gpio_out(dev, s->pch_msi_irq, s->irq_num);
+qdev_init_gpio_in(dev, pch_msi_irq_handler, s->irq_num);
+}


Missing g_free(s->pch_msi_irq) in loongarch_pch_msi_unrealize().


  static void loongarch_pch_msi_init(Object *obj)
  {
  LoongArchPCHMSI *s = LOONGARCH_PCH_MSI(obj);
@@ -59,12 +75,11 @@ static void loongarch_pch_msi_init(Object *obj)
  sysbus_init_mmio(sbd, &s->msi_mmio);
  msi_nonbroken = true;
  -qdev_init_gpio_out(DEVICE(obj), s->pch_msi_irq, PCH_MSI_IRQ_NUM);
-qdev_init_gpio_in(DEVICE(obj), pch_msi_irq_handler, PCH_MSI_IRQ_NUM);

  }
static Property loongarch_msi_properties[] = {
  DEFINE_PROP_UINT32("msi_irq_base", LoongArchPCHMSI, irq_base, 0),
+DEFINE_PROP_UINT32("msi_irq_num",  LoongArchPCHMSI, irq_num, 0),
  DEFINE_PROP_END_OF_LIST(),
  };
@@ -72,6 +87,7 @@ static void loongarch_pch_msi_class_init(ObjectClass *klass, void *data)

  {
  DeviceClass *dc = DEVICE_CLASS(klass);
  +dc->realize = loongarch_pch_msi_realize;


   dc->unrealize = loongarch_pch_msi_unrealize;


  device_class_set_props(dc, loongarch_msi_properties);
  }
  diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 958be74fa1..3547d5f711 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -496,7 +496,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams)

  LoongArchCPU *lacpu;
  CPULoongArchState *env;
  CPUState *cpu_state;
-int cpu, pin, i;
+int cpu, pin, i, start, num;
ipi = qdev_new(TYPE_LOONGARCH_IPI);
  sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal);
@@ -576,14 +576,17 @@ static void loongarch_irq_init(LoongArchMachineState *lams)

  }
pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI);
-qdev_prop_set_uint32(pch_msi, "msi_irq_base", PCH_MSI_IRQ_START);
+start   =  PCH_PIC_IRQ_NUM;
+num = 256 - start;


This part is confusing. So you don't need PCH_MSI_IRQ_START anymore?
What is this magic '256' value?

On the loongarch platform, both the PCH_pic and PCH_MSI interrupt
controllers are connected to the upper extioi controller; PCH_pic is
triggered by irq line and PCH_MSI by message.


No, PCH_MSI_IRQ_START is not necessary any more. 256 is the total number
of irqs supported by the extioi controller; we will replace it with the
macro EXTIOI_IRQS. We can adjust the irq split between PCH_pic and PCH_MSI
as long as the total is no larger than EXTIOI_IRQS. In general many msi
vectors are required, since there may be many virtio devices, while the
PCH_pic intc needs few: the gpex pcie irq number is 4 and there are few
legacy non-pci devices (such as rtc/uart/acpi ged).

I want to shrink the irq number of the PCH_pic intc and increase the irq
number of the PCH_MSI intc in the future.
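
A minimal sketch of the split described above, assuming EXTIOI_IRQS is the
256 total mentioned in the reply and PCH_PIC_IRQ_NUM is 64 (implied by the
"Connect 192 pch_msi irqs" comment in the patch):

/* Total interrupt inputs of the extioi controller (assumed to be 256) */
#define EXTIOI_IRQS     256
/* Inputs reserved for the line-based PCH_pic controller (implied value) */
#define PCH_PIC_IRQ_NUM  64

/* pch_pic owns extioi inputs [0, start); pch_msi owns [start, EXTIOI_IRQS) */
static void split_extioi_inputs(int *start, int *num)
{
    *start = PCH_PIC_IRQ_NUM;
    *num   = EXTIOI_IRQS - *start;   /* 192 msi vectors */
}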




+qdev_prop_set_uint32(pch_msi, "msi_irq_base", start);
+qdev_prop_set_uint32(pch_msi, "msi_irq_num", num);
  d = SYS_BUS_DEVICE(pch_msi);
  sysbus_realize_and_unref(d, &error_fatal);
  sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW);
-for (i = 0; i < PCH_MSI_IRQ_NUM; i++) {
+for (i = 0; i < num; i++) {
  /* Connect 192 pch_msi irqs to extioi */
  qdev_connect_gpio_out(DEVICE(d), i,
-  qdev_get_gpio_in(extioi, i + PCH_MSI_IRQ_START));
+  qdev_get_gpio_in(extioi, i + start));
  }







[PULL 04/19] migration: Export ram_release_page()

2022-12-15 Thread Juan Quintela
Signed-off-by: Juan Quintela 
Reviewed-by: Leonardo Bras 
---
 migration/ram.h | 1 +
 migration/ram.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/ram.h b/migration/ram.h
index e844966f69..038d52f49f 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -66,6 +66,7 @@ int ram_load_postcopy(QEMUFile *f, int channel);
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 
 void ram_transferred_add(uint64_t bytes);
+void ram_release_page(const char *rbname, uint64_t offset);
 
 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr);
 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset);
diff --git a/migration/ram.c b/migration/ram.c
index 2cbe707bfc..8aad17c429 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1234,7 +1234,7 @@ static void migration_bitmap_sync_precopy(RAMState *rs)
 }
 }
 
-static void ram_release_page(const char *rbname, uint64_t offset)
+void ram_release_page(const char *rbname, uint64_t offset)
 {
 if (!migrate_release_ram() || !migration_in_postcopy()) {
 return;
-- 
2.38.1




[PULL 02/19] multifd: Create page_count fields into both MultiFD{Recv, Send}Params

2022-12-15 Thread Juan Quintela
We were recalculating it left and right.  We plan to change those
values in later patches.

Signed-off-by: Juan Quintela 
Reviewed-by: Leonardo Bras 
---
 migration/multifd.h | 4 ++++
 migration/multifd.c | 7 ++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/migration/multifd.h b/migration/multifd.h
index 86fb9982b3..e2802a9ce2 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -82,6 +82,8 @@ typedef struct {
 uint32_t packet_len;
 /* guest page size */
 uint32_t page_size;
+/* number of pages in a full packet */
+uint32_t page_count;
 /* multifd flags for sending ram */
 int write_flags;
 
@@ -147,6 +149,8 @@ typedef struct {
 uint32_t packet_len;
 /* guest page size */
 uint32_t page_size;
+/* number of pages in a full packet */
+uint32_t page_count;
 
 /* syncs main thread and channels */
 QemuSemaphore sem_sync;
diff --git a/migration/multifd.c b/migration/multifd.c
index efffa77a76..b8dc559d24 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -279,7 +279,6 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 {
 MultiFDPacket_t *packet = p->packet;
-uint32_t page_count = MULTIFD_PACKET_SIZE / p->page_size;
 RAMBlock *block;
 int i;
 
@@ -306,10 +305,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
  * If we received a packet that is 100 times bigger than expected
  * just stop migration.  It is a magic number.
  */
-if (packet->pages_alloc > page_count) {
+if (packet->pages_alloc > p->page_count) {
 error_setg(errp, "multifd: received packet "
"with size %u and expected a size of %u",
-   packet->pages_alloc, page_count) ;
+   packet->pages_alloc, p->page_count) ;
 return -1;
 }
 
@@ -944,6 +943,7 @@ int multifd_save_setup(Error **errp)
 p->iov = g_new0(struct iovec, page_count + 1);
 p->normal = g_new0(ram_addr_t, page_count);
 p->page_size = qemu_target_page_size();
+p->page_count = page_count;
 
 if (migrate_use_zero_copy_send()) {
 p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
@@ -1191,6 +1191,7 @@ int multifd_load_setup(Error **errp)
 p->name = g_strdup_printf("multifdrecv_%d", i);
 p->iov = g_new0(struct iovec, page_count);
 p->normal = g_new0(ram_addr_t, page_count);
+p->page_count = page_count;
 p->page_size = qemu_target_page_size();
 }
 
-- 
2.38.1




[PULL 05/19] migration: Take bitmap mutex when completing ram migration

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Any call to ram_find_and_save_block() needs to take the bitmap mutex.  We
used to not take it for most of ram_save_complete() because we thought
we were the only one left using the bitmap, but that's no longer true after
the preempt full patchset was applied, since the return path can take it too.

Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 8aad17c429..cc72c24c18 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3406,6 +3406,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 /* try transferring iterative blocks of memory */
 
 /* flush all remaining blocks regardless of rate limiting */
+qemu_mutex_lock(&rs->bitmap_mutex);
 while (true) {
 int pages;
 
@@ -3419,6 +3420,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 break;
 }
 }
+qemu_mutex_unlock(&rs->bitmap_mutex);
 
 flush_compressed_data(rs);
 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
-- 
2.38.1




[PULL 08/19] migration: Trivial cleanup save_page_header() on same block check

2022-12-15 Thread Juan Quintela
From: Peter Xu 

The 2nd check on RAM_SAVE_FLAG_CONTINUE is a bit redundant.  Use a boolean
to be clearer.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 7124ff531c..41475431fc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -661,14 +661,15 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, 
 RAMBlock *block,
ram_addr_t offset)
 {
 size_t size, len;
+bool same_block = (block == rs->last_sent_block);
 
-if (block == rs->last_sent_block) {
+if (same_block) {
 offset |= RAM_SAVE_FLAG_CONTINUE;
 }
 qemu_put_be64(f, offset);
 size = 8;
 
-if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
+if (!same_block) {
 len = strlen(block->idstr);
 qemu_put_byte(f, len);
 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
-- 
2.38.1




[PULL 09/19] migration: Remove RAMState.f references in compression code

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Remove references to RAMState.f in compress_page_with_multi_thread() and
flush_compressed_data().

The compression code isn't compatible with having more than one channel
(it currently wouldn't know which channel to flush the compressed data to),
so to keep it simple we always flush to the default to_dst_file port until
someone wants to add support for more ports, as rs->f can now really
change (after postcopy preempt was introduced).

There should be no functional change after this patch is applied: as long
as rs->f was referenced in the compression code, it must have been
to_dst_file.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 41475431fc..6e3dc845c5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1461,6 +1461,7 @@ static bool save_page_use_compression(RAMState *rs);
 
 static void flush_compressed_data(RAMState *rs)
 {
+MigrationState *ms = migrate_get_current();
 int idx, len, thread_count;
 
 if (!save_page_use_compression(rs)) {
@@ -1479,7 +1480,7 @@ static void flush_compressed_data(RAMState *rs)
 for (idx = 0; idx < thread_count; idx++) {
 qemu_mutex_lock(&comp_param[idx].mutex);
 if (!comp_param[idx].quit) {
-len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
 /*
  * it's safe to fetch zero_page without holding comp_done_lock
  * as there is no further request submitted to the thread,
@@ -1498,11 +1499,11 @@ static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 param->offset = offset;
 }
 
-static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
-   ram_addr_t offset)
+static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
 {
 int idx, thread_count, bytes_xmit = -1, pages = -1;
 bool wait = migrate_compress_wait_thread();
+MigrationState *ms = migrate_get_current();
 
 thread_count = migrate_compress_threads();
 qemu_mutex_lock(&comp_done_lock);
@@ -1510,7 +1511,8 @@ retry:
 for (idx = 0; idx < thread_count; idx++) {
 if (comp_param[idx].done) {
 comp_param[idx].done = false;
-bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
+comp_param[idx].file);
 qemu_mutex_lock(&comp_param[idx].mutex);
 set_compress_params(&comp_param[idx], block, offset);
 qemu_cond_signal(&comp_param[idx].cond);
@@ -2263,7 +2265,7 @@ static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 return false;
 }
 
-if (compress_page_with_multi_thread(rs, block, offset) > 0) {
+if (compress_page_with_multi_thread(block, offset) > 0) {
 return true;
 }
 
-- 
2.38.1




[PULL 00/19] Next 8.0 patches

2022-12-15 Thread Juan Quintela
The following changes since commit 5204b499a6cae4dfd9fe762d5e6e82224892383b:

  mailmap: Fix Stefan Weil author email (2022-12-13 15:56:57 -0500)

are available in the Git repository at:

  https://gitlab.com/juan.quintela/qemu.git tags/next-8.0-pull-request

for you to fetch changes up to 7f401b80445e8746202a6d643410ba1b9eeb3cb1:

  migration: Drop rs->f (2022-12-15 10:30:37 +0100)


Migration patches for 8.0

Hi

These are the patches that I had to drop from the last PULL request because
they weren't fixes:
- AVX2 is dropped; Intel posted a fix and I have to redo it
- The fix for out-of-order channels is out; Daniel nacked it and I need
  to redo it



Juan Quintela (4):
  multifd: Create page_size fields into both MultiFD{Recv,Send}Params
  multifd: Create page_count fields into both MultiFD{Recv,Send}Params
  migration: Export ram_transferred_add()
  migration: Export ram_release_page()

Peter Xu (15):
  migration: Take bitmap mutex when completing ram migration
  migration: Add postcopy_preempt_active()
  migration: Cleanup xbzrle zero page cache update logic
  migration: Trivial cleanup save_page_header() on same block check
  migration: Remove RAMState.f references in compression code
  migration: Yield bitmap_mutex properly when sending/sleeping
  migration: Use atomic ops properly for page accountings
  migration: Teach PSS about host page
  migration: Introduce pss_channel
  migration: Add pss_init()
  migration: Make PageSearchStatus part of RAMState
  migration: Move last_sent_block into PageSearchStatus
  migration: Send requested page directly in rp-return thread
  migration: Remove old preempt code around state maintenance
  migration: Drop rs->f

 migration/migration.h    |   7 -
 migration/multifd.h      |   8 +
 migration/ram.h          |  23 ++
 migration/migration.c    |  47 +--
 migration/multifd-zlib.c |  14 +-
 migration/multifd-zstd.c |  12 +-
 migration/multifd.c      |  27 +-
 migration/ram.c          | 735 ++-
 8 files changed, 422 insertions(+), 451 deletions(-)

-- 
2.38.1




[PULL 13/19] migration: Introduce pss_channel

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Introduce pss_channel for PageSearchStatus, define it as "the migration
channel to be used to transfer this host page".

We used to have rs->f, which is a mirror to MigrationState.to_dst_file.

After postcopy preempt initial version, rs->f can be dynamically changed
depending on which channel we want to use.

But that later work still doesn't grant full concurrency of sending pages
in e.g. different threads, because rs->f can either be the PRECOPY channel
or POSTCOPY channel.  This needs to be per-thread too.

PageSearchStatus is actually a good piece of struct which we can leverage
if we want to have multiple threads sending pages.  Sending a single guest
page may not make sense, so we make the granule to be "host page", and in
the PSS structure we allow specify a QEMUFile* to migrate a specific host
page.  Then we open the possibility to specify different channels in
different threads with different PSS structures.

The PSS prefix can be slightly misleading here because e.g. for the
upcoming usage of postcopy channel/thread it's not "searching" (or,
scanning) at all but sending the explicit page that was requested.  However,
since PSS has existed for some years, keep it as-is until someone complains.

This patch mostly (simply) replaces rs->f with pss->pss_channel. No
functional change is intended in this patch yet, but it does prepare to
finally drop rs->f and make ram_save_guest_page() thread safe.
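
To illustrate the direction, a reduced sketch (not the QEMU structs; only
the fields relevant to this patch are kept):

typedef struct QEMUFile QEMUFile;          /* opaque channel handle */

typedef struct PageSearchStatus {
    QEMUFile *pss_channel;   /* the channel this PSS writes pages to */
    unsigned long page;      /* current page to search from */
    /* ... */
} PageSearchStatus;

/*
 * With one PSS per sender thread, each thread writes through its own
 * pss->pss_channel instead of the shared rs->f, so a precopy sender and
 * a postcopy-preempt sender can run concurrently on different channels.
 */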

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 70 +++--
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4d7b50ef79..571d780987 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -453,6 +453,8 @@ void dirty_sync_missed_zero_copy(void)
 
 /* used by the search for pages to send */
 struct PageSearchStatus {
+/* The migration channel used for a specific host page */
+QEMUFile*pss_channel;
 /* Current block being searched */
 RAMBlock*block;
 /* Current page to search from */
@@ -775,9 +777,9 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
-ram_addr_t current_addr, RAMBlock *block,
-ram_addr_t offset)
+static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
+uint8_t **current_data, ram_addr_t current_addr,
+RAMBlock *block, ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
@@ -845,11 +847,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 }
 
 /* Send XBZRLE based compressed page */
-bytes_xbzrle = save_page_header(rs, rs->f, block,
+bytes_xbzrle = save_page_header(rs, file, block,
 offset | RAM_SAVE_FLAG_XBZRLE);
-qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
-qemu_put_be16(rs->f, encoded_len);
-qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
+qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
+qemu_put_be16(file, encoded_len);
+qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 bytes_xbzrle += encoded_len + 1 + 2;
 /*
  * Like compressed_size (please see update_compress_thread_counts),
@@ -1305,9 +1307,10 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
+static int save_zero_page(RAMState *rs, QEMUFile *file, RAMBlock *block,
+  ram_addr_t offset)
 {
-int len = save_zero_page_to_file(rs, rs->f, block, offset);
+int len = save_zero_page_to_file(rs, file, block, offset);
 
 if (len) {
 stat64_add(&ram_atomic_counters.duplicate, 1);
@@ -1324,15 +1327,15 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
  *
  * Return true if the pages has been saved, otherwise false is returned.
  */
-static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
-  int *pages)
+static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
+  ram_addr_t offset, int *pages)
 {
 uint64_t bytes_xmit = 0;
 int ret;
 
 *pages = -1;
-ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
-&bytes_xmit);
+ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
+TARGET_PAGE_SIZE, &bytes_xmit);
 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
return false;

[PULL 07/19] migration: Cleanup xbzrle zero page cache update logic

2022-12-15 Thread Juan Quintela
From: Peter Xu 

The major change is to replace "!save_page_use_compression()" with
"xbzrle_enabled" to make it clear.

Reasoning:

(1) When compression is enabled, "!save_page_use_compression()" is exactly
the same as checking "xbzrle_enabled".

(2) When compression is disabled, "!save_page_use_compression()" always
returns true.  We used to try calling the xbzrle code, but after this
change we won't, and we shouldn't need to.

While at it, drop the xbzrle_enabled check in xbzrle_cache_zero_page(),
because with this change it's no longer needed.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 00a2e30322..7124ff531c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -741,10 +741,6 @@ void mig_throttle_counter_reset(void)
  */
 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 {
-if (!rs->xbzrle_enabled) {
-return;
-}
-
 /* We don't care if this fails to allocate a new cache page
  * as long as it updated an old one */
 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
@@ -2301,7 +2297,7 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
  * page would be stale
  */
-if (!save_page_use_compression(rs)) {
+if (rs->xbzrle_enabled) {
 XBZRLE_cache_lock();
 xbzrle_cache_zero_page(rs, block->offset + offset);
 XBZRLE_cache_unlock();
-- 
2.38.1




[PULL 11/19] migration: Use atomic ops properly for page accountings

2022-12-15 Thread Juan Quintela
From: Peter Xu 

To prepare for thread-safety of the page accounting, at least the counters
below need to be accessed only atomically; they are:

ram_counters.transferred
ram_counters.duplicate
ram_counters.normal
ram_counters.postcopy_bytes

There are a lot of other counters, but they won't be accessed outside the
migration thread, so they're still safe to access without atomic
ops.
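
A minimal sketch of the pattern, with C11 atomics standing in for QEMU's
Stat64 (the real type in qemu/stats64.h also copes with hosts that lack
64-bit atomics):

#include <stdatomic.h>
#include <stdint.h>

typedef struct { _Atomic uint64_t value; } Stat64;   /* stand-in */

static inline void stat64_add(Stat64 *s, uint64_t v)
{
    atomic_fetch_add_explicit(&s->value, v, memory_order_relaxed);
}

static inline uint64_t stat64_get(Stat64 *s)
{
    return atomic_load_explicit(&s->value, memory_order_relaxed);
}

/* Any sender thread can account safely:
 *     stat64_add(&ram_atomic_counters.transferred, bytes);
 * and the QMP query side reads a consistent value:
 *     info->ram->transferred = stat64_get(&ram_atomic_counters.transferred);
 */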

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.h   | 20 
 migration/migration.c | 10 +-
 migration/multifd.c   |  4 ++--
 migration/ram.c   | 40 
 4 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/migration/ram.h b/migration/ram.h
index 038d52f49f..81cbb0947c 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -32,7 +32,27 @@
 #include "qapi/qapi-types-migration.h"
 #include "exec/cpu-common.h"
 #include "io/channel.h"
+#include "qemu/stats64.h"
 
+/*
+ * These are the migration statistic counters that need to be updated using
+ * atomic ops (can be accessed by more than one thread).  Here since we
+ * cannot modify MigrationStats directly to use Stat64 as it was defined in
+ * the QAPI scheme, we define an internal structure to hold them, and we
+ * propagate the real values when QMP queries happen.
+ *
+ * IOW, the corresponding fields within ram_counters on these specific
+ * fields will be always zero and not being used at all; they're just
+ * placeholders to make it QAPI-compatible.
+ */
+typedef struct {
+Stat64 transferred;
+Stat64 duplicate;
+Stat64 normal;
+Stat64 postcopy_bytes;
+} MigrationAtomicStats;
+
+extern MigrationAtomicStats ram_atomic_counters;
 extern MigrationStats ram_counters;
 extern XBZRLECacheStats xbzrle_counters;
 extern CompressionStats compression_counters;
diff --git a/migration/migration.c b/migration/migration.c
index f485eea5fb..de83c50f51 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1049,13 +1049,13 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s)
 
 info->has_ram = true;
 info->ram = g_malloc0(sizeof(*info->ram));
-info->ram->transferred = ram_counters.transferred;
+info->ram->transferred = stat64_get(&ram_atomic_counters.transferred);
 info->ram->total = ram_bytes_total();
-info->ram->duplicate = ram_counters.duplicate;
+info->ram->duplicate = stat64_get(&ram_atomic_counters.duplicate);
 /* legacy value.  It is not used anymore */
 info->ram->skipped = 0;
-info->ram->normal = ram_counters.normal;
-info->ram->normal_bytes = ram_counters.normal * page_size;
+info->ram->normal = stat64_get(&ram_atomic_counters.normal);
+info->ram->normal_bytes = info->ram->normal * page_size;
 info->ram->mbps = s->mbps;
 info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
 info->ram->dirty_sync_missed_zero_copy =
@@ -1066,7 +1066,7 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s)
 info->ram->pages_per_second = s->pages_per_second;
 info->ram->precopy_bytes = ram_counters.precopy_bytes;
 info->ram->downtime_bytes = ram_counters.downtime_bytes;
-info->ram->postcopy_bytes = ram_counters.postcopy_bytes;
+info->ram->postcopy_bytes =
+    stat64_get(&ram_atomic_counters.postcopy_bytes);
 
 if (migrate_use_xbzrle()) {
 info->has_xbzrle_cache = true;
diff --git a/migration/multifd.c b/migration/multifd.c
index b8dc559d24..000ca4d4ec 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -432,7 +432,7 @@ static int multifd_send_pages(QEMUFile *f)
 transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len;
 qemu_file_acct_rate_limit(f, transferred);
 ram_counters.multifd_bytes += transferred;
-ram_counters.transferred += transferred;
+stat64_add(&ram_atomic_counters.transferred, transferred);
 qemu_mutex_unlock(&p->mutex);
 qemu_sem_post(&p->sem);
 
@@ -624,7 +624,7 @@ int multifd_send_sync_main(QEMUFile *f)
 p->pending_job++;
 qemu_file_acct_rate_limit(f, p->packet_len);
 ram_counters.multifd_bytes += p->packet_len;
-ram_counters.transferred += p->packet_len;
+stat64_add(&ram_atomic_counters.transferred, p->packet_len);
 qemu_mutex_unlock(&p->mutex);
 qemu_sem_post(&p->sem);
 
diff --git a/migration/ram.c b/migration/ram.c
index 5379164749..f4cd9038f4 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -425,18 +425,25 @@ uint64_t ram_bytes_remaining(void)
0;
 }
 
+/*
+ * NOTE: not all stats in ram_counters are used in reality.  See comments
+ * for struct MigrationAtomicStats.  The ultimate result of ram migration
+ * counters will be a merged version with both ram_counters and the atomic
+ * fields in ram_atomic_counters.
+ */
 MigrationStats ram_counters;
+MigrationAtomicStats ram_atomic_counters;

[PULL 19/19] migration: Drop rs->f

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Now with rs->pss we can already cache channels in pss->pss_channel.  That
pss_channel contains more information than rs->f because it's per-channel.
So rs->f could be replaced by rs->pss[RAM_CHANNEL_PRECOPY].pss_channel,
while rs->f itself is a bit vague now.

Note that vanilla postcopy still send pages via pss[RAM_CHANNEL_PRECOPY],
that's slightly confusing but it reflects the reality.

Then, after the replacement we can safely drop rs->f.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 1ae093fb61..334309f1c6 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -323,8 +323,6 @@ struct RAMSrcPageRequest {
 
 /* State of RAM for migration */
 struct RAMState {
-/* QEMUFile used for this migration */
-QEMUFile *f;
 /*
  * PageSearchStatus structures for the channels when send pages.
  * Protected by the bitmap_mutex.
@@ -2532,8 +2530,6 @@ static int ram_find_and_save_block(RAMState *rs)
 }
 
 if (found) {
-/* Cache rs->f in pss_channel (TODO: remove rs->f) */
-pss->pss_channel = rs->f;
 pages = ram_save_host_page(rs, pss);
 }
 } while (!pages && again);
@@ -3089,7 +3085,7 @@ static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
 ram_state_reset(rs);
 
 /* Update RAMState cache of output QEMUFile */
-rs->f = out;
+rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
 
 trace_ram_state_resume_prepare(pages);
 }
@@ -3180,7 +3176,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 return -1;
 }
 }
-(*rsp)->f = f;
+(*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
 
 WITH_RCU_READ_LOCK_GUARD() {
 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
@@ -3315,7 +3311,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 out:
 if (ret >= 0
 && migration_is_setup_or_active(migrate_get_current()->state)) {
-ret = multifd_send_sync_main(rs->f);
+ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
 if (ret < 0) {
 return ret;
 }
@@ -3385,7 +3381,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 return ret;
 }
 
-ret = multifd_send_sync_main(rs->f);
+ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
 if (ret < 0) {
 return ret;
 }
-- 
2.38.1




[PULL 12/19] migration: Teach PSS about host page

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Migration code has a lot to do with host pages.  Teaching PSS core about
the idea of host page helps a lot and makes the code clean.  Meanwhile,
this prepares for the future changes that can leverage the new PSS helpers
that this patch introduces to send host page in another thread.

Three more fields are introduced for this:

  (1) host_page_sending: this is set to true when QEMU is sending a host
  page, false otherwise.

  (2) host_page_{start|end}: these point to the start/end of host page
  we're sending, and it's only valid when host_page_sending==true.

For example, when we look up the next dirty page on the ramblock, with
host_page_sending==true we'll not try to look for anything beyond the
current host page boundary.  This can be slightly more efficient than the
current code, because currently we'll set pss->page to the next dirty bit
(which can be beyond the current host page boundary) and reset it to the
host page boundary when we find it goes beyond.

With the above, we can easily make migration_bitmap_find_dirty() self-
contained by updating pss->page properly.  The rs* parameter is removed
because it's not even used in the old code.

When sending a host page, we should use the pss helpers like this:

  - pss_host_page_prepare(pss): called before sending host page
  - pss_within_range(pss): whether we're still working on the current host page?
  - pss_host_page_finish(pss): called after sending a host page

Then we can use ram_save_target_page() to save one small page.

Currently ram_save_host_page() is still the only user. If there'll be
another function to send host page (e.g. in return path thread) in the
future, it should follow the same style.
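
Schematically, the calling convention reads like this (a sketch assembled
from the description above, not the exact loop from the patch):

/* Send every dirty small page inside the current host page. */
static int send_host_page_sketch(RAMState *rs, PageSearchStatus *pss)
{
    int pages = 0;

    pss_host_page_prepare(pss);          /* fix host_page_{start,end} */

    while (pss_within_range(pss)) {      /* still inside this host page? */
        pages += ram_save_target_page(rs, pss);  /* send one small page */
        pss_find_next_dirty(pss);        /* never crosses the boundary */
    }

    pss_host_page_finish(pss);           /* clear the host-page state */
    return pages;
}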

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 95 +++--
 1 file changed, 76 insertions(+), 19 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index f4cd9038f4..4d7b50ef79 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -481,6 +481,11 @@ struct PageSearchStatus {
  * postcopy pages via postcopy preempt channel.
  */
 bool postcopy_target_channel;
+/* Whether we're sending a host page */
+bool  host_page_sending;
+/* The start/end of current host page.  Only valid if host_page_sending==true */
+unsigned long host_page_start;
+unsigned long host_page_end;
 };
 typedef struct PageSearchStatus PageSearchStatus;
 
@@ -858,26 +863,38 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 }
 
 /**
- * migration_bitmap_find_dirty: find the next dirty page from start
+ * pss_find_next_dirty: find the next dirty page of current ramblock
  *
- * Returns the page offset within memory region of the start of a dirty page
+ * This function updates pss->page to point to the next dirty page index
+ * within the ramblock to migrate, or the end of ramblock when nothing
+ * found.  Note that when pss->host_page_sending==true it means we're
+ * during sending a host page, so we won't look for dirty page that is
+ * outside the host page boundary.
  *
- * @rs: current RAM state
- * @rb: RAMBlock where to search for dirty pages
- * @start: page where we start the search
+ * @pss: the current page search status
  */
-static inline
-unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
-  unsigned long start)
+static void pss_find_next_dirty(PageSearchStatus *pss)
 {
+RAMBlock *rb = pss->block;
 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 unsigned long *bitmap = rb->bmap;
 
 if (ramblock_is_ignored(rb)) {
-return size;
+/* Points directly to the end, so we know no dirty page */
+pss->page = size;
+return;
 }
 
-return find_next_bit(bitmap, size, start);
+/*
+ * If during sending a host page, only look for dirty pages within the
+ * current host page being send.
+ */
+if (pss->host_page_sending) {
+assert(pss->host_page_end);
+size = MIN(size, pss->host_page_end);
+}
+
+pss->page = find_next_bit(bitmap, size, pss->page);
 }
 
 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
@@ -1563,7 +1580,9 @@ static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
 pss->postcopy_requested = false;
 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
 
-pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
+/* Update pss->page for the next dirty bit in ramblock */
+pss_find_next_dirty(pss);
+
 if (pss->complete_round && pss->block == rs->last_seen_block &&
 pss->page >= rs->last_page) {
 /*
@@ -2452,6 +2471,44 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
 }
 }
 
+/* Should be called before sending a host page */
+static void pss_host_page_prepare(PageSearchStatus *pss)
+{
+/* How many guest pages are there in a host page? */

[PULL 18/19] migration: Remove old preempt code around state maintenance

2022-12-15 Thread Juan Quintela
From: Peter Xu 

With the new code to send pages in the rp-return thread, there's little
point in keeping lots of the old code for maintaining the preempt state in
the migration thread, because the new way should always be faster.

Then, if we'll always send pages in the rp-return thread anyway, we don't
need that logic to maintain preempt state anymore, because now we serialize
things using the mutex directly instead of using those fields.

It's very unfortunate to have that code for a short period, but that's
still one intermediate step where we noticed the next bottleneck on the
migration thread.  Now the best we can do is to drop unnecessary code as
long as the new code is stable, to reduce the burden.  It's actually a good
thing because the new "sending page in rp-return thread" model is (IMHO)
even cleaner and has better performance.

Remove the old code that was responsible for maintaining preempt states; at
the same time also remove the x-postcopy-preempt-break-huge parameter,
because with concurrent sender threads we don't really need to break huge
pages anymore.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/migration.h |   7 -
 migration/migration.c |   2 -
 migration/ram.c   | 291 +-
 3 files changed, 3 insertions(+), 297 deletions(-)

diff --git a/migration/migration.h b/migration/migration.h
index cdad8aceaa..ae4ffd3454 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -340,13 +340,6 @@ struct MigrationState {
 bool send_configuration;
 /* Whether we send section footer during migration */
 bool send_section_footer;
-/*
- * Whether we allow break sending huge pages when postcopy preempt is
- * enabled.  When disabled, we won't interrupt precopy within sending a
- * host huge page, which is the old behavior of vanilla postcopy.
- * NOTE: this parameter is ignored if postcopy preempt is not enabled.
- */
-bool postcopy_preempt_break_huge;
 
 /* Needed by postcopy-pause state */
 QemuSemaphore postcopy_pause_sem;
diff --git a/migration/migration.c b/migration/migration.c
index c1d4d76d0c..c3490c495d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -4402,8 +4402,6 @@ static Property migration_properties[] = {
 DEFINE_PROP_SIZE("announce-step", MigrationState,
   parameters.announce_step,
   DEFAULT_MIGRATE_ANNOUNCE_STEP),
-DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
-  postcopy_preempt_break_huge, true),
 DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
 DEFINE_PROP_STRING("tls-hostname", MigrationState, 
parameters.tls_hostname),
 DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
diff --git a/migration/ram.c b/migration/ram.c
index 16ade7cb70..1ae093fb61 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -97,28 +97,6 @@ struct PageSearchStatus {
 unsigned long page;
 /* Set once we wrap around */
 bool complete_round;
-/*
- * [POSTCOPY-ONLY] Whether current page is explicitly requested by
- * postcopy.  When set, the request is "urgent" because the dest QEMU
- * threads are waiting for us.
- */
-bool postcopy_requested;
-/*
- * [POSTCOPY-ONLY] The target channel to use to send current page.
- *
- * Note: This may _not_ match with the value in postcopy_requested
- * above. Let's imagine the case where the postcopy request is exactly
- * the page that we're sending in progress during precopy. In this case
- * we'll have postcopy_requested set to true but the target channel
- * will be the precopy channel (so that we don't split brain on that
- * specific page since the precopy channel already contains partial of
- * that page data).
- *
- * Besides that specific use case, postcopy_target_channel should
- * always be equal to postcopy_requested, because by default we send
- * postcopy pages via postcopy preempt channel.
- */
-bool postcopy_target_channel;
 /* Whether we're sending a host page */
 bool  host_page_sending;
 /* The start/end of current host page.  Invalid if host_page_sending==false */
@@ -343,20 +321,6 @@ struct RAMSrcPageRequest {
 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 };
 
-typedef struct {
-/*
- * Cached ramblock/offset values if preempted.  They're only meaningful if
- * preempted==true below.
- */
-RAMBlock *ram_block;
-unsigned long ram_page;
-/*
- * Whether a postcopy preemption just happened.  Will be reset after
- * precopy recovered to background migration.
- */
-bool preempted;
-} PostcopyPreemptState;
-
 /* State of RAM for migration */
 struct RAMState {
 /* QEMUFile used for this migration */
@@ -419,14 +383,6 @@ struct RAMState {
 /*

[PULL 15/19] migration: Make PageSearchStatus part of RAMState

2022-12-15 Thread Juan Quintela
From: Peter Xu 

We used to allocate the PSS structure on the stack for precopy when sending
pages.  Make it static, so as to describe per-channel ram migration status.

Here we declared RAM_CHANNEL_MAX instances, preparing for postcopy to use
it, even though this patch does not yet start using the 2nd instance.

This should not have any functional change per se, but it already starts to
export PSS information via the RAMState, so that e.g. one PSS channel can
start to reference the other PSS channel.

Always protect PSS access using the same RAMState.bitmap_mutex.  We already
do so, so no code change is needed, just some comment updates.  Maybe we
should consider renaming bitmap_mutex some day, as it's becoming a bigger
and more commonly used mutex for ram state, but just leave that for later.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 112 ++--
 1 file changed, 61 insertions(+), 51 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index d81bf7b183..3194997738 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -85,6 +85,46 @@
 
 XBZRLECacheStats xbzrle_counters;
 
+/* used by the search for pages to send */
+struct PageSearchStatus {
+/* The migration channel used for a specific host page */
+QEMUFile*pss_channel;
+/* Current block being searched */
+RAMBlock*block;
+/* Current page to search from */
+unsigned long page;
+/* Set once we wrap around */
+bool complete_round;
+/*
+ * [POSTCOPY-ONLY] Whether current page is explicitly requested by
+ * postcopy.  When set, the request is "urgent" because the dest QEMU
+ * threads are waiting for us.
+ */
+bool postcopy_requested;
+/*
+ * [POSTCOPY-ONLY] The target channel to use to send current page.
+ *
+ * Note: This may _not_ match with the value in postcopy_requested
+ * above. Let's imagine the case where the postcopy request is exactly
+ * the page that we're sending in progress during precopy. In this case
+ * we'll have postcopy_requested set to true but the target channel
+ * will be the precopy channel (so that we don't split brain on that
+ * specific page since the precopy channel already contains partial of
+ * that page data).
+ *
+ * Besides that specific use case, postcopy_target_channel should
+ * always be equal to postcopy_requested, because by default we send
+ * postcopy pages via postcopy preempt channel.
+ */
+bool postcopy_target_channel;
+/* Whether we're sending a host page */
+bool  host_page_sending;
+/* The start/end of current host page.  Invalid if host_page_sending==false */
+unsigned long host_page_start;
+unsigned long host_page_end;
+};
+typedef struct PageSearchStatus PageSearchStatus;
+
 /* struct contains XBZRLE cache and a static page
used by the compression */
 static struct {
@@ -319,6 +359,11 @@ typedef struct {
 struct RAMState {
 /* QEMUFile used for this migration */
 QEMUFile *f;
+/*
+ * PageSearchStatus structures for the channels when send pages.
+ * Protected by the bitmap_mutex.
+ */
+PageSearchStatus pss[RAM_CHANNEL_MAX];
 /* UFFD file descriptor, used in 'write-tracking' migration */
 int uffdio_fd;
 /* Last block that we have visited searching for dirty pages */
@@ -362,7 +407,12 @@ struct RAMState {
 uint64_t target_page_count;
 /* number of dirty bits in the bitmap */
 uint64_t migration_dirty_pages;
-/* Protects modification of the bitmap and migration dirty pages */
+/*
+ * Protects:
+ * - dirty/clear bitmap
+ * - migration_dirty_pages
+ * - pss structures
+ */
 QemuMutex bitmap_mutex;
 /* The RAMBlock used in the last src_page_requests */
 RAMBlock *last_req_rb;
@@ -451,46 +501,6 @@ void dirty_sync_missed_zero_copy(void)
 ram_counters.dirty_sync_missed_zero_copy++;
 }
 
-/* used by the search for pages to send */
-struct PageSearchStatus {
-/* The migration channel used for a specific host page */
-QEMUFile*pss_channel;
-/* Current block being searched */
-RAMBlock*block;
-/* Current page to search from */
-unsigned long page;
-/* Set once we wrap around */
-bool complete_round;
-/*
- * [POSTCOPY-ONLY] Whether current page is explicitly requested by
- * postcopy.  When set, the request is "urgent" because the dest QEMU
- * threads are waiting for us.
- */
-bool postcopy_requested;
-/*
- * [POSTCOPY-ONLY] The target channel to use to send current page.
- *
- * Note: This may _not_ match with the value in postcopy_requested
- * above. Let's imagine the case where the postcopy request is exactly
- * the page that we're sending in progress during precopy. In this case
- * we'll have postcopy_requested set to true but the target channel

[PULL 01/19] multifd: Create page_size fields into both MultiFD{Recv, Send}Params

2022-12-15 Thread Juan Quintela
We were calling qemu_target_page_size() left and right.

Signed-off-by: Juan Quintela 
Reviewed-by: Leonardo Bras 
---
 migration/multifd.h      |  4 ++++
 migration/multifd-zlib.c | 14 ++++++--
 migration/multifd-zstd.c | 12 +++---
 migration/multifd.c      | 18 ++++----
 4 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/migration/multifd.h b/migration/multifd.h
index 519f498643..86fb9982b3 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -80,6 +80,8 @@ typedef struct {
 bool registered_yank;
 /* packet allocated len */
 uint32_t packet_len;
+/* guest page size */
+uint32_t page_size;
 /* multifd flags for sending ram */
 int write_flags;
 
@@ -143,6 +145,8 @@ typedef struct {
 QIOChannel *c;
 /* packet allocated len */
 uint32_t packet_len;
+/* guest page size */
+uint32_t page_size;
 
 /* syncs main thread and channels */
 QemuSemaphore sem_sync;
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 18213a9513..37770248e1 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -116,7 +116,6 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error 
**errp)
 static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
 {
 struct zlib_data *z = p->data;
-size_t page_size = qemu_target_page_size();
 z_stream *zs = &z->zs;
 uint32_t out_size = 0;
 int ret;
@@ -135,8 +134,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error 
**errp)
  * with compression. zlib does not guarantee that this is safe,
  * therefore copy the page before calling deflate().
  */
-memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
-zs->avail_in = page_size;
+memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size);
+zs->avail_in = p->page_size;
 zs->next_in = z->buf;
 
 zs->avail_out = available;
@@ -242,12 +241,11 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p)
 static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
 {
 struct zlib_data *z = p->data;
-size_t page_size = qemu_target_page_size();
 z_stream *zs = &z->zs;
 uint32_t in_size = p->next_packet_size;
 /* we measure the change of total_out */
 uint32_t out_size = zs->total_out;
-uint32_t expected_size = p->normal_num * page_size;
+uint32_t expected_size = p->normal_num * p->page_size;
 uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
 int ret;
 int i;
@@ -274,7 +272,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error 
**errp)
 flush = Z_SYNC_FLUSH;
 }
 
-zs->avail_out = page_size;
+zs->avail_out = p->page_size;
 zs->next_out = p->host + p->normal[i];
 
 /*
@@ -288,8 +286,8 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error 
**errp)
 do {
 ret = inflate(zs, flush);
 } while (ret == Z_OK && zs->avail_in
- && (zs->total_out - start) < page_size);
-if (ret == Z_OK && (zs->total_out - start) < page_size) {
+ && (zs->total_out - start) < p->page_size);
+if (ret == Z_OK && (zs->total_out - start) < p->page_size) {
 error_setg(errp, "multifd %u: inflate generated too few output",
p->id);
 return -1;
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index d788d309f2..f4a8e1ed1f 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -113,7 +113,6 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error 
**errp)
 static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
 {
 struct zstd_data *z = p->data;
-size_t page_size = qemu_target_page_size();
 int ret;
 uint32_t i;
 
@@ -128,7 +127,7 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error 
**errp)
 flush = ZSTD_e_flush;
 }
 z->in.src = p->pages->block->host + p->normal[i];
-z->in.size = page_size;
+z->in.size = p->page_size;
 z->in.pos = 0;
 
 /*
@@ -241,8 +240,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error 
**errp)
 {
 uint32_t in_size = p->next_packet_size;
 uint32_t out_size = 0;
-size_t page_size = qemu_target_page_size();
-uint32_t expected_size = p->normal_num * page_size;
+uint32_t expected_size = p->normal_num * p->page_size;
 uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
 struct zstd_data *z = p->data;
 int ret;
@@ -265,7 +263,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
 
 for (i = 0; i < p->normal_num; i++) {
 z->out.dst = p->host + p->normal[i];
-z->out.size = page_size;
+z->out.size = p->page_size;
 z->out.pos = 0;
 
 /*
@@ -279,8 +277,8 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
 do {

Re: [PULL 00/51] Block layer patches

2022-12-15 Thread Kevin Wolf
Am 14.12.2022 um 23:35 hat Peter Maydell geschrieben:
> On Wed, 14 Dec 2022 at 13:45, Kevin Wolf  wrote:
> >
> > The following changes since commit 5204b499a6cae4dfd9fe762d5e6e82224892383b:
> >
> >   mailmap: Fix Stefan Weil author email (2022-12-13 15:56:57 -0500)
> >
> > are available in the Git repository at:
> >
> >   https://repo.or.cz/qemu/kevin.git tags/for-upstream
> >
> > for you to fetch changes up to 2ad19e5dc950d4b340894846b9e71c0b20f9a1cc:
> >
> >   block: GRAPH_RDLOCK for functions only called by co_wrappers (2022-12-14 
> > 13:13:07 +0100)
> >
> > 
> > Block layer patches
> >
> > - Code cleanups around block graph modification
> > - Simplify drain
> > - coroutine_fn correctness fixes, including splitting generated
> >   coroutine wrappers into co_wrapper (to be called only from
> >   non-coroutine context) and co_wrapper_mixed (both coroutine and
> >   non-coroutine context)
> > - Introduce a block graph rwlock
> >
> > 
> 
> Fails to build on the tsan-build job:
> https://gitlab.com/qemu-project/qemu/-/jobs/3476176683
> 
> In file included from ../hw/nvram/fw_cfg-interface.c:10:
> In file included from /builds/qemu-project/qemu/include/hw/nvram/fw_cfg.h:7:
> In file included from /builds/qemu-project/qemu/include/sysemu/dma.h:15:
> In file included from /builds/qemu-project/qemu/include/block/block.h:27:
> In file included from /builds/qemu-project/qemu/include/block/block-global-state.h:27:
> In file included from /builds/qemu-project/qemu/include/block/block-common.h:27:
> In file included from /builds/qemu-project/qemu/include/block/aio.h:25:
> /builds/qemu-project/qemu/include/block/graph-lock.h:62:31: error:
> invalid capability name 'graph-lock'; capability name must be 'mutex'
> or 'role' [-Werror,-Wthread-safety-attributes]
> typedef struct TSA_CAPABILITY("graph-lock") BdrvGraphLock {
>^
> 
> (I see the same error on my x86 macos system.)

Ah, surprise: clang 11 lifted this arbitrary restriction on capability
names, and the fact that it existed in older compiler versions isn't
documented (any more?).

We can either just name it "mutex" and live with slightly misleading error
messages (it's semantically not a mutex, but an rwlock), or add a
configure check and leave TSA disabled if it doesn't work. I think I'll
try the former for now; "mutex 'graph_lock'" should still be good enough
to know what it means.
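
For illustration, a minimal sketch of what that first option looks like,
assuming TSA_CAPABILITY expands to clang's __attribute__((capability(...)))
as the error message above suggests:

```
/*
 * Minimal sketch of option one: keep the rwlock semantics but use the
 * capability name "mutex", the only name (besides "role") that clang < 11
 * accepts. Diagnostics then read "mutex 'graph_lock'".
 */
typedef struct TSA_CAPABILITY("mutex") BdrvGraphLock {
} BdrvGraphLock;
```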

Kevin




[PULL 14/19] migration: Add pss_init()

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Helper to init PSS structures.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 571d780987..d81bf7b183 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -542,6 +542,14 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
  bool postcopy_requested);
 
+/* NOTE: page is the PFN not real ram_addr_t. */
+static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
+{
+pss->block = rb;
+pss->page = page;
+pss->complete_round = false;
+}
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2650,9 +2658,7 @@ static int ram_find_and_save_block(RAMState *rs)
 rs->last_page = 0;
 }
 
-pss.block = rs->last_seen_block;
-pss.page = rs->last_page;
-pss.complete_round = false;
+pss_init(&pss, rs->last_seen_block, rs->last_page);
 
 do {
 again = true;
-- 
2.38.1




[PULL 10/19] migration: Yield bitmap_mutex properly when sending/sleeping

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Don't take the bitmap mutex when sending pages, or when being throttled by
migration_rate_limit() (which is a bit tricky to call here in the ram code,
but it still seems helpful).

This prepares for the possibility of sending pages concurrently in more
than one thread via ram_save_host_page(): all threads may need the
bitmap_mutex to operate on the bitmaps, so a sendmsg() or any kind of
qemu_sem_wait() blocking one thread must not block the others from
progressing.
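
Distilled, the change below is the classic pattern of dropping a shared
lock around a blocking call; a sketch of the preempt-active case:

```
/* Yield the bitmap mutex across the potentially blocking send, so the
 * rp-return thread can operate on the bitmaps in the meantime. */
qemu_mutex_unlock(&rs->bitmap_mutex);
tmppages = ram_save_target_page(rs, pss);   /* may block in sendmsg() */
qemu_mutex_lock(&rs->bitmap_mutex);
```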

Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 46 +++---
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 6e3dc845c5..5379164749 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2452,9 +2452,14 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
  * a host page in which case the remainder of the hostpage is sent.
  * Only dirty target pages are sent. Note that the host page size may
  * be a huge page for this block.
+ *
  * The saving stops at the boundary of the used_length of the block
  * if the RAMBlock isn't a multiple of the host page size.
  *
+ * The caller must hold ram_state.bitmap_mutex when calling this
+ * function.  Note that this function can temporarily release the lock, but
+ * it will make sure the lock is held again before it returns.
+ *
  * Returns the number of pages written or negative on error
  *
  * @rs: current RAM state
@@ -2462,6 +2467,7 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
  */
 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
 {
+bool page_dirty, preempt_active = postcopy_preempt_active();
 int tmppages, pages = 0;
 size_t pagesize_bits =
 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
@@ -2485,22 +2491,40 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
 break;
 }
 
+page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
+
 /* Check the pages is dirty and if it is send it */
-if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
+if (page_dirty) {
+/*
+ * Properly yield the lock only in postcopy preempt mode
+ * because both migration thread and rp-return thread can
+ * operate on the bitmaps.
+ */
+if (preempt_active) {
+qemu_mutex_unlock(&rs->bitmap_mutex);
+}
 tmppages = ram_save_target_page(rs, pss);
-if (tmppages < 0) {
-return tmppages;
+if (tmppages >= 0) {
+pages += tmppages;
+/*
+ * Allow rate limiting to happen in the middle of huge pages if
+ * something is sent in the current iteration.
+ */
+if (pagesize_bits > 1 && tmppages > 0) {
+migration_rate_limit();
+}
 }
-
-pages += tmppages;
-/*
- * Allow rate limiting to happen in the middle of huge pages if
- * something is sent in the current iteration.
- */
-if (pagesize_bits > 1 && tmppages > 0) {
-migration_rate_limit();
+if (preempt_active) {
+qemu_mutex_lock(&rs->bitmap_mutex);
 }
+} else {
+tmppages = 0;
+}
+
+if (tmppages < 0) {
+return tmppages;
 }
+
 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
 } while ((pss->page < hostpage_boundary) &&
  offset_in_ramblock(pss->block,
-- 
2.38.1




[PULL 16/19] migration: Move last_sent_block into PageSearchStatus

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Since we use PageSearchStatus to represent a channel, it makes perfect
sense to keep last_sent_block (and with it the RAM_SAVE_FLAG_CONTINUE
optimization) per-channel rather than global, because each channel can be
sending pages from different ramblocks.

Hence move it from RAMState into PageSearchStatus.
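
For context, the RAM_SAVE_FLAG_CONTINUE optimization that becomes
per-channel boils down to this check in save_page_header() (a sketch; see
the diff below):

```
/* If this channel's previous page came from the same ramblock, send only
 * the offset and skip the block id string. */
if (block == pss->last_sent_block) {
    offset |= RAM_SAVE_FLAG_CONTINUE;
}
```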

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 71 -
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 3194997738..1233ff53ac 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -89,6 +89,8 @@ XBZRLECacheStats xbzrle_counters;
 struct PageSearchStatus {
 /* The migration channel used for a specific host page */
 QEMUFile*pss_channel;
+/* Last block from where we have sent data */
+RAMBlock *last_sent_block;
 /* Current block being searched */
 RAMBlock*block;
 /* Current page to search from */
@@ -368,8 +370,6 @@ struct RAMState {
 int uffdio_fd;
 /* Last block that we have visited searching for dirty pages */
 RAMBlock *last_seen_block;
-/* Last block from where we have sent data */
-RAMBlock *last_sent_block;
 /* Last dirty target page we have sent */
 ram_addr_t last_page;
 /* last ram version we have seen */
@@ -684,16 +684,17 @@ exit:
  *
  * Returns the number of bytes written
  *
- * @f: QEMUFile where to send the data
+ * @pss: current PSS channel status
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  *  in the lower bits, it contains flags
  */
-static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
+static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
ram_addr_t offset)
 {
 size_t size, len;
-bool same_block = (block == rs->last_sent_block);
+bool same_block = (block == pss->last_sent_block);
+QEMUFile *f = pss->pss_channel;
 
 if (same_block) {
 offset |= RAM_SAVE_FLAG_CONTINUE;
@@ -706,7 +707,7 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 qemu_put_byte(f, len);
 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 size += 1 + len;
-rs->last_sent_block = block;
+pss->last_sent_block = block;
 }
 return size;
 }
@@ -790,17 +791,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
  *  -1 means that xbzrle would be longer than normal
  *
  * @rs: current RAM state
+ * @pss: current PSS channel
  * @current_data: pointer to the address of the page contents
  * @current_addr: addr of the page
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
+static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 uint8_t **current_data, ram_addr_t current_addr,
 RAMBlock *block, ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
+QEMUFile *file = pss->pss_channel;
 
 if (!cache_is_cached(XBZRLE.cache, current_addr,
  ram_counters.dirty_sync_count)) {
@@ -865,7 +868,7 @@ static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
 }
 
 /* Send XBZRLE based compressed page */
-bytes_xbzrle = save_page_header(rs, file, block,
+bytes_xbzrle = save_page_header(pss, block,
 offset | RAM_SAVE_FLAG_XBZRLE);
 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 qemu_put_be16(file, encoded_len);
@@ -1296,19 +1299,19 @@ void ram_release_page(const char *rbname, uint64_t offset)
  * Returns the size of data written to the file, 0 means the page is not
  * a zero page
  *
- * @rs: current RAM state
- * @file: the file where the data is saved
+ * @pss: current PSS channel
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
+static int save_zero_page_to_file(PageSearchStatus *pss,
   RAMBlock *block, ram_addr_t offset)
 {
 uint8_t *p = block->host + offset;
+QEMUFile *file = pss->pss_channel;
 int len = 0;
 
 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
-len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
+len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
 qemu_put_byte(file, 0);
 len += 1;
 ram_release_page(block->idstr, offset);
@@ -1321,14 +1324,14 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
  *
  * Returns the number of pages written.
  *
- * @rs: current RAM state
+ * @pss: current PSS channel
  * @block

Re: [PATCH 1/5] io: Add support for MSG_PEEK for socket channel

2022-12-15 Thread Daniel P . Berrangé
On Wed, Dec 14, 2022 at 04:30:48PM -0500, Peter Xu wrote:
> On Wed, Dec 14, 2022 at 09:14:09AM +, Daniel P. Berrangé wrote:
> > On Tue, Dec 13, 2022 at 04:38:46PM -0500, Peter Xu wrote:
> > > From: "manish.mishra" 
> > > 
> > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > unread and the next read shall still return this data. This
> > > support is currently added only for socket class. Extra parameter
> > > 'flags' is added to io_readv calls to pass extra read flags like
> > > MSG_PEEK.
> > > 
> > > Reviewed-by: Daniel P. Berrang??  > > Suggested-by: Daniel P. Berrang??  > 
> > The last letter of my name has been mangled - whatever tools used
> > to pull in manish's patches seem to not be UTF-8 clean.
> > 
> > Also the email addr isn't terminated, but that was pre-existing
> > in manish's previous posting.
> 
> I'll fix at least the latter in my next post, sorry.
> 
> For the 1st one - I am still looking at what went wrong.
> 
> Here from the web interfaces it all looks good (besides the wrong
> ending..), e.g. on lore or patchew:
> 
> https://lore.kernel.org/all/20221213213850.1481858-2-pet...@redhat.com/
> https://patchew.org/QEMU/20221213213850.1481858-1-pet...@redhat.com/20221213213850.1481858-2-pet...@redhat.com/
> 
> It also looks good with e.g. Gmail webclient.
> 
> Then I digged into the email headers and I found that comparing to Manish's
> original message, the patches I posted has one more line of "Content-type":
> 
>   Content-Type: text/plain; charset="utf-8"
>   Content-type: text/plain
>   https://patchew.org/QEMU/20221213213850.1481858-2-pet...@redhat.com/mbox
> 
> While Manish's patch only has one line:
> 
>   Content-Type: text/plain; charset="utf-8"
>   
> https://patchew.org/QEMU/20221123172735.25181-2-manish.mis...@nutanix.com/mbox

Don't trust what is shown by patchew, as that's been through many
hops.

The copy I received came directly to me via CC, so didn't hit mailman,
nor patchew, and that *only* has  "Content-type: text/plain".  So the
extra Content-type line with utf8 must have been added either by
mailman or patchew.

So it probably looks like a config problem in the tool you use to send
the patches originally.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




[PULL 03/19] migration: Export ram_transferred_add()

2022-12-15 Thread Juan Quintela
Signed-off-by: Juan Quintela 
Reviewed-by: Dr. David Alan Gilbert 
Reviewed-by: David Edmondson 
Reviewed-by: Leonardo Bras 
---
 migration/ram.h | 2 ++
 migration/ram.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/migration/ram.h b/migration/ram.h
index c7af65ac74..e844966f69 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -65,6 +65,8 @@ int ram_load_postcopy(QEMUFile *f, int channel);
 
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 
+void ram_transferred_add(uint64_t bytes);
+
 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr);
 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset);
 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr);
diff --git a/migration/ram.c b/migration/ram.c
index 1338e47665..2cbe707bfc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -422,7 +422,7 @@ uint64_t ram_bytes_remaining(void)
 
 MigrationStats ram_counters;
 
-static void ram_transferred_add(uint64_t bytes)
+void ram_transferred_add(uint64_t bytes)
 {
 if (runstate_is_running()) {
 ram_counters.precopy_bytes += bytes;
-- 
2.38.1




[PULL 06/19] migration: Add postcopy_preempt_active()

2022-12-15 Thread Juan Quintela
From: Peter Xu 

Add the helper to show that postcopy preempt enabled, meanwhile active.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/ram.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index cc72c24c18..00a2e30322 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -162,6 +162,11 @@ out:
 return ret;
 }
 
+static bool postcopy_preempt_active(void)
+{
+return migrate_postcopy_preempt() && migration_in_postcopy();
+}
+
 bool ramblock_is_ignored(RAMBlock *block)
 {
 return !qemu_ram_is_migratable(block) ||
@@ -2433,7 +2438,7 @@ static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
 /* We need to make sure rs->f always points to the default channel elsewhere */
 static void postcopy_preempt_reset_channel(RAMState *rs)
 {
-if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+if (postcopy_preempt_active()) {
 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
 rs->f = migrate_get_current()->to_dst_file;
 trace_postcopy_preempt_reset_channel();
@@ -2471,7 +2476,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
 return 0;
 }
 
-if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+if (postcopy_preempt_active()) {
 postcopy_preempt_choose_channel(rs, pss);
 }
 
-- 
2.38.1




RE: [PATCH] target/i386/hax: Add XCR0 support

2022-12-15 Thread Wang, Wenchao
Hi, Thomas,

Thanks for your reply. I have attempted to follow your suggestions, but it
always fails when creating a GPG-signed tag before submitting the pull request.
I have used GPG 2.2.4 to generate an RSA4096 GPG secret key and pasted the
public key on GitHub successfully.

$ git tag -s pull-request-hax -m 'target/i386/hax: Add XCR0 support'
error: gpg failed to sign the data
error: unable to sign the tag

Meanwhile, could @Paolo Bonzini or @Stefan Hajnoczi help to pick the patch up,
as it is only a one-line change for HAX and we have verified it for all guest
launches? Thanks a lot.


Best Regards,
Wenchao

-Original Message-
From: Thomas Huth  
Sent: Wednesday, December 14, 2022 17:39
To: Wang, Wenchao 
Cc: qemu-devel@nongnu.org; Paolo Bonzini ; Peter Maydell 
; Stefan Hajnoczi ; Richard 
Henderson 
Subject: Re: [PATCH] target/i386/hax: Add XCR0 support

On 14/12/2022 10.15, Wang, Wenchao wrote:
> Hi, Thomas,
> 
> As HAXM v7.8.0 is released and it added XCR0 support, it needs this 
> patch to add corresponding support into HAX user space of QEMU. I have 
> pushed this merge request before and Philippe has reviewed it and he 
> thought the change is correct. If no one else raises any other 
> opinion, could you help to merge this patch for HAX?

  Hi,

sorry, I don't have a stake in the target/i386 code ... but you're listed as 
maintainer for the hax/ folder, so if no other x86 maintainer picks this up, I 
think you could send a pull request for this patch on your own. See:

  https://www.qemu.org/docs/master/devel/submitting-a-pull-request.html

  HTH,
   Thomas


> 
> -
>
> From b1789f2523d06798b8883664bfa9a9df797bfccf Mon Sep 17 00:00:00 2001
> From: Wenchao Wang
> Date: Fri, 25 Nov 2022 18:37:34 +0800
> Subject: [PATCH] target/i386/hax: Add XCR0 support
>
> Introduce extended control register XCR0 to support XSAVE feature set.
>
> Note: This change requires at least HAXM v7.8.0 to support.
>
> Reviewed-by: Hang Yuan
> Signed-off-by: Wenchao Wang
> ---
> target/i386/hax/hax-interface.h | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/target/i386/hax/hax-interface.h b/target/i386/hax/hax-interface.h
> index 537ae084e9..1d13bb2380 100644
> --- a/target/i386/hax/hax-interface.h
> +++ b/target/i386/hax/hax-interface.h
> @@ -201,6 +201,8 @@ struct vcpu_state_t {
>   uint64_t _cr3;
>   uint64_t _cr4;
> +    uint64_t _xcr0;
> +
>   uint64_t _dr0;
>   uint64_t _dr1;
>   uint64_t _dr2;
> --
> 2.17.1
>



[PULL 17/19] migration: Send requested page directly in rp-return thread

2022-12-15 Thread Juan Quintela
From: Peter Xu 

With all the facilities ready, send the requested page directly in the
rp-return thread rather than queuing it in the request queue, if and only
if postcopy preempt is enabled.  It can achieve so because it uses separate
channel for sending urgent pages.  The only shared data is bitmap and it's
protected by the bitmap_mutex.

Note that since we're moving the ownership of the urgent channel from the
migration thread to the rp thread, the rp thread also becomes responsible
for managing the qemufile, e.g. properly closing it when pausing migration
happens.  For this, let migration_release_from_dst_file() cover shutdown
of the urgent channel too, renaming it to migration_release_dst_files() to
better show what it does.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
---
 migration/migration.c |  35 +++--
 migration/ram.c   | 112 ++
 2 files changed, 131 insertions(+), 16 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index de83c50f51..c1d4d76d0c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2848,8 +2848,11 @@ static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
 return 0;
 }
 
-/* Release ms->rp_state.from_dst_file in a safe way */
-static void migration_release_from_dst_file(MigrationState *ms)
+/*
+ * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
+ * existed) in a safe way.
+ */
+static void migration_release_dst_files(MigrationState *ms)
 {
 QEMUFile *file;
 
@@ -2862,6 +2865,18 @@ static void migration_release_from_dst_file(MigrationState *ms)
 ms->rp_state.from_dst_file = NULL;
 }
 
+/*
+ * Do the same to postcopy fast path socket too if there is.  No
+ * locking needed because this qemufile should only be managed by
+ * return path thread.
+ */
+if (ms->postcopy_qemufile_src) {
+migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
+qemu_file_shutdown(ms->postcopy_qemufile_src);
+qemu_fclose(ms->postcopy_qemufile_src);
+ms->postcopy_qemufile_src = NULL;
+}
+
 qemu_fclose(file);
 }
 
@@ -3006,7 +3021,7 @@ out:
  * Maybe there is something we can do: it looks like a
  * network down issue, and we pause for a recovery.
  */
-migration_release_from_dst_file(ms);
+migration_release_dst_files(ms);
 rp = NULL;
 if (postcopy_pause_return_path_thread(ms)) {
 /*
@@ -3024,7 +3039,7 @@ out:
 }
 
 trace_source_return_path_thread_end();
-migration_release_from_dst_file(ms);
+migration_release_dst_files(ms);
 rcu_unregister_thread();
 return NULL;
 }
@@ -3547,18 +3562,6 @@ static MigThrError postcopy_pause(MigrationState *s)
 qemu_file_shutdown(file);
 qemu_fclose(file);
 
-/*
- * Do the same to postcopy fast path socket too if there is.  No
- * locking needed because no racer as long as we do this before setting
- * status to paused.
- */
-if (s->postcopy_qemufile_src) {
-migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
-qemu_file_shutdown(s->postcopy_qemufile_src);
-qemu_fclose(s->postcopy_qemufile_src);
-s->postcopy_qemufile_src = NULL;
-}
-
 migrate_set_state(&s->state, s->state,
   MIGRATION_STATUS_POSTCOPY_PAUSED);
 
diff --git a/migration/ram.c b/migration/ram.c
index 1233ff53ac..16ade7cb70 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -546,6 +546,8 @@ static QemuThread *decompress_threads;
 static QemuMutex decomp_done_lock;
 static QemuCond decomp_done_cond;
 
+static int ram_save_host_page_urgent(PageSearchStatus *pss);
+
 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
  ram_addr_t offset, uint8_t *source_buf);
 
@@ -560,6 +562,16 @@ static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 pss->complete_round = false;
 }
 
+/*
+ * Check whether two PSSs are actively sending the same page.  Return true
+ * if it is, false otherwise.
+ */
+static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
+{
+return pss1->host_page_sending && pss2->host_page_sending &&
+(pss1->host_page_start == pss2->host_page_start);
+}
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2260,6 +2272,57 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
 return -1;
 }
 
+/*
+ * When with postcopy preempt, we send back the page directly in the
+ * rp-return thread.
+ */
+if (postcopy_preempt_active()) {
+ram_addr_t page_start = start >> TARGET_PAGE_BITS;
+size_t page_size = qemu_ram_page

[PATCH v2] hostmem: Honor multiple preferred nodes if possible

2022-12-15 Thread Michal Privoznik
If a memory-backend is configured with mode
HOST_MEM_POLICY_PREFERRED then
host_memory_backend_memory_complete() calls mbind() as:

  mbind(..., MPOL_PREFERRED, nodemask, ...);

Here, 'nodemask' is a bitmap of host NUMA nodes and corresponds
to the .host-nodes attribute. Therefore, there can be multiple
nodes specified. However, the documentation to MPOL_PREFERRED
says:

  MPOL_PREFERRED
This mode sets the preferred node for allocation. ...
If nodemask specifies more than one node ID, the first node
in the mask will be selected as the preferred node.

Therefore, only the first node is honored and the rest are
silently ignored. Well, with recent changes to the kernel and
numactl we can do better.

The Linux kernel added in v5.15 via commit cfcaa66f8032
("mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY")
support for MPOL_PREFERRED_MANY, which accepts multiple preferred
NUMA nodes instead.

Then, numa_has_preferred_many() API was introduced to numactl
(v2.0.15~26) allowing applications to query kernel support.

Wiring this all together, we can pass MPOL_PREFERRED_MANY to the
mbind() call instead and stop silently ignoring multiple nodes.
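
As a standalone illustration (not part of the patch), the runtime
selection described above amounts to something like the sketch below. It
assumes headers that define MPOL_PREFERRED_MANY (numactl >= 2.0.15) and a
kernel >= 5.15 for the mode to actually work:

```
#include <numa.h>
#include <numaif.h>

/* Sketch: pick the strongest "preferred" policy the kernel supports. */
static int preferred_mbind_mode(void)
{
#ifdef MPOL_PREFERRED_MANY
    if (numa_has_preferred_many() > 0) {
        return MPOL_PREFERRED_MANY; /* kernel honors every node in the mask */
    }
#endif
    return MPOL_PREFERRED; /* kernel silently uses only the first node */
}
```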

Signed-off-by: Michal Privoznik 
---

v2 of:

https://lists.gnu.org/archive/html/qemu-devel/2022-12/msg01354.html

diff to v1 (thanks to David for his rewiew):
- Don't cache numa_has_preferred_many() retval
- Reword comments and commit message
- Switch compile time detection from numa_set_preferred_many() to
  numa_has_preferred_many()

 backends/hostmem.c | 19 +--
 meson.build|  5 +
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 8640294c10..163ea9af04 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -23,7 +23,12 @@
 
 #ifdef CONFIG_NUMA
 #include 
+#include 
 QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
+/*
+ * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or
+ * MPOL_PREFERRED_MANY, see comments further below.
+ */
 QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
 QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
 QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
@@ -346,6 +351,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
  * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
  * this doesn't catch hugepage case. */
 unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
+int mode = backend->policy;
 
 /* check for invalid host-nodes and policies and give more verbose
  * error messages than mbind(). */
@@ -369,9 +375,18 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
 assert(maxnode <= MAX_NODES);
 
+#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
+if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
+/*
+ * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
+ * silently picks the first node.
+ */
+mode = MPOL_PREFERRED_MANY;
+}
+#endif
+
 if (maxnode &&
-mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 1,
-  flags)) {
+mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
 if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
 error_setg_errno(errp, errno,
  "cannot bind memory to host NUMA nodes");
diff --git a/meson.build b/meson.build
index 5c6b5a1c75..fb6979349d 100644
--- a/meson.build
+++ b/meson.build
@@ -1858,6 +1858,11 @@ config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
 config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
 config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
 config_host_data.set('CONFIG_NUMA', numa.found())
+if numa.found()
+  config_host_data.set('HAVE_NUMA_HAS_PREFERRED_MANY',
+   cc.has_function('numa_has_preferred_many',
+   dependencies: numa))
+endif
 config_host_data.set('CONFIG_OPENGL', opengl.found())
 config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
 config_host_data.set('CONFIG_RBD', rbd.found())
-- 
2.37.4




[PATCH] tests/tcg/multiarch: remove unused variable in linux-test

2022-12-15 Thread Mukilan Thiyagarajan
LLVM 15.0.0 has improved diagnostics for the
'-Wunused-but-set-variable' warning and now warns when
a variable is modified using pre/post increment/decrement
operators but is otherwise never read.

linux-test.c has such an unused variable 'wcount' and since
TCG tests are compiled with -Wall -Werror, this is causing
'make check-tcg' to fail when using LLVM 15.0.0 and above:

```
qemu/tests/tcg/multiarch/linux/linux-test.c:335:9:
error: variable 'wcount' set but not used [-Werror,-Wunused-but-set-variable]
int wcount, rcount;
^
1 error generated.
Makefile:119: recipe for target 'linux-test' failed
```

This patch simply removes the 'wcount' variable as it
doesn't appear to impact the semantics of the test.
The WCOUNT_MAX constant is also renamed to RCOUNT_MAX to
better reflect the usage.
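
For reference, a minimal (hypothetical) reproducer of the stricter
diagnostic:

```
/* With clang >= 15 and -Wall, 'w' is flagged as unused-but-set: it is
 * only ever written via the increment operator and never read. */
int count_reads(int n)
{
    int w = 0; /* -Wunused-but-set-variable */
    for (int i = 0; i < n; i++) {
        w++;
    }
    return n;
}
```
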
---
 tests/tcg/multiarch/linux/linux-test.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/tcg/multiarch/linux/linux-test.c b/tests/tcg/multiarch/linux/linux-test.c
index 5a2a4f2258..35540dc357 100644
--- a/tests/tcg/multiarch/linux/linux-test.c
+++ b/tests/tcg/multiarch/linux/linux-test.c
@@ -325,19 +325,18 @@ static void test_socket(void)
 chk_error(close(server_fd));
 }
 
-#define WCOUNT_MAX 512
+#define RCOUNT_MAX 512
 
 static void test_pipe(void)
 {
 fd_set rfds, wfds;
 int fds[2], fd_max, ret;
 uint8_t ch;
-int wcount, rcount;
+int rcount;
 
 chk_error(pipe(fds));
 chk_error(fcntl(fds[0], F_SETFL, O_NONBLOCK));
 chk_error(fcntl(fds[1], F_SETFL, O_NONBLOCK));
-wcount = 0;
 rcount = 0;
 for(;;) {
 FD_ZERO(&rfds);
@@ -354,13 +353,12 @@ static void test_pipe(void)
 if (FD_ISSET(fds[0], &rfds)) {
 chk_error(read(fds[0], &ch, 1));
 rcount++;
-if (rcount >= WCOUNT_MAX)
+if (rcount >= RCOUNT_MAX)
 break;
 }
 if (FD_ISSET(fds[1], &wfds)) {
 ch = 'a';
 chk_error(write(fds[1], &ch, 1));
-wcount++;
 }
 }
 }
-- 
2.17.1




Re: [PATCH-for-8.0] tests/vm: Update get_default_jobs() to work on non-x86_64 non-KVM hosts

2022-12-15 Thread Thomas Huth

On 09/12/2022 17.47, Philippe Mathieu-Daudé wrote:

On non-x86_64 host, if KVM is not available we get:

   Traceback (most recent call last):
 File "tests/vm/basevm.py", line 634, in main
   vm = vmcls(args, config=config)
 File "tests/vm/basevm.py", line 104, in __init__
   mem = max(4, args.jobs)
   TypeError: '>' not supported between instances of 'NoneType' and 'int'

Fix by always returning a -- not ideal but safe -- '1' value.

Fixes: b09539444a ("tests/vm: allow us to take advantage of MTTCG")
Signed-off-by: Philippe Mathieu-Daudé 
---
  tests/vm/basevm.py | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/vm/basevm.py b/tests/vm/basevm.py
index 2276364c42..23229e23d1 100644
--- a/tests/vm/basevm.py
+++ b/tests/vm/basevm.py
@@ -569,8 +569,7 @@ def get_default_jobs():
  # more cores. but only up to a reasonable limit. User
  # can always override these limits with --jobs.
  return min(multiprocessing.cpu_count() // 2, 8)
-else:
-return 1
+return 1
  
  parser = argparse.ArgumentParser(

  formatter_class=argparse.ArgumentDefaultsHelpFormatter,


Reviewed-by: Thomas Huth 




Re: [PULL v2 00/30] QAPI patches patches for 2022-12-14

2022-12-15 Thread Peter Maydell
On Wed, 14 Dec 2022 at 19:20, Markus Armbruster  wrote:
>
> The following changes since commit ea3a008d2d9ced9c4f93871c823baee237047f93:
>
>   Update VERSION for v7.2.0-rc4 (2022-12-06 19:53:34 -0500)
>
> are available in the Git repository at:
>
>   https://repo.or.cz/qemu/armbru.git tags/pull-qapi-2022-12-14-v2
>
> for you to fetch changes up to 5efb40d6571457c3cc35b7a91088cc2fceee5763:
>
>   qapi: Drop temporary logic to support conversion step by step (2022-12-14 
> 20:05:07 +0100)
>
> 
> QAPI patches patches for 2022-12-14
>
> 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/7.1
for any user-visible changes.

-- PMM



Re: [PATCH] target/i386/hax: Add XCR0 support

2022-12-15 Thread Peter Maydell
On Thu, 15 Dec 2022 at 09:45, Wang, Wenchao  wrote:
>
> Hi, Thomas,
>
> Thanks for your reply. I have attempted to follow you suggestions but it 
> always failed on tagging a GPG-signed tag before submitting the pull request. 
> I have used GPG 2.2.4 to generate a RSA4096 GPG secret key and pasted the 
> public key on GitHub successfully.
>
> $ git tag -s pull-request-hax -m 'target/i386/hax: Add XCR0 support'
> error: gpg failed to sign the data
> error: unable to sign the tag
>
> Meanwhile, could @Paolo Bonzini or @Stefan Hajnoczi help to pick the patch up 
> as there is only one-line change for HAX and we have verified it for all 
> guest launching? Thanks a lot.

Yes, please. For a single trivial patch I strongly prefer
that some existing (in this case x86) maintainer takes it in
their pullreq, rather than my having to deal with a
pullreq submission from a new-to-the-process person.
(It's extra work to check submissions from new people,
which is fine if they're going to be doing them a lot
in future, but for a one-off it's a waste of their time
and mine.)

thanks
-- PMM



migration qtest failure: "query-migrate shows failed migration: Unable to write to socket: Broken pipe"

2022-12-15 Thread Peter Maydell
Hi; I see this migration qtest failure on my x86 macos box:


▶  32/591 
ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
assertion failed: (!g_str_equal(status, "failed")) ERROR
 32/591 qemu:qtest+qtest-aarch64 / qtest-aarch64/migration-test
   ERROR  152.27s   killed by signal 6 SIGABRT
― ✀  ―
stderr:
query-migrate shows failed migration: Unable to write to socket: Broken pipe
**
ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
assertion failed: (!g_str_equal(status, "failed"))
Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
../../tests/qtest/libqtest.c, line 207.

(test program exited with status code -6)
――

and similarly:

▶  34/591 
ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
assertion failed: (!g_str_equal(status, "failed")) ERROR
 34/591 qemu:qtest+qtest-i386 / qtest-i386/migration-test
   ERROR  169.44s   killed by signal 6 SIGABRT
― ✀  ―
stderr:
query-migrate shows failed migration: Unable to write to socket: Broken pipe
**
ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
assertion failed: (!g_str_equal(status, "failed"))
Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
../../tests/qtest/libqtest.c, line 207.

(test program exited with status code -6)
――

It seems to be fairly consistent. Any ideas what it might be?
Maybe the QEMU process has already exited before the test binary
gets round to querying the status ?

thanks
-- PMM


[PATCH v9 00/12] ASID support in vhost-vdpa net

2022-12-15 Thread Eugenio Pérez
Control VQ is the way net devices use to send changes to the device state, like
the number of active queues or its mac address.

QEMU needs to intercept this queue so it can track these changes and is able to
migrate the device. It can do it from 1576dbb5bbc4 ("vdpa: Add x-svq to
NetdevVhostVDPAOptions"). However, to enable x-svq implies to shadow all VirtIO
device's virtqueues, which will damage performance.

This series adds address space isolation, so the device and the guest
communicate directly with them (passthrough) and CVQ communication is split in
two: The guest communicates with qemu and qemu forwards the commands to the
device.
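
In ioctl terms, the isolation boils down to asking which group the control
virtqueue belongs to and moving that group into its own address space. A
sketch based on patch 12 of this series; cvq_index and device_fd are
placeholders:

```
struct vhost_vring_state state = { .index = cvq_index };
if (ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state) < 0) {
    /* no per-group ASID support: fall back as described above */
}
struct vhost_vring_state asid = {
    .index = state.num,               /* the group CVQ belongs to */
    .num = VHOST_VDPA_NET_CVQ_ASID,   /* its own address space id */
};
ioctl(device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
```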

Comments are welcome. Thanks!

v9:
- Reuse iova_range fetched from the device at initialization, instead of
  fetch it again at vhost_vdpa_net_cvq_start.
- Add comment about how migration is blocked in case ASID does not meet
  our expectations.
- Delete warning about CVQ group not being independent.

v8:
- Do not allocate iova_tree on net_init_vhost_vdpa if only CVQ is
  shadowed. Move the iova_tree allocation to
  vhost_vdpa_net_cvq_start and vhost_vdpa_net_cvq_stop in this case.

v7:
- Never ask for number of address spaces, just react if isolation is not
  possible.
- Return ASID ioctl errors instead of masking them as if the device has
  no asid.
- Rename listener_shadow_vq to shadow_data
- Move comment on zero initailization of vhost_vdpa_dma_map above the
  functions.
- Add VHOST_VDPA_GUEST_PA_ASID macro.

v6:
- Do not allocate SVQ resources like file descriptors if SVQ cannot be used.
- Disable shadow CVQ if the device does not support it because of net
  features.

v5:
- Move vring state in vhost_vdpa_get_vring_group instead of using a
  parameter.
- Rename VHOST_VDPA_NET_CVQ_PASSTHROUGH to VHOST_VDPA_NET_DATA_ASID

v4:
- Rebased on last CVQ start series, that allocated CVQ cmd bufs at load
- Squash vhost_vdpa_cvq_group_is_independent.
- Do not check for cvq index on vhost_vdpa_net_prepare, we only have one
  that callback registered in that NetClientInfo.
- Add comment specifying behavior if device does not support _F_ASID
- Update headers to a later Linux commit to not to remove SETUP_RNG_SEED

v3:
- Do not return an error but just print a warning if vdpa device initialization
  returns failure while getting AS num of VQ groups
- Delete extra newline

v2:
- Much as commented on series [1], handle vhost_net backend through
  NetClientInfo callbacks instead of directly.
- Fix not freeing SVQ properly when device does not support CVQ
- Add BIT_ULL missed checking device's backend feature for _F_ASID.

Eugenio Pérez (12):
  vdpa: use v->shadow_vqs_enabled in vhost_vdpa_svqs_start & stop
  vhost: set SVQ device call handler at SVQ start
  vhost: allocate SVQ device file descriptors at device start
  vhost: move iova_tree set to vhost_svq_start
  vdpa: add vhost_vdpa_net_valid_svq_features
  vdpa: request iova_range only once
  vdpa: move SVQ vring features check to net/
  vdpa: allocate SVQ array unconditionally
  vdpa: add asid parameter to vhost_vdpa_dma_map/unmap
  vdpa: store x-svq parameter in VhostVDPAState
  vdpa: add shadow_data to vhost_vdpa
  vdpa: always start CVQ in SVQ mode if possible

 hw/virtio/vhost-shadow-virtqueue.h |   5 +-
 include/hw/virtio/vhost-vdpa.h |  16 ++-
 hw/virtio/vhost-shadow-virtqueue.c |  44 ++--
 hw/virtio/vhost-vdpa.c | 140 +++
 net/vhost-vdpa.c   | 174 -
 hw/virtio/trace-events |   4 +-
 6 files changed, 237 insertions(+), 146 deletions(-)

-- 
2.31.1





[PATCH v9 04/12] vhost: move iova_tree set to vhost_svq_start

2022-12-15 Thread Eugenio Pérez
Since we don't know at qemu initialization whether we will use SVQ, let's
allocate the iova_tree only if needed. To do so, accept it at SVQ start,
not at initialization.

This avoids creating it if the device does not support SVQ.

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 hw/virtio/vhost-shadow-virtqueue.h | 5 ++---
 hw/virtio/vhost-shadow-virtqueue.c | 9 -
 hw/virtio/vhost-vdpa.c | 5 ++---
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index d04c34a589..926a4897b1 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -126,11 +126,10 @@ size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
 size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 
 void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
- VirtQueue *vq);
+ VirtQueue *vq, VhostIOVATree *iova_tree);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
-const VhostShadowVirtqueueOps *ops,
+VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
 void *ops_opaque);
 
 void vhost_svq_free(gpointer vq);
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 3b05bab44d..4307296358 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -642,9 +642,10 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
  * @svq: Shadow Virtqueue
  * @vdev: VirtIO device
  * @vq: Virtqueue to shadow
+ * @iova_tree: Tree to perform descriptors translations
  */
 void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
- VirtQueue *vq)
+ VirtQueue *vq, VhostIOVATree *iova_tree)
 {
 size_t desc_size, driver_size, device_size;
 
@@ -655,6 +656,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
 svq->last_used_idx = 0;
 svq->vdev = vdev;
 svq->vq = vq;
+svq->iova_tree = iova_tree;
 
 svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
 driver_size = vhost_svq_driver_area_size(svq);
@@ -712,18 +714,15 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * Creates vhost shadow virtqueue, and instructs the vhost device to use the
  * shadow methods and file descriptors.
  *
- * @iova_tree: Tree to perform descriptors translations
  * @ops: SVQ owner callbacks
  * @ops_opaque: ops opaque pointer
  */
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
-const VhostShadowVirtqueueOps *ops,
+VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
 void *ops_opaque)
 {
 VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
 
 event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
-svq->iova_tree = iova_tree;
 svq->ops = ops;
 svq->ops_opaque = ops_opaque;
 return svq;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 3df2775760..691bcc811a 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -430,8 +430,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 for (unsigned n = 0; n < hdev->nvqs; ++n) {
 VhostShadowVirtqueue *svq;
 
-svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
-v->shadow_vq_ops_opaque);
+svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
 g_ptr_array_add(shadow_vqs, svq);
 }
 
@@ -1063,7 +1062,7 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
 goto err;
 }
 
-vhost_svq_start(svq, dev->vdev, vq);
+vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
 ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
 if (unlikely(!ok)) {
 goto err_map;
-- 
2.31.1




[PATCH v9 07/12] vdpa: move SVQ vring features check to net/

2022-12-15 Thread Eugenio Pérez
The next patches will start control SVQ if possible. However, we no
longer know at qemu boot whether that will be possible.

Since the moved checks are already evaluated at net/ to know whether it
is ok to shadow CVQ, move them there.

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 hw/virtio/vhost-vdpa.c | 32 ++--
 net/vhost-vdpa.c   |  3 ++-
 2 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 9b7f4ef083..5039d9bb2f 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -389,29 +389,9 @@ static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
 return ret;
 }
 
-static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
-   Error **errp)
+static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
 {
 g_autoptr(GPtrArray) shadow_vqs = NULL;
-uint64_t dev_features, svq_features;
-int r;
-bool ok;
-
-if (!v->shadow_vqs_enabled) {
-return 0;
-}
-
-r = vhost_vdpa_get_dev_features(hdev, &dev_features);
-if (r != 0) {
-error_setg_errno(errp, -r, "Can't get vdpa device features");
-return r;
-}
-
-svq_features = dev_features;
-ok = vhost_svq_valid_features(svq_features, errp);
-if (unlikely(!ok)) {
-return -1;
-}
 
 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
 for (unsigned n = 0; n < hdev->nvqs; ++n) {
@@ -422,7 +402,6 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 }
 
 v->shadow_vqs = g_steal_pointer(&shadow_vqs);
-return 0;
 }
 
 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
@@ -447,10 +426,7 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
 dev->opaque =  opaque ;
 v->listener = vhost_vdpa_memory_listener;
 v->msg_type = VHOST_IOTLB_MSG_V2;
-ret = vhost_vdpa_init_svq(dev, v, errp);
-if (ret) {
-goto err;
-}
+vhost_vdpa_init_svq(dev, v);
 
 if (!vhost_vdpa_first_dev(dev)) {
 return 0;
@@ -460,10 +436,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
VIRTIO_CONFIG_S_DRIVER);
 
 return 0;
-
-err:
-ram_block_discard_disable(false);
-return ret;
 }
 
 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index b6462f0192..e829ef1f43 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -118,9 +118,10 @@ static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
 if (invalid_dev_features) {
 error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
invalid_dev_features);
+return false;
 }
 
-return !invalid_dev_features;
+return vhost_svq_valid_features(features, errp);
 }
 
 static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
-- 
2.31.1




[PATCH v9 06/12] vdpa: request iova_range only once

2022-12-15 Thread Eugenio Pérez
Currently the iova range is requested once per queue pair in the case of
net. Reduce the number of ioctls by asking for it once at initialization
and reusing that value for each vhost_vdpa.
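
Schematically, the single fetch at initialization looks like this (a
sketch; the fallback mirrors the code removed from hw/virtio below):

```
struct vhost_vdpa_iova_range iova_range;

/* Ask the device once; fall back to the whole address space on error. */
if (ioctl(vdpa_device_fd, VHOST_VDPA_GET_IOVA_RANGE, &iova_range) < 0) {
    iova_range.first = 0;
    iova_range.last = UINT64_MAX;
}
/* ...then reuse iova_range for every net_vhost_vdpa_init() call. */
```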

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 15 ---
 net/vhost-vdpa.c   | 27 ++-
 2 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 691bcc811a..9b7f4ef083 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -365,19 +365,6 @@ static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
 return 0;
 }
 
-static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
-{
-int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
-  &v->iova_range);
-if (ret != 0) {
-v->iova_range.first = 0;
-v->iova_range.last = UINT64_MAX;
-}
-
-trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
-v->iova_range.last);
-}
-
 /*
  * The use of this function is for requests that only need to be
  * applied once. Typically such request occurs at the beginning
@@ -465,8 +452,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
 goto err;
 }
 
-vhost_vdpa_get_iova_range(v);
-
 if (!vhost_vdpa_first_dev(dev)) {
 return 0;
 }
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 2c0ff6d7b0..b6462f0192 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -541,14 +541,15 @@ static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
 };
 
 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
-   const char *device,
-   const char *name,
-   int vdpa_device_fd,
-   int queue_pair_index,
-   int nvqs,
-   bool is_datapath,
-   bool svq,
-   VhostIOVATree *iova_tree)
+   const char *device,
+   const char *name,
+   int vdpa_device_fd,
+   int queue_pair_index,
+   int nvqs,
+   bool is_datapath,
+   bool svq,
+   struct vhost_vdpa_iova_range iova_range,
+   VhostIOVATree *iova_tree)
 {
 NetClientState *nc = NULL;
 VhostVDPAState *s;
@@ -567,6 +568,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 s->vhost_vdpa.device_fd = vdpa_device_fd;
 s->vhost_vdpa.index = queue_pair_index;
 s->vhost_vdpa.shadow_vqs_enabled = svq;
+s->vhost_vdpa.iova_range = iova_range;
 s->vhost_vdpa.iova_tree = iova_tree;
 if (!is_datapath) {
 s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
@@ -646,6 +648,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 int vdpa_device_fd;
 g_autofree NetClientState **ncs = NULL;
 g_autoptr(VhostIOVATree) iova_tree = NULL;
+struct vhost_vdpa_iova_range iova_range;
 NetClientState *nc;
 int queue_pairs, r, i = 0, has_cvq = 0;
 
@@ -689,14 +692,12 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 return queue_pairs;
 }
 
+vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
 if (opts->x_svq) {
-struct vhost_vdpa_iova_range iova_range;
-
 if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
 goto err_svq;
 }
 
-vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
 iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
 }
 
@@ -705,7 +706,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 for (i = 0; i < queue_pairs; i++) {
 ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
  vdpa_device_fd, i, 2, true, opts->x_svq,
- iova_tree);
+ iova_range, iova_tree);
 if (!ncs[i])
 goto err;
 }
@@ -713,7 +714,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 if (has_cvq) {
 nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
  vdpa_device_fd, i, 1, false,
- opts->x_svq, iova_tree);
+ opts->x_svq, iova_range, iova_tree);
 if (!nc)
 goto err;
 }
-- 
2.31.1




[PATCH v9 08/12] vdpa: allocate SVQ array unconditionally

2022-12-15 Thread Eugenio Pérez
SVQ may or may not run in a device depending on runtime conditions (for
example, whether the device can move CVQ to its own group or not).

Allocate the SVQ array unconditionally at startup, since it's hard to
move this allocation elsewhere.

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 hw/virtio/vhost-vdpa.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 5039d9bb2f..86e1fa8e9e 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -532,10 +532,6 @@ static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
 struct vhost_vdpa *v = dev->opaque;
 size_t idx;
 
-if (!v->shadow_vqs) {
-return;
-}
-
 for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
 }
-- 
2.31.1




[PATCH v9 01/12] vdpa: use v->shadow_vqs_enabled in vhost_vdpa_svqs_start & stop

2022-12-15 Thread Eugenio Pérez
This function used to rely on v->shadow_vqs != NULL to know whether it
must start SVQ or not.

This is not going to be valid anymore, as qemu is going to allocate the
svq array unconditionally (but it will only start the svqs conditionally).

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 hw/virtio/vhost-vdpa.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7468e44b87..7f0ff4df5b 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1029,7 +1029,7 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
 Error *err = NULL;
 unsigned i;
 
-if (!v->shadow_vqs) {
+if (!v->shadow_vqs_enabled) {
 return true;
 }
 
@@ -1082,7 +1082,7 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
 {
 struct vhost_vdpa *v = dev->opaque;
 
-if (!v->shadow_vqs) {
+if (!v->shadow_vqs_enabled) {
 return;
 }
 
-- 
2.31.1




[PATCH v9 12/12] vdpa: always start CVQ in SVQ mode if possible

2022-12-15 Thread Eugenio Pérez
Isolate the control virtqueue in its own group, allowing qemu to intercept
control commands while letting the dataplane run fully passthrough to the
guest.

Signed-off-by: Eugenio Pérez 
---
v9:
* Reuse iova_range fetched from the device at initialization, instead of
  fetch it again at vhost_vdpa_net_cvq_start.
* Add comment about how migration is blocked in case ASID does not meet
  our expectations.
* Delete warning about CVQ group not being independent.

v8:
* Do not allocate iova_tree on net_init_vhost_vdpa if only CVQ is
  shadowed. Move the iova_tree handling in this case to
  vhost_vdpa_net_cvq_start and vhost_vdpa_net_cvq_stop.

v7:
* Never ask for number of address spaces, just react if isolation is not
  possible.
* Return ASID ioctl errors instead of masking them as if the device has
  no asid.
* Simplify net_init_vhost_vdpa logic
* Add "if possible" suffix

v6:
* Disable control SVQ if the device does not support it because of
features.

v5:
* Fixing the not adding cvq buffers when x-svq=on is specified.
* Move vring state in vhost_vdpa_get_vring_group instead of using a
  parameter.
* Rename VHOST_VDPA_NET_CVQ_PASSTHROUGH to VHOST_VDPA_NET_DATA_ASID

v4:
* Squash vhost_vdpa_cvq_group_is_independent.
* Rebased on last CVQ start series, that allocated CVQ cmd bufs at load
* Do not check for cvq index on vhost_vdpa_net_prepare, we only have one
  that callback registered in that NetClientInfo.

v3:
* Make asid related queries print a warning instead of returning an
  error and stop the start of qemu.
---
 hw/virtio/vhost-vdpa.c |   3 +-
 net/vhost-vdpa.c   | 110 -
 2 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 48d8c60e76..8cd00f5a96 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -638,7 +638,8 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
 {
 uint64_t features;
 uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
-0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
+0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
+0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
 int r;
 
 if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 710c5efe96..d36664f33a 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -102,6 +102,8 @@ static const uint64_t vdpa_svq_device_features =
 BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
 BIT_ULL(VIRTIO_NET_F_STANDBY);
 
+#define VHOST_VDPA_NET_CVQ_ASID 1
+
 VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
 {
 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -243,6 +245,40 @@ static NetClientInfo net_vhost_vdpa_info = {
 .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
+static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
+{
+struct vhost_vring_state state = {
+.index = vq_index,
+};
+int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);
+
+if (unlikely(r < 0)) {
+error_report("Cannot get VQ %u group: %s", vq_index,
+ g_strerror(errno));
+return r;
+}
+
+return state.num;
+}
+
+static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
+   unsigned vq_group,
+   unsigned asid_num)
+{
+struct vhost_vring_state asid = {
+.index = vq_group,
+.num = asid_num,
+};
+int r;
+
+r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
+if (unlikely(r < 0)) {
+error_report("Can't set vq group %u asid %u, errno=%d (%s)",
+ asid.index, asid.num, errno, g_strerror(errno));
+}
+return r;
+}
+
 static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
 {
 VhostIOVATree *tree = v->iova_tree;
@@ -317,11 +353,75 @@ dma_map_err:
 static int vhost_vdpa_net_cvq_start(NetClientState *nc)
 {
 VhostVDPAState *s;
-int r;
+struct vhost_vdpa *v;
+uint64_t backend_features;
+int64_t cvq_group;
+int cvq_index, r;
 
 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
 
 s = DO_UPCAST(VhostVDPAState, nc, nc);
+v = &s->vhost_vdpa;
+
+v->shadow_data = s->always_svq;
+v->shadow_vqs_enabled = s->always_svq;
+s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;
+
+if (s->always_svq) {
+/* SVQ is already configured for all virtqueues */
+goto out;
+}
+
+/*
+ * If we early return in these cases SVQ will not be enabled. The migration
+ * will be blocked as long as vhost-vdpa backends will not offer _F_LOG.
+ *
+ * Calling VHOST_GET_BACKEND_FEATURES as they are not available in v->dev
+ * yet.
+ */
+r = ioctl(v->device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
+if (unlikely(r < 0)) {
+error_report("Cannot get vdpa backend_features: %s(%d)",
+g_str

[PATCH v9 03/12] vhost: allocate SVQ device file descriptors at device start

2022-12-15 Thread Eugenio Pérez
The next patches will start control SVQ if possible. However, we no
longer know at qemu boot whether that will be possible.

Delay creating the device file descriptors until device start, when we
know whether SVQ will be used. This avoids creating them if the device
does not support SVQ.

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 hw/virtio/vhost-shadow-virtqueue.c | 31 ++
 hw/virtio/vhost-vdpa.c | 35 --
 2 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 264ddc166d..3b05bab44d 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -715,43 +715,18 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * @iova_tree: Tree to perform descriptors translations
  * @ops: SVQ owner callbacks
  * @ops_opaque: ops opaque pointer
- *
- * Returns the new virtqueue or NULL.
- *
- * In case of error, reason is reported through error_report.
  */
 VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
 const VhostShadowVirtqueueOps *ops,
 void *ops_opaque)
 {
-g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
-int r;
-
-r = event_notifier_init(&svq->hdev_kick, 0);
-if (r != 0) {
-error_report("Couldn't create kick event notifier: %s (%d)",
- g_strerror(errno), errno);
-goto err_init_hdev_kick;
-}
-
-r = event_notifier_init(&svq->hdev_call, 0);
-if (r != 0) {
-error_report("Couldn't create call event notifier: %s (%d)",
- g_strerror(errno), errno);
-goto err_init_hdev_call;
-}
+VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
 
 event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
 svq->iova_tree = iova_tree;
 svq->ops = ops;
 svq->ops_opaque = ops_opaque;
-return g_steal_pointer(&svq);
-
-err_init_hdev_call:
-event_notifier_cleanup(&svq->hdev_kick);
-
-err_init_hdev_kick:
-return NULL;
+return svq;
 }
 
 /**
@@ -763,7 +738,5 @@ void vhost_svq_free(gpointer pvq)
 {
 VhostShadowVirtqueue *vq = pvq;
 vhost_svq_stop(vq);
-event_notifier_cleanup(&vq->hdev_kick);
-event_notifier_cleanup(&vq->hdev_call);
 g_free(vq);
 }
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7f0ff4df5b..3df2775760 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -428,15 +428,11 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 
 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
 for (unsigned n = 0; n < hdev->nvqs; ++n) {
-g_autoptr(VhostShadowVirtqueue) svq;
+VhostShadowVirtqueue *svq;
 
 svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
 v->shadow_vq_ops_opaque);
-if (unlikely(!svq)) {
-error_setg(errp, "Cannot create svq %u", n);
-return -1;
-}
-g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
+g_ptr_array_add(shadow_vqs, svq);
 }
 
 v->shadow_vqs = g_steal_pointer(&shadow_vqs);
@@ -864,11 +860,23 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
 const EventNotifier *event_notifier = &svq->hdev_kick;
 int r;
 
+r = event_notifier_init(&svq->hdev_kick, 0);
+if (r != 0) {
+error_setg_errno(errp, -r, "Couldn't create kick event notifier");
+goto err_init_hdev_kick;
+}
+
+r = event_notifier_init(&svq->hdev_call, 0);
+if (r != 0) {
+error_setg_errno(errp, -r, "Couldn't create call event notifier");
+goto err_init_hdev_call;
+}
+
 file.fd = event_notifier_get_fd(event_notifier);
 r = vhost_vdpa_set_vring_dev_kick(dev, &file);
 if (unlikely(r != 0)) {
 error_setg_errno(errp, -r, "Can't set device kick fd");
-return r;
+goto err_init_set_dev_fd;
 }
 
 event_notifier = &svq->hdev_call;
@@ -876,8 +884,18 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
 r = vhost_vdpa_set_vring_dev_call(dev, &file);
 if (unlikely(r != 0)) {
 error_setg_errno(errp, -r, "Can't set device call fd");
+goto err_init_set_dev_fd;
 }
 
+return 0;
+
+err_init_set_dev_fd:
+event_notifier_set_handler(&svq->hdev_call, NULL);
+
+err_init_hdev_call:
+event_notifier_cleanup(&svq->hdev_kick);
+
+err_init_hdev_kick:
 return r;
 }
 
@@ -1089,6 +1107,9 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
 vhost_vdpa_svq_unmap_rings(dev, svq);
+
+event_notifier_cleanup(&svq->hdev_kick);
+event_notifier_cleanup(&svq->hdev_call);
 }
 }
 
-- 
2.31.1




[PATCH v9 05/12] vdpa: add vhost_vdpa_net_valid_svq_features

2022-12-15 Thread Eugenio Pérez
It will be reused at vdpa device start, so let's extract it into its
own function.
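
For illustration, the mask logic reduces to a few lines. The bit
positions below are invented for the example; only the
MAKE_64BIT_MASK() definition matches QEMU's.

    #include <inttypes.h>
    #include <stdio.h>

    #define MAKE_64BIT_MASK(shift, length) \
        (((~0ULL) >> (64 - (length))) << (shift))

    int main(void)
    {
        uint64_t supported = (1ULL << 0) | (1ULL << 5); /* SVQ handles */
        uint64_t transport = MAKE_64BIT_MASK(28, 10);   /* always OK   */
        uint64_t offered   = (1ULL << 0) | (1ULL << 9) | (1ULL << 30);

        /* Offered by the device, not shadowable, not a transport bit */
        uint64_t invalid = offered & ~supported & ~transport;

        if (invalid) {
            printf("svq cannot work with features 0x%" PRIx64 "\n",
                   invalid);               /* prints 0x200 (bit 9) */
        }
        return 0;
    }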

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 net/vhost-vdpa.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 260e474863..2c0ff6d7b0 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -107,6 +107,22 @@ VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
 return s->vhost_net;
 }
 
+static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
+{
+uint64_t invalid_dev_features =
+features & ~vdpa_svq_device_features &
+/* Transport features are all accepted at this point */
+~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
+ VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
+
+if (invalid_dev_features) {
+error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
+   invalid_dev_features);
+}
+
+return !invalid_dev_features;
+}
+
 static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
 {
 uint32_t device_id;
@@ -676,15 +692,7 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 if (opts->x_svq) {
 struct vhost_vdpa_iova_range iova_range;
 
-uint64_t invalid_dev_features =
-features & ~vdpa_svq_device_features &
-/* Transport are all accepted at this point */
-~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
- VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
-
-if (invalid_dev_features) {
-error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
-   invalid_dev_features);
+if (!vhost_vdpa_net_valid_svq_features(features, errp)) {
 goto err_svq;
 }
 
-- 
2.31.1




[PATCH v9 10/12] vdpa: store x-svq parameter in VhostVDPAState

2022-12-15 Thread Eugenio Pérez
CVQ can be shadowed two ways:
- Device has x-svq=on parameter (current way)
- The device can isolate CVQ in its own vq group

QEMU needs to check for the second condition dynamically, because the
CVQ index is not known before the driver acks the features. Since this
is dynamic, CVQ isolation can vary with conditions, making it possible
to go from "not isolated group" to "isolated".

Save the cmdline parameter in an extra field so we never disable CVQ
SVQ if the device was started with the x-svq cmdline option.
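
The resulting decision is, in sketch form (hypothetical helper name;
the real code must also query the device's vq groups at start time):

    #include <stdbool.h>
    #include <stdio.h>

    static bool need_shadow_cvq(bool always_svq, bool cvq_isolated)
    {
        /* x-svq=on forces shadowing for the whole VM lifetime;
         * otherwise it depends on CVQ having its own vq group. */
        return always_svq || cvq_isolated;
    }

    int main(void)
    {
        printf("%d\n", need_shadow_cvq(false, true));  /* 1 */
        printf("%d\n", need_shadow_cvq(true, false));  /* 1 */
        printf("%d\n", need_shadow_cvq(false, false)); /* 0 */
        return 0;
    }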

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 net/vhost-vdpa.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index a592ee07ec..bff72717d0 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -38,6 +38,8 @@ typedef struct VhostVDPAState {
 void *cvq_cmd_out_buffer;
 virtio_net_ctrl_ack *status;
 
+/* The device always has SVQ enabled */
+bool always_svq;
 bool started;
 } VhostVDPAState;
 
@@ -568,6 +570,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 
 s->vhost_vdpa.device_fd = vdpa_device_fd;
 s->vhost_vdpa.index = queue_pair_index;
+s->always_svq = svq;
 s->vhost_vdpa.shadow_vqs_enabled = svq;
 s->vhost_vdpa.iova_range = iova_range;
 s->vhost_vdpa.iova_tree = iova_tree;
-- 
2.31.1




[PATCH v9 11/12] vdpa: add shadow_data to vhost_vdpa

2022-12-15 Thread Eugenio Pérez
The memory listener that tells the device how to convert GPA to qemu's
VA is registered against CVQ's vhost_vdpa. Memory listener translations
are always in ASID 0; CVQ ones are in ASID 1 if supported.

Let's tell the listener whether it needs to register them in the IOVA
tree or not.
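
Reduced to its core, the flag selects which address the device will see
as the IOTLB key (hypothetical names; the real logic lives in
vhost_vdpa_listener_region_add()):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long long hwaddr;  /* stand-in for QEMU's hwaddr */

    /* shadow_data: key by SVQ-translated IOVA from the IOVA tree;
     * otherwise pass guest physical addresses through (ASID 0). */
    static hwaddr iotlb_key(bool shadow_data, hwaddr gpa, hwaddr svq_iova)
    {
        return shadow_data ? svq_iova : gpa;
    }

    int main(void)
    {
        printf("%llx\n", iotlb_key(true, 0x40000000ULL, 0x1000ULL));
        printf("%llx\n", iotlb_key(false, 0x40000000ULL, 0x1000ULL));
        return 0;
    }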

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
v7: Rename listener_shadow_vq to shadow_data
v5: Solve conflict about vhost_iova_tree_remove accepting mem_region by
value.
---
 include/hw/virtio/vhost-vdpa.h | 2 ++
 hw/virtio/vhost-vdpa.c         | 6 +++---
 net/vhost-vdpa.c               | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index e57dfa1fd1..45b969a311 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -40,6 +40,8 @@ typedef struct vhost_vdpa {
 struct vhost_vdpa_iova_range iova_range;
 uint64_t acked_features;
 bool shadow_vqs_enabled;
+/* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */
+bool shadow_data;
 /* IOVA mapping used by the Shadow Virtqueue */
 VhostIOVATree *iova_tree;
 GPtrArray *shadow_vqs;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 5e591a8fda..48d8c60e76 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -224,7 +224,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
  vaddr, section->readonly);
 
 llsize = int128_sub(llend, int128_make64(iova));
-if (v->shadow_vqs_enabled) {
+if (v->shadow_data) {
 int r;
 
 mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
@@ -251,7 +251,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
 return;
 
 fail_map:
-if (v->shadow_vqs_enabled) {
+if (v->shadow_data) {
 vhost_iova_tree_remove(v->iova_tree, mem_region);
 }
 
@@ -296,7 +296,7 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
 
 llsize = int128_sub(llend, int128_make64(iova));
 
-if (v->shadow_vqs_enabled) {
+if (v->shadow_data) {
 const DMAMap *result;
 const void *vaddr = memory_region_get_ram_ptr(section->mr) +
 section->offset_within_region +
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index bff72717d0..710c5efe96 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -573,6 +573,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 s->always_svq = svq;
 s->vhost_vdpa.shadow_vqs_enabled = svq;
 s->vhost_vdpa.iova_range = iova_range;
+s->vhost_vdpa.shadow_data = svq;
 s->vhost_vdpa.iova_tree = iova_tree;
 if (!is_datapath) {
 s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
-- 
2.31.1




[PATCH v9 09/12] vdpa: add asid parameter to vhost_vdpa_dma_map/unmap

2022-12-15 Thread Eugenio Pérez
So the caller can choose which ASID the mapping is destined for.

No need to update the batch functions as they will always be called from
memory listener updates at the moment. Memory listener updates will
always update ASID 0, as it's the passthrough ASID.

All vhost devices' ASIDs are 0 at this moment.
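
For reference, a sketch of filling the extended uAPI message. This
assumes Linux headers recent enough for struct vhost_msg_v2 to carry
the asid field (it occupies a previously reserved word, hence the "not
an ABI break" note in the patch below); the helper name is made up.

    #include <linux/vhost_types.h>
    #include <stdint.h>
    #include <string.h>

    static void fill_update(struct vhost_msg_v2 *msg, uint32_t asid,
                            uint64_t iova, uint64_t size, void *vaddr)
    {
        memset(msg, 0, sizeof(*msg));
        msg->type = VHOST_IOTLB_MSG_V2;
        msg->asid = asid;            /* 0 == guest-PA (passthrough) ASID */
        msg->iotlb.iova = iova;
        msg->iotlb.size = size;
        msg->iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
        msg->iotlb.perm = VHOST_ACCESS_RW;
        msg->iotlb.type = VHOST_IOTLB_UPDATE;
    }

    int main(void)
    {
        struct vhost_msg_v2 msg;
        char buf[4096];

        fill_update(&msg, 0, 0x1000, sizeof(buf), buf);
        return 0;
    }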

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
v7:
* Move comment on zero initialization of vhost_vdpa_dma_map above the
  functions.
* Add VHOST_VDPA_GUEST_PA_ASID macro.

v5:
* Solve conflict, now vhost_vdpa_svq_unmap_ring returns void
* Change comment on zero initialization.

v4: Add comment specifying behavior if device does not support _F_ASID

v3: Deleted unneeded space
---
 include/hw/virtio/vhost-vdpa.h | 14 ++---
 hw/virtio/vhost-vdpa.c         | 36 +++---
 net/vhost-vdpa.c               |  6 +++---
 hw/virtio/trace-events         |  4 ++--
 4 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index d85643..e57dfa1fd1 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -19,6 +19,12 @@
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
 
+/*
+ * ASID dedicated to mapping the guest's addresses.  If SVQ is disabled it
+ * maps GPA to qemu's IOVA.  If SVQ is enabled it also maps the SVQ vring here
+ */
+#define VHOST_VDPA_GUEST_PA_ASID 0
+
 typedef struct VhostVDPAHostNotifier {
 MemoryRegion mr;
 void *addr;
@@ -29,6 +35,7 @@ typedef struct vhost_vdpa {
 int index;
 uint32_t msg_type;
 bool iotlb_batch_begin_sent;
+uint32_t address_space_id;
 MemoryListener listener;
 struct vhost_vdpa_iova_range iova_range;
 uint64_t acked_features;
@@ -42,8 +49,9 @@ typedef struct vhost_vdpa {
 VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
 
-int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
-   void *vaddr, bool readonly);
-int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size);
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
+   hwaddr size, void *vaddr, bool readonly);
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
+ hwaddr size);
 
 #endif
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 86e1fa8e9e..5e591a8fda 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -72,22 +72,28 @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
 return false;
 }
 
-int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
-   void *vaddr, bool readonly)
+/*
+ * The caller must set asid = 0 if the device does not support asid.
+ * This is not an ABI break since it is set to 0 by the initializer anyway.
+ */
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
+   hwaddr size, void *vaddr, bool readonly)
 {
 struct vhost_msg_v2 msg = {};
 int fd = v->device_fd;
 int ret = 0;
 
 msg.type = v->msg_type;
+msg.asid = asid;
 msg.iotlb.iova = iova;
 msg.iotlb.size = size;
 msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
 msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
 msg.iotlb.type = VHOST_IOTLB_UPDATE;
 
-   trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
-msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
+trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.asid, msg.iotlb.iova,
+ msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
+ msg.iotlb.type);
 
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
 error_report("failed to write, fd=%d, errno=%d (%s)",
@@ -98,18 +104,24 @@ int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
 return ret;
 }
 
-int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
+/*
+ * The caller must set asid = 0 if the device does not support asid.
+ * This is not an ABI break since it is set to 0 by the initializer anyway.
+ */
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
+ hwaddr size)
 {
 struct vhost_msg_v2 msg = {};
 int fd = v->device_fd;
 int ret = 0;
 
 msg.type = v->msg_type;
+msg.asid = asid;
 msg.iotlb.iova = iova;
 msg.iotlb.size = size;
 msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
 
-trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
+trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.asid, msg.iotlb.iova,
msg.iotlb.size, msg.iotlb.type);
 
 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
@@ -229,8 +241,8 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
 }
 
 vhost_vdpa_iotlb_batch_begin_once(v);
-ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),

[PATCH v9 02/12] vhost: set SVQ device call handler at SVQ start

2022-12-15 Thread Eugenio Pérez
By the end of this series CVQ is shadowed as long as the features
support it.

Since we don't know at qemu startup whether this is supported, move the
event notifier handler registration to SVQ start, instead of qemu
startup. This avoids installing the handlers if the device does not
support SVQ.

Signed-off-by: Eugenio Pérez 
Acked-by: Jason Wang 
---
 hw/virtio/vhost-shadow-virtqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index 5bd14cad96..264ddc166d 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -648,6 +648,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
 {
 size_t desc_size, driver_size, device_size;
 
+event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
 svq->next_guest_avail_elem = NULL;
 svq->shadow_avail_idx = 0;
 svq->shadow_used_idx = 0;
@@ -704,6 +705,7 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
 g_free(svq->desc_state);
 qemu_vfree(svq->vring.desc);
 qemu_vfree(svq->vring.used);
+event_notifier_set_handler(&svq->hdev_call, NULL);
 }
 
 /**
@@ -740,7 +742,6 @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
 }
 
 event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
-event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
 svq->iova_tree = iova_tree;
 svq->ops = ops;
 svq->ops_opaque = ops_opaque;
@@ -763,7 +764,6 @@ void vhost_svq_free(gpointer pvq)
 VhostShadowVirtqueue *vq = pvq;
 vhost_svq_stop(vq);
 event_notifier_cleanup(&vq->hdev_kick);
-event_notifier_set_handler(&vq->hdev_call, NULL);
 event_notifier_cleanup(&vq->hdev_call);
 g_free(vq);
 }
-- 
2.31.1




Re: migration qtest failure: "query-migrate shows failed migration: Unable to write to socket: Broken pipe"

2022-12-15 Thread Dr. David Alan Gilbert
* Peter Maydell (peter.mayd...@linaro.org) wrote:
> Hi; I see this migration qtest failure on my x86 macos box:
> 
> 
> ▶  32/591 
> ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> assertion failed: (!g_str_equal(status, "failed")) ERROR
>  32/591 qemu:qtest+qtest-aarch64 / qtest-aarch64/migration-test
>ERROR  152.27s   killed by signal 6 SIGABRT
> ― ✀  ―
> stderr:
> query-migrate shows failed migration: Unable to write to socket: Broken pipe
> **
> ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> assertion failed: (!g_str_equal(status, "failed"))
> Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
> ../../tests/qtest/libqtest.c, line 207.
> 
> (test program exited with status code -6)
> ――
> 
> and similarly:
> 
> ▶  34/591 
> ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> assertion failed: (!g_str_equal(status, "failed")) ERROR
>  34/591 qemu:qtest+qtest-i386 / qtest-i386/migration-test
>ERROR  169.44s   killed by signal 6 SIGABRT
> ― ✀  ―
> stderr:
> query-migrate shows failed migration: Unable to write to socket: Broken pipe
> **
> ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> assertion failed: (!g_str_equal(status, "failed"))
> Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
> ../../tests/qtest/libqtest.c, line 207.
> 
> (test program exited with status code -6)
> ――
> 
> It seems to be fairly consistent. Any ideas what it might be?
> Maybe the QEMU process has already exited before the test binary
> gets round to querying the status ?

Yes, it sounds like it. Can you get a backtrace to figure out which test
it was in/where it was up to when it died?

Dave

> thanks
> -- PMM
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: migration qtest failure: "query-migrate shows failed migration: Unable to write to socket: Broken pipe"

2022-12-15 Thread Peter Maydell
On Thu, 15 Dec 2022 at 11:40, Dr. David Alan Gilbert
 wrote:
>
> * Peter Maydell (peter.mayd...@linaro.org) wrote:
> > Hi; I see this migration qtest failure on my x86 macos box:
> >
> >
> > ▶  32/591 
> > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > assertion failed: (!g_str_equal(status, "failed")) ERROR
> >  32/591 qemu:qtest+qtest-aarch64 / qtest-aarch64/migration-test
> >ERROR  152.27s   killed by signal 6 SIGABRT
> > ― ✀  
> > ―
> > stderr:
> > query-migrate shows failed migration: Unable to write to socket: Broken pipe
> > **
> > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > assertion failed: (!g_str_equal(status, "failed"))
> > Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
> > ../../tests/qtest/libqtest.c, line 207.
> >
> > (test program exited with status code -6)
> > ――
> >
> > and similarly:
> >
> > ▶  34/591 
> > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > assertion failed: (!g_str_equal(status, "failed")) ERROR
> >  34/591 qemu:qtest+qtest-i386 / qtest-i386/migration-test
> >ERROR  169.44s   killed by signal 6 SIGABRT
> > ― ✀  
> > ―
> > stderr:
> > query-migrate shows failed migration: Unable to write to socket: Broken pipe
> > **
> > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > assertion failed: (!g_str_equal(status, "failed"))
> > Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
> > ../../tests/qtest/libqtest.c, line 207.
> >
> > (test program exited with status code -6)
> > ――
> >
> > It seems to be fairly consistent. Any ideas what it might be?
> > Maybe the QEMU process has already exited before the test binary
> > gets round to querying the status ?
>
> Yes, it sounds like it. Can you get a backtrace to figure out which test
> it was in/where it was up to when it died?

The logfile says it had just done
ok 23 /aarch64/migration/multifd/tcp/plain/none
so I think the one it was in the middle of when it failed was
/aarch64/migration/multifd/tcp/plain/cancel.
Similarly the log suggests the x86 failure was for
/i386/migration/multifd/tcp/plain/cancel.

It doesn't seem to repro when running manually; my guess is that
it happens because the machine is heavily loaded doing the
whole build-and-test cycle.

thanks
-- PMM


[PULL v2 00/51] Block layer patches

2022-12-15 Thread Kevin Wolf
The following changes since commit 5204b499a6cae4dfd9fe762d5e6e82224892383b:

  mailmap: Fix Stefan Weil author email (2022-12-13 15:56:57 -0500)

are available in the Git repository at:

  https://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 347fe9e156a3e00c40ae1802978276a1f7d5545f:

  block: GRAPH_RDLOCK for functions only called by co_wrappers (2022-12-15 10:11:45 +0100)

v2:
- Changed TSA capability name to "mutex" to work with older clang
  versions. The tsan-build CI job succeeds now (a short demo of the
  annotations follows).
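
As a taste of what these annotations check, a minimal standalone demo
(names hypothetical; build with: clang -c -Wthread-safety demo.c):

    #include <pthread.h>

    struct __attribute__((capability("mutex"))) GraphLock {
        pthread_mutex_t m;
    };

    extern struct GraphLock graph_lock;

    void rdlock(void)
        __attribute__((acquire_shared_capability(graph_lock)));
    void rdunlock(void)
        __attribute__((release_shared_capability(graph_lock)));

    /* Compile-time contract: caller must hold the lock (shared). */
    void walk_graph(void)
        __attribute__((requires_shared_capability(graph_lock)));

    void ok(void)
    {
        rdlock();
        walk_graph();   /* fine: lock held on every path */
        rdunlock();
    }

Calling walk_graph() without taking the lock first would make clang
emit a -Wthread-safety warning at compile time.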


Block layer patches

- Code cleanups around block graph modification
- Simplify drain
- coroutine_fn correctness fixes, including splitting generated
  coroutine wrappers into co_wrapper (to be called only from
  non-coroutine context) and co_wrapper_mixed (both coroutine and
  non-coroutine context; see the sketch after this list)
- Introduce a block graph rwlock
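
A toy sketch of the calling convention behind the co_wrapper split,
with QEMU's coroutine machinery stubbed out (function names are
hypothetical):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool in_coroutine;       /* stand-in for qemu_in_coroutine() */

    static int bdrv_co_do_thing(void)   /* the coroutine_fn body */
    {
        return 42;
    }

    /* co_wrapper_mixed: valid from both contexts. */
    static int bdrv_do_thing_mixed(void)
    {
        if (in_coroutine) {
            return bdrv_co_do_thing();  /* already in coroutine: direct */
        }
        /* Outside coroutine context the generated wrapper creates a
         * coroutine and polls until it finishes; a direct call here
         * only keeps this sketch self-contained. */
        return bdrv_co_do_thing();
    }

    /* co_wrapper: only valid outside coroutine context. */
    static int bdrv_do_thing(void)
    {
        assert(!in_coroutine);
        return bdrv_co_do_thing();      /* create-and-poll in QEMU */
    }

    int main(void)
    {
        printf("%d %d\n", bdrv_do_thing_mixed(), bdrv_do_thing());
        return 0;
    }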


Emanuele Giuseppe Esposito (21):
  block-io: introduce coroutine_fn duplicates for bdrv_common_block_status_above callers
  block-copy: add coroutine_fn annotations
  nbd/server.c: add coroutine_fn annotations
  block-backend: replace bdrv_*_above with blk_*_above
  block/vmdk: add coroutine_fn annotations
  block: avoid duplicating filename string in bdrv_create
  block: distinguish between bdrv_create running in coroutine and not
  block: bdrv_create_file is a coroutine_fn
  block: rename generated_co_wrapper in co_wrapper_mixed
  block-coroutine-wrapper.py: introduce co_wrapper
  block-coroutine-wrapper.py: support functions without bs arg
  block-coroutine-wrapper.py: support also basic return types
  block: convert bdrv_create to co_wrapper
  block/dirty-bitmap: convert coroutine-only functions to co_wrapper
  graph-lock: Implement guard macros
  async: Register/unregister aiocontext in graph lock list
  block: wrlock in bdrv_replace_child_noperm
  block: remove unnecessary assert_bdrv_graph_writable()
  block: assert that graph read and writes are performed correctly
  block-coroutine-wrapper.py: introduce annotations that take the graph rdlock
  block: use co_wrapper_mixed_bdrv_rdlock in functions taking the rdlock

Kevin Wolf (25):
  qed: Don't yield in bdrv_qed_co_drain_begin()
  test-bdrv-drain: Don't yield in .bdrv_co_drained_begin/end()
  block: Revert .bdrv_drained_begin/end to non-coroutine_fn
  block: Remove drained_end_counter
  block: Inline bdrv_drain_invoke()
  block: Fix locking for bdrv_reopen_queue_child()
  block: Drain individual nodes during reopen
  block: Don't use subtree drains in bdrv_drop_intermediate()
  stream: Replace subtree drain with a single node drain
  block: Remove subtree drains
  block: Call drain callbacks only once
  block: Remove ignore_bds_parents parameter from drain_begin/end.
  block: Drop out of coroutine in bdrv_do_drained_begin_quiesce()
  block: Don't poll in bdrv_replace_child_noperm()
  block: Remove poll parameter from bdrv_parent_drained_begin_single()
  block: Factor out bdrv_drain_all_begin_nopoll()
  Import clang-tsa.h
  clang-tsa: Add TSA_ASSERT() macro
  clang-tsa: Add macros for shared locks
  configure: Enable -Wthread-safety if present
  test-bdrv-drain: Fix incorrrect drain assumptions
  block: Fix locking in external_snapshot_prepare()
  graph-lock: TSA annotations for lock/unlock functions
  Mark assert_bdrv_graph_readable/writable() GRAPH_RD/WRLOCK
  block: GRAPH_RDLOCK for functions only called by co_wrappers

Paolo Bonzini (1):
  graph-lock: Introduce a lock to protect block graph operations

Vladimir Sementsov-Ogievskiy (4):
  block: Inline bdrv_detach_child()
  block: drop bdrv_remove_filter_or_cow_child
  block: bdrv_refresh_perms(): allow external tran
  block: refactor bdrv_list_refresh_perms to allow any list of nodes

 docs/devel/block-coroutine-wrapper.rst |   6 +-
 configure  |   1 +
 block/block-gen.h  |  11 +-
 block/coroutines.h |  21 +-
 include/block/aio.h|   9 +
 include/block/block-common.h   |  27 ++-
 include/block/block-copy.h |   5 +-
 include/block/block-global-state.h |  15 +-
 include/block/block-io.h   | 136 +--
 include/block/block_int-common.h   |  49 ++--
 include/block/block_int-global-state.h |  17 --
 include/block/block_int-io.h   |  12 -
 include/block/block_int.h  |   1 +
 include/block/dirty-bitmap.h   |  10 +-
 include/block/graph-lock.h | 280 +++
 include/qemu/clang-tsa.h   | 114 ++
 include/sysemu/block-backend-io.h  |  77 ---
 block.c                                | 404 ++

[PATCH Trivial] hw/cxl/cxl-cdat.c: spelling: missmatch

2022-12-15 Thread Michael Tokarev
Introduced by: aba578bdace5303a441f8a37aad781b5cb06f38c

Signed-off-by: Michael Tokarev 
---
 hw/cxl/cxl-cdat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-cdat.c b/hw/cxl/cxl-cdat.c
index 3653aa56f0..137abd0992 100644
--- a/hw/cxl/cxl-cdat.c
+++ b/hw/cxl/cxl-cdat.c
@@ -146,7 +146,7 @@ static void ct3_load_cdat(CDATObject *cdat, Error **errp)
 num_ent++;
 }
 if (i != file_size) {
-error_setg(errp, "CDAT: File length missmatch");
+error_setg(errp, "CDAT: File length mismatch");
 return;
 }
 
-- 
2.30.2




Re: [PATCH 00/14] block: Move more functions to coroutines

2022-12-15 Thread Emanuele Giuseppe Esposito



Am 13/12/2022 um 09:53 schrieb Kevin Wolf:
> This series converts some IO_CODE() functions to coroutine_fn because
> they access the graph and will need to hold the graph lock in the
> future. IO_CODE() functions can be called from iothreads, so taking the
> graph lock requires the function to run in coroutine context.
> 
> Pretty much all of the changes in this series were posted by Emanuele
> before as part of "Protect the block layer with a rwlock: part 3". The
> major difference is that in the old version, the patches did two things
> at once: Converting functions to coroutine_fn, and adding the locking to
> them. This series does only the coroutine conversion. The locking part
> will be in another series which now comes with TSA annotations and makes
> the locking related changes big enough to have separate patches.
> 

Reviewed-by: Emanuele Giuseppe Esposito 

> Emanuele Giuseppe Esposito (14):
>   block-coroutine-wrapper: support void functions
>   block: Convert bdrv_io_plug() to co_wrapper
>   block: Convert bdrv_io_unplug() to co_wrapper
>   block: Rename refresh_total_sectors to bdrv_refresh_total_sectors
>   block: Convert bdrv_refresh_total_sectors() to co_wrapper_mixed
>   block-backend: use bdrv_getlength instead of blk_getlength
>   block: use bdrv_co_refresh_total_sectors when possible
>   block: Convert bdrv_get_allocated_file_size() to co_wrapper
>   block: Convert bdrv_get_info() to co_wrapper_mixed
>   block: Convert bdrv_is_inserted() to co_wrapper
>   block: Convert bdrv_eject() to co_wrapper
>   block: convert bdrv_lock_medium in co_wrapper
>   block: Convert bdrv_debug_event to co_wrapper_mixed
>   block: Rename newly converted BlockDriver IO coroutine functions
> 
>  include/block/block-io.h   | 36 +
>  include/block/block_int-common.h   | 26 ++
>  include/block/block_int-io.h   |  5 +-
>  include/sysemu/block-backend-io.h  | 31 ---
>  block.c| 82 ++
>  block/blkdebug.c   |  4 +-
>  block/blkio.c  |  6 +--
>  block/blklogwrites.c   |  2 +-
>  block/blkreplay.c  |  2 +-
>  block/blkverify.c  |  2 +-
>  block/block-backend.c  | 36 ++---
>  block/commit.c |  4 +-
>  block/copy-on-read.c   | 12 ++---
>  block/crypto.c |  6 +--
>  block/curl.c   |  8 +--
>  block/file-posix.c | 48 -
>  block/file-win32.c | 12 ++---
>  block/filter-compress.c| 10 ++--
>  block/gluster.c| 16 +++---
>  block/io.c | 76 +--
>  block/iscsi.c  |  8 +--
>  block/mirror.c |  6 +--
>  block/nbd.c|  6 +--
>  block/nfs.c|  2 +-
>  block/null.c   |  8 +--
>  block/nvme.c   |  6 +--
>  block/preallocate.c|  2 +-
>  block/qcow.c   |  2 +-
>  block/qcow2-refcount.c |  2 +-
>  block/qcow2.c  |  6 +--
>  block/qed.c|  4 +-
>  block/quorum.c |  2 +-
>  block/raw-format.c | 14 ++---
>  block/rbd.c|  4 +-
>  block/replication.c|  2 +-
>  block/ssh.c|  2 +-
>  block/throttle.c   |  2 +-
>  block/vdi.c|  2 +-
>  block/vhdx.c   |  2 +-
>  block/vmdk.c   |  4 +-
>  block/vpc.c|  2 +-
>  blockdev.c |  8 ++-
>  hw/scsi/scsi-disk.c|  5 ++
>  tests/unit/test-block-iothread.c   |  3 ++
>  scripts/block-coroutine-wrapper.py | 20 ++--
>  block/meson.build  |  1 +
>  46 files changed, 316 insertions(+), 233 deletions(-)
> 




Re: [PATCH v1 17/24] vfio-user: dma map/unmap operations

2022-12-15 Thread Cédric Le Goater

On 11/9/22 00:13, John Johnson wrote:

Add ability to do async operations during memory transactions

Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: John G Johnson 
---
  hw/vfio/common.c              |  63 +---
  hw/vfio/user-protocol.h       |  32 ++
  hw/vfio/user.c                | 220 ++
  include/hw/vfio/vfio-common.h |   9 +-
  4 files changed, 308 insertions(+), 16 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index e73a772..fe6eddd 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -507,7 +507,7 @@ static int vfio_dma_unmap(VFIOContainer *container,
  return CONT_DMA_UNMAP(container, &unmap, NULL);
  }
  
-static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
+static int vfio_dma_map(VFIOContainer *container, MemoryRegion *mr, hwaddr iova,
  ram_addr_t size, void *vaddr, bool readonly)
  {
  struct vfio_iommu_type1_dma_map map = {
@@ -523,7 +523,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
  map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
  }
  
-ret = CONT_DMA_MAP(container, &map);
+ret = CONT_DMA_MAP(container, mr, &map);
  
  if (ret < 0) {

  error_report("VFIO_MAP_DMA failed: %s", strerror(-ret));
@@ -586,7 +586,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
  
  /* Called with rcu_read_lock held.  */

  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
-   ram_addr_t *ram_addr, bool *read_only)
+   ram_addr_t *ram_addr, bool *read_only,
+   MemoryRegion **mrp)
  {


This needs a small update. A memory_get_xlat_addr() routine was introduced 
since.


  MemoryRegion *mr;
  hwaddr xlat;
@@ -667,6 +668,10 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
  *read_only = !writable || mr->readonly;
  }
  
+if (mrp != NULL) {
+*mrp = mr;
+}
+
  return true;
  }
  
@@ -674,6 +679,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
  {
  VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
  VFIOContainer *container = giommu->container;
+MemoryRegion *mr;
  hwaddr iova = iotlb->iova + giommu->iommu_offset;
  void *vaddr;
  int ret;
@@ -692,7 +698,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
  if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
  bool read_only;
  
-if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
+if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mr)) {
  goto out;
  }
  /*
@@ -702,14 +708,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
   * of vaddr will always be there, even if the memory object is
   * destroyed and its backing memory munmap-ed.
   */
-ret = vfio_dma_map(container, iova,
+ret = vfio_dma_map(container, mr, iova,
 iotlb->addr_mask + 1, vaddr,
 read_only);
  if (ret) {
  error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx", %p) = %d (%m)",
+ "0x%"HWADDR_PRIx", %p)",
   container, iova,
- iotlb->addr_mask + 1, vaddr, ret);
+ iotlb->addr_mask + 1, vaddr);
  }
  } else {
  ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
@@ -764,7 +770,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
 section->offset_within_address_space;
  vaddr = memory_region_get_ram_ptr(section->mr) + start;
  
-ret = vfio_dma_map(vrdl->container, iova, next - start,
+ret = vfio_dma_map(vrdl->container, section->mr, iova, next - start,
 vaddr, section->readonly);
  if (ret) {
  /* Rollback */
@@ -888,6 +894,29 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
  return true;
  }
  
+static void vfio_listener_begin(MemoryListener *listener)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+/*
+ * When DMA space is the physical address space,
+ * the region add/del listeners will fire during
+ * memory update transactions.  These depend on BQL
+ * being held, so do any resulting map/demap ops async
+ * while keeping BQL.
+ */
+container->async_ops = true;
+}
+
+static void vfio_listener_commit(MemoryListener *listener)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+/* wait here for any async requests sent during the transaction */
+CONT_WAIT_COMMIT(container);
+container->async_ops = false;

[PULL 13/29] target/arm: Implement HCR_EL2.TTLBOS traps

2022-12-15 Thread Peter Maydell
For FEAT_EVT, the HCR_EL2.TTLBOS bit allows trapping on EL1
use of TLB maintenance instructions that operate on the
outer shareable domain:

TLBI VMALLE1OS, TLBI VAE1OS, TLBI ASIDE1OS,TLBI VAAE1OS,
TLBI VALE1OS, TLBI VAALE1OS, TLBI RVAE1OS, TLBI RVAAE1OS,
TLBI RVALE1OS, and TLBI RVAALE1OS.

(There are no AArch32 outer-shareable TLB maintenance ops.)

Implement the trapping.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
---
 target/arm/helper.c | 33 +++--
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 475b48750e9..0ec1c3ffbd6 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -373,6 +373,19 @@ static CPAccessResult access_ttlbis(CPUARMState *env, const ARMCPRegInfo *ri,
 return CP_ACCESS_OK;
 }
 
+#ifdef TARGET_AARCH64
+/* Check for traps from EL1 due to HCR_EL2.TTLB or TTLBOS. */
+static CPAccessResult access_ttlbos(CPUARMState *env, const ARMCPRegInfo *ri,
+bool isread)
+{
+if (arm_current_el(env) == 1 &&
+(arm_hcr_el2_eff(env) & (HCR_TTLB | HCR_TTLBOS))) {
+return CP_ACCESS_TRAP_EL2;
+}
+return CP_ACCESS_OK;
+}
+#endif
+
 static void dacr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
 ARMCPU *cpu = env_archcpu(env);
@@ -6753,19 +6766,19 @@ static const ARMCPRegInfo tlbirange_reginfo[] = {
   .writefn = tlbi_aa64_rvae1is_write },
 { .name = "TLBI_RVAE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 1,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_rvae1is_write },
 { .name = "TLBI_RVAAE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 3,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_rvae1is_write },
{ .name = "TLBI_RVALE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 5,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_rvae1is_write },
 { .name = "TLBI_RVAALE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 5, .opc2 = 7,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_rvae1is_write },
 { .name = "TLBI_RVAE1", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 6, .opc2 = 1,
@@ -6852,27 +6865,27 @@ static const ARMCPRegInfo tlbirange_reginfo[] = {
 static const ARMCPRegInfo tlbios_reginfo[] = {
 { .name = "TLBI_VMALLE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 1, .opc2 = 0,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vmalle1is_write },
 { .name = "TLBI_VAE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 1, .opc2 = 1,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_ASIDE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 1, .opc2 = 2,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vmalle1is_write },
 { .name = "TLBI_VAAE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 1, .opc2 = 3,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_VALE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 1, .opc2 = 5,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_VAALE1OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 1, .opc2 = 7,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbos, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_ALLE2OS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 1, .opc2 = 0,
-- 
2.25.1




[PULL 09/29] target/arm: Add Cortex-A55 CPU

2022-12-15 Thread Peter Maydell
From: Timofey Kutergin 

The Cortex-A55 is one of the newer armv8.2+ CPUs; in particular
it supports the Privileged Access Never (PAN) feature. Add
a model of this CPU, so you can use a CPU type on the virt
board that models a specific real hardware CPU, rather than
having to use the QEMU-specific "max" CPU type.
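
For instance, with this applied one can run (remaining options as usual
for the virt board; the elided parts are placeholders):

  qemu-system-aarch64 -machine virt -cpu cortex-a55 -m 1G ...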

Signed-off-by: Timofey Kutergin 
Message-id: 20221121150819.2782817-1-tkuter...@gmail.com
[PMM: tweaked commit message]
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 docs/system/arm/virt.rst |  1 +
 hw/arm/virt.c|  1 +
 target/arm/cpu64.c   | 69 
 3 files changed, 71 insertions(+)

diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst
index 188a4f211f4..1cab33f02e3 100644
--- a/docs/system/arm/virt.rst
+++ b/docs/system/arm/virt.rst
@@ -54,6 +54,7 @@ Supported guest CPU types:
 - ``cortex-a15`` (32-bit; the default)
 - ``cortex-a35`` (64-bit)
 - ``cortex-a53`` (64-bit)
+- ``cortex-a55`` (64-bit)
 - ``cortex-a57`` (64-bit)
 - ``cortex-a72`` (64-bit)
 - ``cortex-a76`` (64-bit)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index bf59784aefa..a2dd48dfb80 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -207,6 +207,7 @@ static const char *valid_cpus[] = {
 ARM_CPU_TYPE_NAME("cortex-a15"),
 ARM_CPU_TYPE_NAME("cortex-a35"),
 ARM_CPU_TYPE_NAME("cortex-a53"),
+ARM_CPU_TYPE_NAME("cortex-a55"),
 ARM_CPU_TYPE_NAME("cortex-a57"),
 ARM_CPU_TYPE_NAME("cortex-a72"),
 ARM_CPU_TYPE_NAME("cortex-a76"),
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 3d74f134f57..cec64471b4e 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -792,6 +792,74 @@ static void aarch64_a53_initfn(Object *obj)
 define_cortex_a72_a57_a53_cp_reginfo(cpu);
 }
 
+static void aarch64_a55_initfn(Object *obj)
+{
+ARMCPU *cpu = ARM_CPU(obj);
+
+cpu->dtb_compatible = "arm,cortex-a55";
+set_feature(&cpu->env, ARM_FEATURE_V8);
+set_feature(&cpu->env, ARM_FEATURE_NEON);
+set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER);
+set_feature(&cpu->env, ARM_FEATURE_AARCH64);
+set_feature(&cpu->env, ARM_FEATURE_CBAR_RO);
+set_feature(&cpu->env, ARM_FEATURE_EL2);
+set_feature(&cpu->env, ARM_FEATURE_EL3);
+set_feature(&cpu->env, ARM_FEATURE_PMU);
+
+/* Ordered by B2.4 AArch64 registers by functional group */
+cpu->clidr = 0x8223;
+cpu->ctr = 0x84448004; /* L1Ip = VIPT */
+cpu->dcz_blocksize = 4; /* 64 bytes */
+cpu->isar.id_aa64dfr0  = 0x10305408ull;
+cpu->isar.id_aa64isar0 = 0x100010211120ull;
+cpu->isar.id_aa64isar1 = 0x0011ull;
+cpu->isar.id_aa64mmfr0 = 0x00101122ull;
+cpu->isar.id_aa64mmfr1 = 0x10212122ull;
+cpu->isar.id_aa64mmfr2 = 0x1011ull;
+cpu->isar.id_aa64pfr0  = 0x1011ull;
+cpu->isar.id_aa64pfr1  = 0x0010ull;
+cpu->id_afr0   = 0x00000000;
+cpu->isar.id_dfr0  = 0x04010088;
+cpu->isar.id_isar0 = 0x02101110;
+cpu->isar.id_isar1 = 0x13112111;
+cpu->isar.id_isar2 = 0x21232042;
+cpu->isar.id_isar3 = 0x01112131;
+cpu->isar.id_isar4 = 0x00011142;
+cpu->isar.id_isar5 = 0x01011121;
+cpu->isar.id_isar6 = 0x0010;
+cpu->isar.id_mmfr0 = 0x10201105;
+cpu->isar.id_mmfr1 = 0x4000;
+cpu->isar.id_mmfr2 = 0x0126;
+cpu->isar.id_mmfr3 = 0x02122211;
+cpu->isar.id_mmfr4 = 0x00021110;
+cpu->isar.id_pfr0  = 0x10010131;
+cpu->isar.id_pfr1  = 0x00011011;
+cpu->isar.id_pfr2  = 0x0011;
+cpu->midr = 0x412FD050;  /* r2p0 */
+cpu->revidr = 0;
+
+/* From B2.23 CCSIDR_EL1 */
+cpu->ccsidr[0] = 0x700fe01a; /* 32KB L1 dcache */
+cpu->ccsidr[1] = 0x200fe01a; /* 32KB L1 icache */
+cpu->ccsidr[2] = 0x703fe07a; /* 512KB L2 cache */
+
+/* From B2.96 SCTLR_EL3 */
+cpu->reset_sctlr = 0x30c50838;
+
+/* From B4.45 ICH_VTR_EL2 */
+cpu->gic_num_lrs = 4;
+cpu->gic_vpribits = 5;
+cpu->gic_vprebits = 5;
+cpu->gic_pribits = 5;
+
+cpu->isar.mvfr0 = 0x10110222;
+cpu->isar.mvfr1 = 0x1321;
+cpu->isar.mvfr2 = 0x0043;
+
+/* From D5.4 AArch64 PMU register summary */
+cpu->isar.reset_pmcr_el0 = 0x410b3000;
+}
+
 static void aarch64_a72_initfn(Object *obj)
 {
 ARMCPU *cpu = ARM_CPU(obj);
@@ -1243,6 +1311,7 @@ static const ARMCPUInfo aarch64_cpus[] = {
 { .name = "cortex-a35", .initfn = aarch64_a35_initfn },
 { .name = "cortex-a57", .initfn = aarch64_a57_initfn },
 { .name = "cortex-a53", .initfn = aarch64_a53_initfn },
+{ .name = "cortex-a55", .initfn = aarch64_a55_initfn },
 { .name = "cortex-a72", .initfn = aarch64_a72_initfn },
 { .name = "cortex-a76", .initfn = aarch64_a76_initfn },
 { .name = "a64fx",  .initfn = aarch64_a64fx_initfn },
-- 
2.25.1




[PULL 19/29] hw/intc: Convert TYPE_ARM_GIC_COMMON to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_ARM_GIC_COMMON device to 3-phase reset.  This is a
simple no-behaviour-change conversion.

Signed-off-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Message-id: 20221109161444.3397405-4-peter.mayd...@linaro.org
---
 hw/intc/arm_gic_common.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/intc/arm_gic_common.c b/hw/intc/arm_gic_common.c
index 7b44d5625b6..a379cea3959 100644
--- a/hw/intc/arm_gic_common.c
+++ b/hw/intc/arm_gic_common.c
@@ -261,9 +261,9 @@ static inline void arm_gic_common_reset_irq_state(GICState *s, int first_cpu,
 }
 }
 
-static void arm_gic_common_reset(DeviceState *dev)
+static void arm_gic_common_reset_hold(Object *obj)
 {
-GICState *s = ARM_GIC_COMMON(dev);
+GICState *s = ARM_GIC_COMMON(obj);
 int i, j;
 int resetprio;
 
@@ -364,9 +364,10 @@ static Property arm_gic_common_properties[] = {
 static void arm_gic_common_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 ARMLinuxBootIfClass *albifc = ARM_LINUX_BOOT_IF_CLASS(klass);
 
-dc->reset = arm_gic_common_reset;
+rc->phases.hold = arm_gic_common_reset_hold;
 dc->realize = arm_gic_common_realize;
 device_class_set_props(dc, arm_gic_common_properties);
 dc->vmsd = &vmstate_gic;
-- 
2.25.1




[PULL 14/29] target/arm: Implement HCR_EL2.TICAB,TOCU traps

2022-12-15 Thread Peter Maydell
For FEAT_EVT, the HCR_EL2.TICAB bit allows trapping of the ICIALLUIS
and IC IALLUIS cache maintenance instructions.

The HCR_EL2.TOCU bit traps all the other cache maintenance
instructions that operate to the point of unification:
 AArch64 IC IVAU, IC IALLU, DC CVAU
 AArch32 ICIMVAU, ICIALLU, DCCMVAU

The two trap bits between them cover all of the cache maintenance
instructions which must also check the HCR_TPU flag.  Turn the old
aa64_cacheop_pou_access() function into a helper function which takes
the set of HCR_EL2 flags to check as an argument, and call it from
new access_ticab() and access_tocu() functions as appropriate for
each cache op.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
---
 target/arm/helper.c | 36 +++-
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 0ec1c3ffbd6..eee95a42f7f 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4273,9 +4273,7 @@ static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
 return CP_ACCESS_OK;
 }
 
-static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
-  const ARMCPRegInfo *ri,
-  bool isread)
+static CPAccessResult do_cacheop_pou_access(CPUARMState *env, uint64_t hcrflags)
 {
 /* Cache invalidate/clean to Point of Unification... */
 switch (arm_current_el(env)) {
@@ -4286,8 +4284,8 @@ static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
 }
 /* fall through */
 case 1:
-/* ... EL1 must trap to EL2 if HCR_EL2.TPU is set.  */
-if (arm_hcr_el2_eff(env) & HCR_TPU) {
+/* ... EL1 must trap to EL2 if relevant HCR_EL2 flags are set.  */
+if (arm_hcr_el2_eff(env) & hcrflags) {
 return CP_ACCESS_TRAP_EL2;
 }
 break;
@@ -4295,6 +4293,18 @@ static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
 return CP_ACCESS_OK;
 }
 
+static CPAccessResult access_ticab(CPUARMState *env, const ARMCPRegInfo *ri,
+   bool isread)
+{
+return do_cacheop_pou_access(env, HCR_TICAB | HCR_TPU);
+}
+
+static CPAccessResult access_tocu(CPUARMState *env, const ARMCPRegInfo *ri,
+  bool isread)
+{
+return do_cacheop_pou_access(env, HCR_TOCU | HCR_TPU);
+}
+
 /* See: D4.7.2 TLB maintenance requirements and the TLB maintenance instructions
  * Page D4-1736 (DDI0487A.b)
  */
@@ -4935,15 +4945,15 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 { .name = "IC_IALLUIS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
   .access = PL1_W, .type = ARM_CP_NOP,
-  .accessfn = aa64_cacheop_pou_access },
+  .accessfn = access_ticab },
 { .name = "IC_IALLU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 5, .opc2 = 0,
   .access = PL1_W, .type = ARM_CP_NOP,
-  .accessfn = aa64_cacheop_pou_access },
+  .accessfn = access_tocu },
 { .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
   .access = PL0_W, .type = ARM_CP_NOP,
-  .accessfn = aa64_cacheop_pou_access },
+  .accessfn = access_tocu },
 { .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 1,
   .access = PL1_W, .accessfn = aa64_cacheop_poc_access,
@@ -4961,7 +4971,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 { .name = "DC_CVAU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 11, .opc2 = 1,
   .access = PL0_W, .type = ARM_CP_NOP,
-  .accessfn = aa64_cacheop_pou_access },
+  .accessfn = access_tocu },
 { .name = "DC_CIVAC", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 1,
   .access = PL0_W, .type = ARM_CP_NOP,
@@ -5138,13 +5148,13 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
   .writefn = tlbiipas2is_hyp_write },
 /* 32 bit cache operations */
 { .name = "ICIALLUIS", .cp = 15, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
-  .type = ARM_CP_NOP, .access = PL1_W, .accessfn = aa64_cacheop_pou_access },
+  .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_ticab },
 { .name = "BPIALLUIS", .cp = 15, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 6,
   .type = ARM_CP_NOP, .access = PL1_W },
 { .name = "ICIALLU", .cp = 15, .opc1 = 0, .crn = 7, .crm = 5, .opc2 = 0,
-  .type = ARM_CP_NOP, .access = PL1_W, .accessfn = aa64_cacheop_pou_access },
+  .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tocu },
 { .name = "ICIMVAU", .cp = 15, .opc1 = 0, .crn = 7, .crm = 5, .opc2 = 1,
-  .type = ARM_CP_NOP, .access = PL1_W, .accessfn = aa64_cacheop_pou_access },
+  .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tocu },
 { .name = "BPIALL", .cp = 15, .opc1 = 0, .

[PULL 11/29] target/arm: Allow relevant HCR bits to be written for FEAT_EVT

2022-12-15 Thread Peter Maydell
FEAT_EVT adds five new bits to the HCR_EL2 register: TTLBIS, TTLBOS,
TICAB, TOCU and TID4.  These allow the guest to enable trapping of
various EL1 instructions to EL2.  In this commit, add the necessary
code to allow the guest to set these bits if the feature is present;
because the bits are always zero when the feature isn't present, we
won't need explicit feature checks in the "trap on condition"
tests in the following commits.

Note that although full implementation of the feature (mandatory from
Armv8.5 onward) requires all five trap bits, the ID registers permit
a value indicating that only TICAB, TOCU and TID4 are implemented,
which might be the case for CPUs between Armv8.2 and Armv8.5.
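
The effect on HCR_EL2 writes can be sketched standalone. The bit
positions below follow the Arm ARM and are an assumption of this
sketch, not taken from the patch:

    #include <inttypes.h>
    #include <stdio.h>

    #define HCR_TID4   (1ULL << 49)
    #define HCR_TICAB  (1ULL << 50)
    #define HCR_TOCU   (1ULL << 52)
    #define HCR_TTLBIS (1ULL << 54)
    #define HCR_TTLBOS (1ULL << 55)

    int main(void)
    {
        uint64_t valid_mask = 0;    /* baseline mask elided */
        int evt = 1;                /* 0, 1 (partial) or 2 (full EVT) */

        if (evt >= 2) {
            valid_mask |= HCR_TTLBIS | HCR_TTLBOS | HCR_TICAB |
                          HCR_TOCU | HCR_TID4;
        } else if (evt == 1) {
            valid_mask |= HCR_TICAB | HCR_TOCU | HCR_TID4;
        }

        uint64_t guest_write = HCR_TTLBOS | HCR_TICAB;
        uint64_t effective = guest_write & valid_mask; /* RES0 cleared */

        /* With evt == 1 only TICAB survives; TTLBOS reads as zero. */
        printf("0x%" PRIx64 "\n", effective);
        return 0;
    }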

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
---
 target/arm/cpu.h| 30 ++
 target/arm/helper.c |  6 ++
 2 files changed, 36 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 9aeed3c8481..2b4bd20f9d0 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3757,6 +3757,16 @@ static inline bool isar_feature_aa32_tts2uxn(const ARMISARegisters *id)
 return FIELD_EX32(id->id_mmfr4, ID_MMFR4, XNX) != 0;
 }
 
+static inline bool isar_feature_aa32_half_evt(const ARMISARegisters *id)
+{
+return FIELD_EX32(id->id_mmfr4, ID_MMFR4, EVT) >= 1;
+}
+
+static inline bool isar_feature_aa32_evt(const ARMISARegisters *id)
+{
+return FIELD_EX32(id->id_mmfr4, ID_MMFR4, EVT) >= 2;
+}
+
 static inline bool isar_feature_aa32_dit(const ARMISARegisters *id)
 {
 return FIELD_EX32(id->id_pfr0, ID_PFR0, DIT) != 0;
@@ -4029,6 +4039,16 @@ static inline bool isar_feature_aa64_ids(const ARMISARegisters *id)
 return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR2, IDS) != 0;
 }
 
+static inline bool isar_feature_aa64_half_evt(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR2, EVT) >= 1;
+}
+
+static inline bool isar_feature_aa64_evt(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR2, EVT) >= 2;
+}
+
 static inline bool isar_feature_aa64_bti(const ARMISARegisters *id)
 {
 return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, BT) != 0;
@@ -4313,6 +4333,16 @@ static inline bool isar_feature_any_ras(const ARMISARegisters *id)
 return isar_feature_aa64_ras(id) || isar_feature_aa32_ras(id);
 }
 
+static inline bool isar_feature_any_half_evt(const ARMISARegisters *id)
+{
+return isar_feature_aa64_half_evt(id) || isar_feature_aa32_half_evt(id);
+}
+
+static inline bool isar_feature_any_evt(const ARMISARegisters *id)
+{
+return isar_feature_aa64_evt(id) || isar_feature_aa32_evt(id);
+}
+
 /*
  * Forward to the above feature tests given an ARMCPU pointer.
  */
diff --git a/target/arm/helper.c b/target/arm/helper.c
index d8c8223ec38..751c360ce45 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -5267,6 +5267,12 @@ static void do_hcr_write(CPUARMState *env, uint64_t value, uint64_t valid_mask)
 }
 }
 
+if (cpu_isar_feature(any_evt, cpu)) {
+valid_mask |= HCR_TTLBIS | HCR_TTLBOS | HCR_TICAB | HCR_TOCU | HCR_TID4;
+} else if (cpu_isar_feature(any_half_evt, cpu)) {
+valid_mask |= HCR_TICAB | HCR_TOCU | HCR_TID4;
+}
+
 /* Clear RES0 bits.  */
 value &= valid_mask;
 
-- 
2.25.1




[PULL 08/29] hw/arm/virt: build SMBIOS 19 table

2022-12-15 Thread Peter Maydell
From: Mihai Carabas 

Use the base_memmap to build the SMBIOS 19 table which provides the address
mapping for a Physical Memory Array (from spec [1] chapter 7.20).

This was present on i386 from commit c97294ec1b9e36887e119589d456557d72ab37b5
("SMBIOS: Build aggregate smbios tables and entry point").

[1] https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.5.0.pdf

The absence of this table is a breach of the specs and is
detected by the FirmwareTestSuite (FWTS), but it doesn't
cause any known problems for guest OSes.

Signed-off-by: Mihai Carabas 
Message-id: 1668789029-5432-1-git-send-email-mihai.cara...@oracle.com
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 hw/arm/virt.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 0acb71be962..bf59784aefa 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1614,9 +1614,11 @@ static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size)
 static void virt_build_smbios(VirtMachineState *vms)
 {
 MachineClass *mc = MACHINE_GET_CLASS(vms);
+MachineState *ms = MACHINE(vms);
 VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms);
 uint8_t *smbios_tables, *smbios_anchor;
 size_t smbios_tables_len, smbios_anchor_len;
+struct smbios_phys_mem_area mem_array;
 const char *product = "QEMU Virtual Machine";
 
 if (kvm_enabled()) {
@@ -1627,7 +1629,11 @@ static void virt_build_smbios(VirtMachineState *vms)
 vmc->smbios_old_sys_ver ? "1.0" : mc->name, false,
 true, SMBIOS_ENTRY_POINT_TYPE_64);
 
-smbios_get_tables(MACHINE(vms), NULL, 0,
+/* build the array of physical mem area from base_memmap */
+mem_array.address = vms->memmap[VIRT_MEM].base;
+mem_array.length = ms->ram_size;
+
+smbios_get_tables(ms, &mem_array, 1,
   &smbios_tables, &smbios_tables_len,
   &smbios_anchor, &smbios_anchor_len,
   &error_fatal);
-- 
2.25.1




[PULL 03/29] hw/arm/virt: Introduce variable region_base in virt_set_high_memmap()

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

This introduces variable 'region_base' for the base address of the
specific high memory region. It's the preparatory work to optimize
high memory region address assignment.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Reviewed-by: Marc Zyngier 
Tested-by: Zhenyu Zhang 
Message-id: 20221029224307.138822-4-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 hw/arm/virt.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 2659f4db15c..3bb1bf079ff 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1693,15 +1693,15 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
 static void virt_set_high_memmap(VirtMachineState *vms,
  hwaddr base, int pa_bits)
 {
-hwaddr region_size;
+hwaddr region_base, region_size;
 bool fits;
 int i;
 
 for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
+region_base = ROUND_UP(base, extended_memmap[i].size);
 region_size = extended_memmap[i].size;
 
-base = ROUND_UP(base, region_size);
-vms->memmap[i].base = base;
+vms->memmap[i].base = region_base;
 vms->memmap[i].size = region_size;
 
 /*
@@ -1710,9 +1710,9 @@ static void virt_set_high_memmap(VirtMachineState *vms,
  *
  * For each device that doesn't fit, disable it.
  */
-fits = (base + region_size) <= BIT_ULL(pa_bits);
+fits = (region_base + region_size) <= BIT_ULL(pa_bits);
 if (fits) {
-vms->highest_gpa = base + region_size - 1;
+vms->highest_gpa = region_base + region_size - 1;
 }
 
 switch (i) {
@@ -1727,7 +1727,7 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 break;
 }
 
-base += region_size;
+base = region_base + region_size;
 }
 }
 
-- 
2.25.1




[PULL 25/29] hw/intc: Convert TYPE_KVM_ARM_ITS to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_KVM_ARM_ITS device to 3-phase reset.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221109161444.3397405-10-peter.mayd...@linaro.org
---
 hw/intc/arm_gicv3_its_kvm.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hw/intc/arm_gicv3_its_kvm.c b/hw/intc/arm_gicv3_its_kvm.c
index 529c7bd4946..7eda9fb86ea 100644
--- a/hw/intc/arm_gicv3_its_kvm.c
+++ b/hw/intc/arm_gicv3_its_kvm.c
@@ -37,7 +37,7 @@ DECLARE_OBJ_CHECKERS(GICv3ITSState, KVMARMITSClass,
 
 struct KVMARMITSClass {
 GICv3ITSCommonClass parent_class;
-void (*parent_reset)(DeviceState *dev);
+ResettablePhases parent_phases;
 };
 
 
@@ -197,13 +197,15 @@ static void kvm_arm_its_post_load(GICv3ITSState *s)
   GITS_CTLR, &s->ctlr, true, &error_abort);
 }
 
-static void kvm_arm_its_reset(DeviceState *dev)
+static void kvm_arm_its_reset_hold(Object *obj)
 {
-GICv3ITSState *s = ARM_GICV3_ITS_COMMON(dev);
+GICv3ITSState *s = ARM_GICV3_ITS_COMMON(obj);
 KVMARMITSClass *c = KVM_ARM_ITS_GET_CLASS(s);
 int i;
 
-c->parent_reset(dev);
+if (c->parent_phases.hold) {
+c->parent_phases.hold(obj);
+}
 
 if (kvm_device_check_attr(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
KVM_DEV_ARM_ITS_CTRL_RESET)) {
@@ -241,12 +243,14 @@ static Property kvm_arm_its_props[] = {
 static void kvm_arm_its_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 GICv3ITSCommonClass *icc = ARM_GICV3_ITS_COMMON_CLASS(klass);
 KVMARMITSClass *ic = KVM_ARM_ITS_CLASS(klass);
 
 dc->realize = kvm_arm_its_realize;
 device_class_set_props(dc, kvm_arm_its_props);
-device_class_set_parent_reset(dc, kvm_arm_its_reset, &ic->parent_reset);
+resettable_class_set_parent_phases(rc, NULL, kvm_arm_its_reset_hold, NULL,
+   &ic->parent_phases);
 icc->send_msi = kvm_its_send_msi;
 icc->pre_save = kvm_arm_its_pre_save;
 icc->post_load = kvm_arm_its_post_load;
-- 
2.25.1




[PULL 06/29] hw/arm/virt: Add 'compact-highmem' property

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

After the improvement to high memory region address assignment is
applied, the memory layout can change, introducing possible migration
breakage. For example, the VIRT_HIGH_PCIE_MMIO memory region is
enabled or disabled depending on whether the optimization is applied,
given the following configuration. Until more properties are added to
let users selectively disable those high memory regions, this
configuration is only achievable by modifying the source code.

  pa_bits  = 40;
  vms->highmem_redists = false;
  vms->highmem_ecam= false;
  vms->highmem_mmio= true;

  # qemu-system-aarch64 -accel kvm -cpu host\
-machine virt-7.2,compact-highmem={on, off} \
-m 4G,maxmem=511G -monitor stdio

  Region             compact-highmem=off         compact-highmem=on
  -----------------------------------------------------------------
  MEM                [1GB         512GB]         [1GB         512GB]
  HIGH_GIC_REDISTS2  [512GB       512GB+64MB]    [disabled]
  HIGH_PCIE_ECAM     [512GB+256MB 512GB+512MB]   [disabled]
  HIGH_PCIE_MMIO     [disabled]                  [512GB       1TB]

To keep backward compatibility, we need to disable the optimization
on machine types virt-7.1 and earlier. It means the optimization is
enabled by default from virt-7.2. Besides, the 'compact-highmem'
property is added so that users can explicitly enable or disable the
optimization on all machine types.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Reviewed-by: Marc Zyngier 
Tested-by: Zhenyu Zhang 
Message-id: 20221029224307.138822-7-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 docs/system/arm/virt.rst |  4 
 include/hw/arm/virt.h|  1 +
 hw/arm/virt.c| 32 
 3 files changed, 37 insertions(+)

diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst
index 20442ea2c13..4454706392c 100644
--- a/docs/system/arm/virt.rst
+++ b/docs/system/arm/virt.rst
@@ -94,6 +94,10 @@ highmem
   address space above 32 bits. The default is ``on`` for machine types
   later than ``virt-2.12``.
 
+compact-highmem
+  Set ``on``/``off`` to enable/disable the compact layout for high memory regions.
+  The default is ``on`` for machine types later than ``virt-7.2``.
+
 gic-version
   Specify the version of the Generic Interrupt Controller (GIC) to provide.
   Valid values are:
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 709f6237412..c7dd59d7f1f 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -125,6 +125,7 @@ struct VirtMachineClass {
 bool no_pmu;
 bool claim_edge_triggered_timers;
 bool smbios_old_sys_ver;
+bool no_highmem_compact;
 bool no_highmem_ecam;
 bool no_ged;   /* Machines < 4.2 have no support for ACPI GED device */
 bool kvm_no_adjvtime;
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 807175707e7..3d1371c05c0 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -174,6 +174,12 @@ static const MemMapEntry base_memmap[] = {
  * Note the extended_memmap is sized so that it eventually also includes the
  * base_memmap entries (VIRT_HIGH_GIC_REDIST2 index is greater than the last
  * index of base_memmap).
+ *
+ * The memory map for these Highmem IO Regions can be in legacy or compact
+ * layout, depending on 'compact-highmem' property. With legacy layout, the
+ * PA space for one specific region is always reserved, even if the region
+ * has been disabled or doesn't fit into the PA space. However, the PA space
+ * for the region won't be reserved in these circumstances with compact layout.
  */
 static MemMapEntry extended_memmap[] = {
 /* Additional 64 MB redist region (can contain up to 512 redistributors) */
@@ -2352,6 +2358,20 @@ static void virt_set_highmem(Object *obj, bool value, Error **errp)
 vms->highmem = value;
 }
 
+static bool virt_get_compact_highmem(Object *obj, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+return vms->highmem_compact;
+}
+
+static void virt_set_compact_highmem(Object *obj, bool value, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+vms->highmem_compact = value;
+}
+
 static bool virt_get_its(Object *obj, Error **errp)
 {
 VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2970,6 +2990,13 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
   "Set on/off to enable/disable using "
   "physical address space above 32 
bits");
 
+object_class_property_add_bool(oc, "compact-highmem",
+   virt_get_compact_highmem,
+   virt_set_compact_highmem);
+object_class_property_set_description(oc, "compact-highmem",
+  "Set on/off to enable/disable 
compact "
+  "layout for high memory regions");
+
 object_clas

[PULL 12/29] target/arm: Implement HCR_EL2.TTLBIS traps

2022-12-15 Thread Peter Maydell
For FEAT_EVT, the HCR_EL2.TTLBIS bit allows trapping on EL1 use of
TLB maintenance instructions that operate on the inner shareable
domain:

AArch64:
 TLBI VMALLE1IS, TLBI VAE1IS, TLBI ASIDE1IS, TLBI VAAE1IS,
 TLBI VALE1IS, TLBI VAALE1IS, TLBI RVAE1IS, TLBI RVAAE1IS,
 TLBI RVALE1IS, and TLBI RVAALE1IS.

AArch32:
 TLBIALLIS, TLBIMVAIS, TLBIASIDIS, TLBIMVAAIS, TLBIMVALIS,
 and TLBIMVAALIS.

Add the trapping support.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
---
 target/arm/helper.c | 43 +++
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 751c360ce45..475b48750e9 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -362,6 +362,17 @@ static CPAccessResult access_ttlb(CPUARMState *env, const ARMCPRegInfo *ri,
 return CP_ACCESS_OK;
 }
 
+/* Check for traps from EL1 due to HCR_EL2.TTLB or TTLBIS. */
+static CPAccessResult access_ttlbis(CPUARMState *env, const ARMCPRegInfo *ri,
+bool isread)
+{
+if (arm_current_el(env) == 1 &&
+(arm_hcr_el2_eff(env) & (HCR_TTLB | HCR_TTLBIS))) {
+return CP_ACCESS_TRAP_EL2;
+}
+return CP_ACCESS_OK;
+}
+
 static void dacr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
 {
 ARMCPU *cpu = env_archcpu(env);
@@ -2206,16 +2217,16 @@ static const ARMCPRegInfo v7_cp_reginfo[] = {
 static const ARMCPRegInfo v7mp_cp_reginfo[] = {
 /* 32 bit TLB invalidates, Inner Shareable */
 { .name = "TLBIALLIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0,
-  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlb,
+  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlbis,
   .writefn = tlbiall_is_write },
 { .name = "TLBIMVAIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 1,
-  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlb,
+  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlbis,
   .writefn = tlbimva_is_write },
 { .name = "TLBIASIDIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 2,
-  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlb,
+  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlbis,
   .writefn = tlbiasid_is_write },
 { .name = "TLBIMVAAIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 3,
-  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlb,
+  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlbis,
   .writefn = tlbimvaa_is_write },
 };
 
@@ -4948,27 +4959,27 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 /* TLBI operations */
 { .name = "TLBI_VMALLE1IS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbis, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vmalle1is_write },
 { .name = "TLBI_VAE1IS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 1,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbis, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_ASIDE1IS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 2,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbis, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vmalle1is_write },
 { .name = "TLBI_VAAE1IS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 3,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbis, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_VALE1IS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 5,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbis, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_VAALE1IS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 7,
-  .access = PL1_W, .accessfn = access_ttlb, .type = ARM_CP_NO_RAW,
+  .access = PL1_W, .accessfn = access_ttlbis, .type = ARM_CP_NO_RAW,
   .writefn = tlbi_aa64_vae1is_write },
 { .name = "TLBI_VMALLE1", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 7, .opc2 = 0,
@@ -5078,10 +5089,10 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 #endif
 /* TLB invalidate last level of translation table walk */
 { .name = "TLBIMVALIS", .cp = 15, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 5,
-  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlb,
+  .type = ARM_CP_NO_RAW, .access = PL1_W, .accessfn = access_ttlbis,

[PULL 28/29] hw/misc: Move some arm-related files from specific_ss into softmmu_ss

2022-12-15 Thread Peter Maydell
From: Thomas Huth 

The header target/arm/kvm-consts.h checks CONFIG_KVM, which is marked as
poisoned in common code, so the files that include this header have to
be added to specific_ss and recompiled for each of qemu-system-arm and
qemu-system-aarch64. However, since the kvm headers are only optionally
used in kvm-consts.h for some sanity checks, we can additionally
check the NEED_CPU_H macro first to avoid the poisoned CONFIG_KVM macro,
so kvm-consts.h can also be used from "common" files (without the
sanity checks - which should be OK since they are still done from other
target-specific files instead). This way, and by adjusting some other
include statements in the related files here and there, we can move some
files from specific_ss into softmmu_ss, so that they only need to be
compiled once during the build process.
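
A minimal sketch of the resulting guard pattern (names mirror the hunk
below; QEMU_BUILD_BUG_ON is QEMU's compile-time assert, which fails the
build when its condition is true):

    /* kvm-consts.h guard pattern, sketched: common (target-independent)
     * objects never define NEED_CPU_H, so they never evaluate the
     * poisoned CONFIG_KVM macro at all.
     */
    #ifdef NEED_CPU_H                 /* target-specific compilation only */
    #ifdef CONFIG_KVM                 /* safe to test here: not poisoned */
    #define MISMATCH_CHECK(X, Y) QEMU_BUILD_BUG_ON(X != Y)
    #endif
    #endif

    #ifndef MISMATCH_CHECK            /* common code: checks compile away */
    #define MISMATCH_CHECK(X, Y) QEMU_BUILD_BUG_ON(0)
    #endif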

Signed-off-by: Thomas Huth 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221202154023.293614-1-th...@redhat.com
Signed-off-by: Peter Maydell 
---
 include/hw/misc/xlnx-zynqmp-apu-ctrl.h |  2 +-
 target/arm/kvm-consts.h|  8 
 hw/misc/imx6_src.c |  2 +-
 hw/misc/iotkit-sysctl.c|  1 -
 hw/misc/meson.build| 11 +--
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/hw/misc/xlnx-zynqmp-apu-ctrl.h b/include/hw/misc/xlnx-zynqmp-apu-ctrl.h
index b8ca9434afb..c3bf3c1583b 100644
--- a/include/hw/misc/xlnx-zynqmp-apu-ctrl.h
+++ b/include/hw/misc/xlnx-zynqmp-apu-ctrl.h
@@ -13,7 +13,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/register.h"
-#include "target/arm/cpu.h"
+#include "target/arm/cpu-qom.h"
 
 #define TYPE_XLNX_ZYNQMP_APU_CTRL "xlnx.apu-ctrl"
 OBJECT_DECLARE_SIMPLE_TYPE(XlnxZynqMPAPUCtrl, XLNX_ZYNQMP_APU_CTRL)
diff --git a/target/arm/kvm-consts.h b/target/arm/kvm-consts.h
index faacf96fdc7..09967ec5e64 100644
--- a/target/arm/kvm-consts.h
+++ b/target/arm/kvm-consts.h
@@ -14,16 +14,16 @@
 #ifndef ARM_KVM_CONSTS_H
 #define ARM_KVM_CONSTS_H
 
+#ifdef NEED_CPU_H
 #ifdef CONFIG_KVM
 #include 
 #include 
-
 #define MISMATCH_CHECK(X, Y) QEMU_BUILD_BUG_ON(X != Y)
+#endif
+#endif
 
-#else
-
+#ifndef MISMATCH_CHECK
 #define MISMATCH_CHECK(X, Y) QEMU_BUILD_BUG_ON(0)
-
 #endif
 
 #define CP_REG_SIZE_SHIFT 52
diff --git a/hw/misc/imx6_src.c b/hw/misc/imx6_src.c
index 7b0e968804a..a9c64d06ebc 100644
--- a/hw/misc/imx6_src.c
+++ b/hw/misc/imx6_src.c
@@ -15,7 +15,7 @@
 #include "qemu/log.h"
 #include "qemu/main-loop.h"
 #include "qemu/module.h"
-#include "arm-powerctl.h"
+#include "target/arm/arm-powerctl.h"
 #include "hw/core/cpu.h"
 
 #ifndef DEBUG_IMX6_SRC
diff --git a/hw/misc/iotkit-sysctl.c b/hw/misc/iotkit-sysctl.c
index 7147e2f84e6..e664215ee67 100644
--- a/hw/misc/iotkit-sysctl.c
+++ b/hw/misc/iotkit-sysctl.c
@@ -30,7 +30,6 @@
 #include "hw/qdev-properties.h"
 #include "hw/arm/armsse-version.h"
 #include "target/arm/arm-powerctl.h"
-#include "target/arm/cpu.h"
 
 REG32(SECDBGSTAT, 0x0)
 REG32(SECDBGSET, 0x4)
diff --git a/hw/misc/meson.build b/hw/misc/meson.build
index 95268eddc07..ed0598dc9eb 100644
--- a/hw/misc/meson.build
+++ b/hw/misc/meson.build
@@ -51,6 +51,7 @@ softmmu_ss.add(when: 'CONFIG_IMX', if_true: files(
   'imx25_ccm.c',
   'imx31_ccm.c',
   'imx6_ccm.c',
+  'imx6_src.c',
   'imx6ul_ccm.c',
   'imx7_ccm.c',
   'imx7_gpr.c',
@@ -84,8 +85,8 @@ softmmu_ss.add(when: 'CONFIG_RASPI', if_true: files(
 ))
 softmmu_ss.add(when: 'CONFIG_SLAVIO', if_true: files('slavio_misc.c'))
 softmmu_ss.add(when: 'CONFIG_ZYNQ', if_true: files('zynq_slcr.c'))
-specific_ss.add(when: 'CONFIG_XLNX_ZYNQMP_ARM', if_true: files('xlnx-zynqmp-crf.c'))
-specific_ss.add(when: 'CONFIG_XLNX_ZYNQMP_ARM', if_true: files('xlnx-zynqmp-apu-ctrl.c'))
+softmmu_ss.add(when: 'CONFIG_XLNX_ZYNQMP_ARM', if_true: files('xlnx-zynqmp-crf.c'))
+softmmu_ss.add(when: 'CONFIG_XLNX_ZYNQMP_ARM', if_true: files('xlnx-zynqmp-apu-ctrl.c'))
 specific_ss.add(when: 'CONFIG_XLNX_VERSAL', if_true: files('xlnx-versal-crl.c'))
 softmmu_ss.add(when: 'CONFIG_XLNX_VERSAL', if_true: files(
   'xlnx-versal-xramc.c',
@@ -101,6 +102,7 @@ softmmu_ss.add(when: 'CONFIG_TZ_MPC', if_true: files('tz-mpc.c'))
 softmmu_ss.add(when: 'CONFIG_TZ_MSC', if_true: files('tz-msc.c'))
 softmmu_ss.add(when: 'CONFIG_TZ_PPC', if_true: files('tz-ppc.c'))
 softmmu_ss.add(when: 'CONFIG_IOTKIT_SECCTL', if_true: files('iotkit-secctl.c'))
+softmmu_ss.add(when: 'CONFIG_IOTKIT_SYSCTL', if_true: files('iotkit-sysctl.c'))
softmmu_ss.add(when: 'CONFIG_IOTKIT_SYSINFO', if_true: files('iotkit-sysinfo.c'))
softmmu_ss.add(when: 'CONFIG_ARMSSE_CPU_PWRCTRL', if_true: files('armsse-cpu-pwrctrl.c'))
 softmmu_ss.add(when: 'CONFIG_ARMSSE_CPUID', if_true: files('armsse-cpuid.c'))
@@ -126,15 +128,12 @@ softmmu_ss.add(when: 'CONFIG_GRLIB', if_true: files('grlib_ahb_apb_pnp.c'))
 
 specific_ss.add(when: 'CONFIG_AVR_POWER', if_true: files('avr_power.c'))
 
-specific_ss.add(when: 'CONFIG_IMX', if_true: files('imx6_src.c'))
-specific_ss.add(when: 'CONFIG_IOTKIT_SYSCTL', if_true: files('iotkit-sysctl.c'))

[PULL 04/29] hw/arm/virt: Introduce virt_get_high_memmap_enabled() helper

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

This introduces virt_get_high_memmap_enabled() helper, which returns
the pointer to vms->highmem_{redists, ecam, mmio}. The pointer will
be used in the subsequent patches.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Reviewed-by: Marc Zyngier 
Tested-by: Zhenyu Zhang 
Message-id: 20221029224307.138822-5-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 hw/arm/virt.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 3bb1bf079ff..7689337470a 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1690,14 +1690,31 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
 return arm_cpu_mp_affinity(idx, clustersz);
 }
 
+static inline bool *virt_get_high_memmap_enabled(VirtMachineState *vms,
+ int index)
+{
+bool *enabled_array[] = {
+&vms->highmem_redists,
+&vms->highmem_ecam,
+&vms->highmem_mmio,
+};
+
+assert(ARRAY_SIZE(extended_memmap) - VIRT_LOWMEMMAP_LAST ==
+   ARRAY_SIZE(enabled_array));
+assert(index - VIRT_LOWMEMMAP_LAST < ARRAY_SIZE(enabled_array));
+
+return enabled_array[index - VIRT_LOWMEMMAP_LAST];
+}
+
 static void virt_set_high_memmap(VirtMachineState *vms,
  hwaddr base, int pa_bits)
 {
 hwaddr region_base, region_size;
-bool fits;
+bool *region_enabled, fits;
 int i;
 
 for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
+region_enabled = virt_get_high_memmap_enabled(vms, i);
 region_base = ROUND_UP(base, extended_memmap[i].size);
 region_size = extended_memmap[i].size;
 
@@ -1715,18 +1732,7 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 vms->highest_gpa = region_base + region_size - 1;
 }
 
-switch (i) {
-case VIRT_HIGH_GIC_REDIST2:
-vms->highmem_redists &= fits;
-break;
-case VIRT_HIGH_PCIE_ECAM:
-vms->highmem_ecam &= fits;
-break;
-case VIRT_HIGH_PCIE_MMIO:
-vms->highmem_mmio &= fits;
-break;
-}
-
+*region_enabled &= fits;
 base = region_base + region_size;
 }
 }
-- 
2.25.1




[PULL 26/29] hw/arm/boot: set initrd with #address-cells type in fdt

2022-12-15 Thread Peter Maydell
From: Schspa Shi 

We use a 32-bit value for linux,initrd-[start/end]; when
loader_start > 4GB, a wrong initrd_start is passed to the kernel,
and the kernel reports the following warning.

[0.00] [ cut here ]
[0.00] initrd not fully accessible via the linear mapping -- please 
check your bootloader ...
[0.00] WARNING: CPU: 0 PID: 0 at arch/arm64/mm/init.c:355 
arm64_memblock_init+0x158/0x244
[0.00] Modules linked in:
[0.00] CPU: 0 PID: 0 Comm: swapper Tainted: GW  
6.1.0-rc3-13250-g30a0b95b1335-dirty #28
[0.00] Hardware name: Horizon Sigi Virtual development board (DT)
[0.00] pstate: 60c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[0.00] pc : arm64_memblock_init+0x158/0x244
[0.00] lr : arm64_memblock_init+0x158/0x244
[0.00] sp : 89273df0
[0.00] x29: 89273df0 x28: 001000cc0010 x27: 8000
[0.00] x26: 0050a3e2 x25: 88b46000 x24: 88b46000
[0.00] x23: 88a53000 x22: 8942 x21: 88a53000
[0.00] x20: 0400 x19: 0400 x18: 1020
[0.00] x17: 6568632065736165 x16: 6c70202d2d20676e x15: 697070616d207261
[0.00] x14: 656e696c20656874 x13: 0a2e2e2e20726564 x12: 
[0.00] x11:  x10:  x9 : 
[0.00] x8 :  x7 : 796c6c756620746f x6 : 6e20647274696e69
[0.00] x5 : 893c7c47 x4 : 88a2102f x3 : 89273a88
[0.00] x2 : 8000f038 x1 : 00c0 x0 : 0056
[0.00] Call trace:
[0.00]  arm64_memblock_init+0x158/0x244
[0.00]  setup_arch+0x164/0x1cc
[0.00]  start_kernel+0x94/0x4ac
[0.00]  __primary_switched+0xb4/0xbc
[0.00] ---[ end trace  ]---
[0.00] Zone ranges:
[0.00]   DMA  [mem 0x0010-0x001007ff]

This doesn't affect any machine types we currently support, because
for all of our machine types the RAM starts well below the 4GB
mark, but it does demonstrate that we're not currently writing
the device-tree properties quite as intended.

To fix it, we can change it to write these values to the dtb using a
type width matching #address-cells.  This is the intended size for
these dtb properties, and is how u-boot, for instance, writes them,
although in practice the Linux kernel will cope with them being any
width as long as they're big enough to fit the value.
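
For illustration, a self-contained sketch of what a width matching
#address-cells means on the wire (this is not QEMU's
qemu_fdt_setprop_sized_cells() implementation, just the encoding it
arranges; pack_fdt_cells and its arguments are illustrative names):

    #include <stdint.h>

    /* Pack 'value' into 'ncells' 32-bit big-endian dtb cells,
     * most significant cell first. Returns 0 on success, -1 if the
     * value doesn't fit in the requested number of cells.
     */
    static int pack_fdt_cells(uint8_t *buf, int ncells, uint64_t value)
    {
        if (ncells == 1 && value > UINT32_MAX) {
            return -1;                    /* would need two cells */
        }
        for (int i = ncells - 1; i >= 0; i--) {
            buf[4 * i + 0] = value >> 24; /* big-endian within the cell */
            buf[4 * i + 1] = value >> 16;
            buf[4 * i + 2] = value >> 8;
            buf[4 * i + 3] = value;
            value >>= 32;                 /* move to the next, higher cell */
        }
        return 0;
    }

With #address-cells = 1 the helper can reject a >4GB initrd address
instead of silently truncating it, which is exactly the failure mode
described above.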

Signed-off-by: Schspa Shi 
Message-id: 20221129160724.75667-1-sch...@gmail.com
[PMM: tweaked commit message]
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 hw/arm/boot.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 15c2bf1867f..3d7d11f782f 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -656,15 +656,17 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
 }
 
 if (binfo->initrd_size) {
-rc = qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-start",
-   binfo->initrd_start);
+rc = qemu_fdt_setprop_sized_cells(fdt, "/chosen", "linux,initrd-start",
+  acells, binfo->initrd_start);
 if (rc < 0) {
 fprintf(stderr, "couldn't set /chosen/linux,initrd-start\n");
 goto fail;
 }
 
-rc = qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-end",
-   binfo->initrd_start + binfo->initrd_size);
+rc = qemu_fdt_setprop_sized_cells(fdt, "/chosen", "linux,initrd-end",
+  acells,
+  binfo->initrd_start +
+  binfo->initrd_size);
 if (rc < 0) {
 fprintf(stderr, "couldn't set /chosen/linux,initrd-end\n");
 goto fail;
-- 
2.25.1




[PULL 02/29] hw/arm/virt: Rename variable size to region_size in virt_set_high_memmap()

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

This renames variable 'size' to 'region_size' in virt_set_high_memmap().
Its counterpart ('region_base') will be introduced in next patch.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Reviewed-by: Marc Zyngier 
Tested-by: Zhenyu Zhang 
Message-id: 20221029224307.138822-3-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 hw/arm/virt.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index ca300281939..2659f4db15c 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1693,15 +1693,16 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
 static void virt_set_high_memmap(VirtMachineState *vms,
  hwaddr base, int pa_bits)
 {
+hwaddr region_size;
+bool fits;
 int i;
 
 for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
-hwaddr size = extended_memmap[i].size;
-bool fits;
+region_size = extended_memmap[i].size;
 
-base = ROUND_UP(base, size);
+base = ROUND_UP(base, region_size);
 vms->memmap[i].base = base;
-vms->memmap[i].size = size;
+vms->memmap[i].size = region_size;
 
 /*
  * Check each device to see if they fit in the PA space,
@@ -1709,9 +1710,9 @@ static void virt_set_high_memmap(VirtMachineState *vms,
  *
  * For each device that doesn't fit, disable it.
  */
-fits = (base + size) <= BIT_ULL(pa_bits);
+fits = (base + region_size) <= BIT_ULL(pa_bits);
 if (fits) {
-vms->highest_gpa = base + size - 1;
+vms->highest_gpa = base + region_size - 1;
 }
 
 switch (i) {
@@ -1726,7 +1727,7 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 break;
 }
 
-base += size;
+base += region_size;
 }
 }
 
-- 
2.25.1




[PULL 22/29] hw/intc: Convert TYPE_KVM_ARM_GICV3 to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_KVM_ARM_GICV3 device to 3-phase reset.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221109161444.3397405-7-peter.mayd...@linaro.org
---
 hw/intc/arm_gicv3_kvm.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c
index 3ca643ecba4..72ad916d3db 100644
--- a/hw/intc/arm_gicv3_kvm.c
+++ b/hw/intc/arm_gicv3_kvm.c
@@ -77,7 +77,7 @@ DECLARE_OBJ_CHECKERS(GICv3State, KVMARMGICv3Class,
 struct KVMARMGICv3Class {
 ARMGICv3CommonClass parent_class;
 DeviceRealize parent_realize;
-void (*parent_reset)(DeviceState *dev);
+ResettablePhases parent_phases;
 };
 
 static void kvm_arm_gicv3_set_irq(void *opaque, int irq, int level)
@@ -703,14 +703,16 @@ static void arm_gicv3_icc_reset(CPUARMState *env, const ARMCPRegInfo *ri)
 c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS];
 }
 
-static void kvm_arm_gicv3_reset(DeviceState *dev)
+static void kvm_arm_gicv3_reset_hold(Object *obj)
 {
-GICv3State *s = ARM_GICV3_COMMON(dev);
+GICv3State *s = ARM_GICV3_COMMON(obj);
 KVMARMGICv3Class *kgc = KVM_ARM_GICV3_GET_CLASS(s);
 
 DPRINTF("Reset\n");
 
-kgc->parent_reset(dev);
+if (kgc->parent_phases.hold) {
+kgc->parent_phases.hold(obj);
+}
 
 if (s->migration_blocker) {
 DPRINTF("Cannot put kernel gic state, no kernel interface\n");
@@ -890,6 +892,7 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp)
 static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_CLASS(klass);
 KVMARMGICv3Class *kgc = KVM_ARM_GICV3_CLASS(klass);
 
@@ -897,7 +900,8 @@ static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data)
 agcc->post_load = kvm_arm_gicv3_put;
 device_class_set_parent_realize(dc, kvm_arm_gicv3_realize,
 &kgc->parent_realize);
-device_class_set_parent_reset(dc, kvm_arm_gicv3_reset, &kgc->parent_reset);
+resettable_class_set_parent_phases(rc, NULL, kvm_arm_gicv3_reset_hold, NULL,
+   &kgc->parent_phases);
 }
 
 static const TypeInfo kvm_arm_gicv3_info = {
-- 
2.25.1




[PULL 07/29] hw/arm/virt: Add properties to disable high memory regions

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

The 3 high memory regions are usually enabled by default, but they may
not be used. For example, VIRT_HIGH_GIC_REDIST2 isn't needed by GICv2.
This wastes PA space.

Add properties ("highmem-redists", "highmem-ecam", "highmem-mmio") to
allow users to selectively disable them if needed. Since the high
memory region for the GICv3 or GICv4 redistributor can now be disabled
by the user, the maximum number of supported CPUs needs to be calculated
based on 'vms->highmem_redists'. The follow-up error message is also
improved to indicate whether the high memory region for GICv3 and GICv4
has been enabled.
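
For reference, a hypothetical invocation exercising the new knobs might
look like this (everything apart from the three properties themselves
is an illustrative choice of machine options, not mandated by the patch):

    # Assumes a GICv3 guest; highmem-redists=off caps the CPU count at
    # what the low redistributor region alone can hold.
    qemu-system-aarch64 -M virt,gic-version=3,highmem-redists=off,highmem-mmio=off \
        -accel kvm -cpu host -m 4G -smp 8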

Suggested-by: Marc Zyngier 
Signed-off-by: Gavin Shan 
Reviewed-by: Marc Zyngier 
Reviewed-by: Cornelia Huck 
Reviewed-by: Eric Auger 
Message-id: 20221029224307.138822-8-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 docs/system/arm/virt.rst | 13 +++
 hw/arm/virt.c| 75 ++--
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst
index 4454706392c..188a4f211f4 100644
--- a/docs/system/arm/virt.rst
+++ b/docs/system/arm/virt.rst
@@ -98,6 +98,19 @@ compact-highmem
+  Set ``on``/``off`` to enable/disable the compact layout for high memory regions.
   The default is ``on`` for machine types later than ``virt-7.2``.
 
+highmem-redists
+  Set ``on``/``off`` to enable/disable the high memory region for GICv3 or
+  GICv4 redistributor. The default is ``on``. Setting this to ``off`` will
+  limit the maximum number of CPUs when GICv3 or GICv4 is used.
+
+highmem-ecam
+  Set ``on``/``off`` to enable/disable the high memory region for PCI ECAM.
+  The default is ``on`` for machine types later than ``virt-3.0``.
+
+highmem-mmio
+  Set ``on``/``off`` to enable/disable the high memory region for PCI MMIO.
+  The default is ``on``.
+
 gic-version
   Specify the version of the Generic Interrupt Controller (GIC) to provide.
   Valid values are:
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 3d1371c05c0..0acb71be962 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2096,14 +2096,20 @@ static void machvirt_init(MachineState *machine)
 if (vms->gic_version == VIRT_GIC_VERSION_2) {
 virt_max_cpus = GIC_NCPU;
 } else {
-virt_max_cpus = virt_redist_capacity(vms, VIRT_GIC_REDIST) +
-virt_redist_capacity(vms, VIRT_HIGH_GIC_REDIST2);
+virt_max_cpus = virt_redist_capacity(vms, VIRT_GIC_REDIST);
+if (vms->highmem_redists) {
+virt_max_cpus += virt_redist_capacity(vms, VIRT_HIGH_GIC_REDIST2);
+}
 }
 
 if (max_cpus > virt_max_cpus) {
 error_report("Number of SMP CPUs requested (%d) exceeds max CPUs "
  "supported by machine 'mach-virt' (%d)",
  max_cpus, virt_max_cpus);
+if (vms->gic_version != VIRT_GIC_VERSION_2 && !vms->highmem_redists) {
+error_printf("Try 'highmem-redists=on' for more CPUs\n");
+}
+
 exit(1);
 }
 
@@ -2372,6 +2378,49 @@ static void virt_set_compact_highmem(Object *obj, bool value, Error **errp)
 vms->highmem_compact = value;
 }
 
+static bool virt_get_highmem_redists(Object *obj, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+return vms->highmem_redists;
+}
+
+static void virt_set_highmem_redists(Object *obj, bool value, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+vms->highmem_redists = value;
+}
+
+static bool virt_get_highmem_ecam(Object *obj, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+return vms->highmem_ecam;
+}
+
+static void virt_set_highmem_ecam(Object *obj, bool value, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+vms->highmem_ecam = value;
+}
+
+static bool virt_get_highmem_mmio(Object *obj, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+return vms->highmem_mmio;
+}
+
+static void virt_set_highmem_mmio(Object *obj, bool value, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+vms->highmem_mmio = value;
+}
+
+
 static bool virt_get_its(Object *obj, Error **errp)
 {
 VirtMachineState *vms = VIRT_MACHINE(obj);
@@ -2997,6 +3046,28 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
  "Set on/off to enable/disable compact "
   "layout for high memory regions");
 
+object_class_property_add_bool(oc, "highmem-redists",
+   virt_get_highmem_redists,
+   virt_set_highmem_redists);
+object_class_property_set_description(oc, "highmem-redists",
+  "Set on/off to enable/disable high "
+  "memory region for GICv3 or GICv4 "
+  "redistributor");
+
+object_class_property_add_bool(oc, "highmem-ecam",
+   virt_get_highmem_ecam,
+   virt_set_highmem_ecam);
+object_class_property_set_description(oc, "highmem-ecam",
+  "Set on/off to enable/disable high "
+  "memory region for PCI ECAM");
+
+object_class_property_add_bool(oc, "highmem-mmio",
+   virt_get_highmem_mmio,
+   virt_set_highmem_mmio);
+object_class_property_set_description(oc, "highmem-mmio",
+  "Set on/off to enable/disable high "
+  "memory region for PCI MMIO");

[PULL 15/29] target/arm: Implement HCR_EL2.TID4 traps

2022-12-15 Thread Peter Maydell
For FEAT_EVT, the HCR_EL2.TID4 trap allows trapping of the cache ID
registers CCSIDR_EL1, CCSIDR2_EL1, CLIDR_EL1 and CSSELR_EL1 (and
their AArch32 equivalents).  This is a subset of the registers
trapped by HCR_EL2.TID2, which includes all of these and also the
CTR_EL0 register.

Our implementation already uses a separate access function for
CTR_EL0 (ctr_el0_access()), so all of the registers currently using
access_aa64_tid2() should also be checking TID4.  Make that function
check both TID2 and TID4, and rename it appropriately.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
---
 target/arm/helper.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index eee95a42f7f..bac2ea62c44 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -1895,11 +1895,12 @@ static void scr_reset(CPUARMState *env, const ARMCPRegInfo *ri)
 scr_write(env, ri, 0);
 }
 
-static CPAccessResult access_aa64_tid2(CPUARMState *env,
-   const ARMCPRegInfo *ri,
-   bool isread)
+static CPAccessResult access_tid4(CPUARMState *env,
+  const ARMCPRegInfo *ri,
+  bool isread)
 {
-if (arm_current_el(env) == 1 && (arm_hcr_el2_eff(env) & HCR_TID2)) {
+if (arm_current_el(env) == 1 &&
+(arm_hcr_el2_eff(env) & (HCR_TID2 | HCR_TID4))) {
 return CP_ACCESS_TRAP_EL2;
 }
 
@@ -2130,12 +2131,12 @@ static const ARMCPRegInfo v7_cp_reginfo[] = {
 { .name = "CCSIDR", .state = ARM_CP_STATE_BOTH,
   .opc0 = 3, .crn = 0, .crm = 0, .opc1 = 1, .opc2 = 0,
   .access = PL1_R,
-  .accessfn = access_aa64_tid2,
+  .accessfn = access_tid4,
   .readfn = ccsidr_read, .type = ARM_CP_NO_RAW },
 { .name = "CSSELR", .state = ARM_CP_STATE_BOTH,
   .opc0 = 3, .crn = 0, .crm = 0, .opc1 = 2, .opc2 = 0,
   .access = PL1_RW,
-  .accessfn = access_aa64_tid2,
+  .accessfn = access_tid4,
   .writefn = csselr_write, .resetvalue = 0,
   .bank_fieldoffsets = { offsetof(CPUARMState, cp15.csselr_s),
  offsetof(CPUARMState, cp15.csselr_ns) } },
@@ -7281,7 +7282,7 @@ static const ARMCPRegInfo ccsidr2_reginfo[] = {
 { .name = "CCSIDR2", .state = ARM_CP_STATE_BOTH,
   .opc0 = 3, .opc1 = 1, .crn = 0, .crm = 0, .opc2 = 2,
   .access = PL1_R,
-  .accessfn = access_aa64_tid2,
+  .accessfn = access_tid4,
   .readfn = ccsidr2_read, .type = ARM_CP_NO_RAW },
 };
 
@@ -7581,7 +7582,7 @@ void register_cp_regs_for_features(ARMCPU *cpu)
 .name = "CLIDR", .state = ARM_CP_STATE_BOTH,
 .opc0 = 3, .crn = 0, .crm = 0, .opc1 = 1, .opc2 = 1,
 .access = PL1_R, .type = ARM_CP_CONST,
-.accessfn = access_aa64_tid2,
+.accessfn = access_tid4,
 .resetvalue = cpu->clidr
 };
 define_one_arm_cp_reg(cpu, &clidr);
-- 
2.25.1




[PULL 05/29] hw/arm/virt: Improve high memory region address assignment

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

There are three high memory regions: VIRT_HIGH_REDIST2,
VIRT_HIGH_PCIE_ECAM and VIRT_HIGH_PCIE_MMIO. Their base addresses
float above the highest RAM address. However, they can be disabled
in several cases.

(1) One specific high memory region is disabled by code toggling
vms->highmem_{redists, ecam, mmio}.

(2) The VIRT_HIGH_PCIE_ECAM region is disabled on machine types
'virt-2.12' and earlier.

(3) The VIRT_HIGH_PCIE_ECAM region is disabled when firmware is loaded
on a 32-bit system.

(4) One specific high memory region is disabled when it breaks the
PA space limit.

The current implementation of virt_set_{memmap, high_memmap}() isn't
optimal because the high memory region's PA space is always reserved,
regardless of the actual state of the corresponding
vms->highmem_{redists, ecam, mmio} flag. In the code, 'base' and
'vms->highest_gpa' are always increased in cases (1), (2) and (3).
That is unnecessary, since the PA space assigned to a disabled high
memory region won't be used afterwards.

Improve the address assignment for those three high memory regions by
skipping the address assignment for any region that has been disabled
in cases (1), (2) and (3). The memory layout may change after the
improvement is applied, which leads to potential migration breakage.
So 'vms->highmem_compact' is added to control whether the improvement
is applied. For now, 'vms->highmem_compact' is set to false, meaning
that the memory layout does not change until it becomes configurable
through the 'compact-highmem' property in the next patch.
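
A worked example of the difference (assuming pa_bits=40, so 2^40 = 1TB
of PA space, RAM topping out at 512GB, highmem_redists and highmem_ecam
disabled, highmem_mmio enabled, and the extended_memmap region sizes of
64MB, 256MB and 512GB; the numbers match the table in the next patch):

    legacy:  base=512GB -> REDIST2 reserves [512GB, 512GB+64MB]
             -> ECAM reserves [512GB+256MB, 512GB+512MB]
             -> MMIO rounds up to 1TB; 1TB + 512GB > 2^40, MMIO disabled
    compact: REDIST2 and ECAM are skipped, base stays at 512GB
             -> MMIO gets [512GB, 1TB], which fits 2^40 exactly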

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Reviewed-by: Marc Zyngier 
Tested-by: Zhenyu Zhang 
Message-id: 20221029224307.138822-6-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 include/hw/arm/virt.h |  1 +
 hw/arm/virt.c | 15 ++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 6ec479ca2b7..709f6237412 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -144,6 +144,7 @@ struct VirtMachineState {
 PFlashCFI01 *flash[2];
 bool secure;
 bool highmem;
+bool highmem_compact;
 bool highmem_ecam;
 bool highmem_mmio;
 bool highmem_redists;
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 7689337470a..807175707e7 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1722,18 +1722,23 @@ static void virt_set_high_memmap(VirtMachineState *vms,
 vms->memmap[i].size = region_size;
 
 /*
- * Check each device to see if they fit in the PA space,
- * moving highest_gpa as we go.
+ * Check each device to see if it fits in the PA space,
+ * moving highest_gpa as we go. For compatibility, move
+ * highest_gpa for disabled fitting devices as well, if
+ * the compact layout has been disabled.
  *
  * For each device that doesn't fit, disable it.
  */
 fits = (region_base + region_size) <= BIT_ULL(pa_bits);
-if (fits) {
-vms->highest_gpa = region_base + region_size - 1;
+*region_enabled &= fits;
+if (vms->highmem_compact && !*region_enabled) {
+continue;
 }
 
-*region_enabled &= fits;
 base = region_base + region_size;
+if (fits) {
+vms->highest_gpa = base - 1;
+}
 }
 }
 
-- 
2.25.1




[PULL 20/29] hw/intc: Convert TYPE_ARM_GIC_KVM to 3-phase reset

2022-12-15 Thread Peter Maydell
Now we have converted TYPE_ARM_GIC_COMMON, we can convert the
TYPE_ARM_GIC_KVM subclass to 3-phase reset.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221109161444.3397405-5-peter.mayd...@linaro.org
---
 hw/intc/arm_gic_kvm.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c
index 7d2a13273a4..1d588946bce 100644
--- a/hw/intc/arm_gic_kvm.c
+++ b/hw/intc/arm_gic_kvm.c
@@ -38,7 +38,7 @@ DECLARE_OBJ_CHECKERS(GICState, KVMARMGICClass,
 struct KVMARMGICClass {
 ARMGICCommonClass parent_class;
 DeviceRealize parent_realize;
-void (*parent_reset)(DeviceState *dev);
+ResettablePhases parent_phases;
 };
 
 void kvm_arm_gic_set_irq(uint32_t num_irq, int irq, int level)
@@ -473,12 +473,14 @@ static void kvm_arm_gic_get(GICState *s)
 }
 }
 
-static void kvm_arm_gic_reset(DeviceState *dev)
+static void kvm_arm_gic_reset_hold(Object *obj)
 {
-GICState *s = ARM_GIC_COMMON(dev);
+GICState *s = ARM_GIC_COMMON(obj);
 KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s);
 
-kgc->parent_reset(dev);
+if (kgc->parent_phases.hold) {
+kgc->parent_phases.hold(obj);
+}
 
 if (kvm_arm_gic_can_save_restore(s)) {
 kvm_arm_gic_put(s);
@@ -593,6 +595,7 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp)
 static void kvm_arm_gic_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 ARMGICCommonClass *agcc = ARM_GIC_COMMON_CLASS(klass);
 KVMARMGICClass *kgc = KVM_ARM_GIC_CLASS(klass);
 
@@ -600,7 +603,8 @@ static void kvm_arm_gic_class_init(ObjectClass *klass, void *data)
 agcc->post_load = kvm_arm_gic_put;
 device_class_set_parent_realize(dc, kvm_arm_gic_realize,
 &kgc->parent_realize);
-device_class_set_parent_reset(dc, kvm_arm_gic_reset, &kgc->parent_reset);
+resettable_class_set_parent_phases(rc, NULL, kvm_arm_gic_reset_hold, NULL,
+   &kgc->parent_phases);
 }
 
 static const TypeInfo kvm_arm_gic_info = {
-- 
2.25.1




[PULL 29/29] target/arm: Restrict arm_cpu_exec_interrupt() to TCG accelerator

2022-12-15 Thread Peter Maydell
From: Philippe Mathieu-Daudé 

When building with --disable-tcg on Darwin we get:

  target/arm/cpu.c:725:16: error: incomplete definition of type 'struct TCGCPUOps'
cc->tcg_ops->do_interrupt(cs);
~~~^

Commit 083afd18a9 ("target/arm: Restrict cpu_exec_interrupt()
handler to sysemu") limited this block to system emulation,
but neglected to also limit it to TCG.

Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Fabiano Rosas 
Message-id: 20221209110823.59495-1-phi...@linaro.org
Signed-off-by: Peter Maydell 
---
 target/arm/cpu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 38d066c294d..0f55004d7e7 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -528,7 +528,7 @@ static void arm_cpu_reset(DeviceState *dev)
 arm_rebuild_hflags(env);
 }
 
-#ifndef CONFIG_USER_ONLY
+#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
 
 static inline bool arm_excp_unmasked(CPUState *cs, unsigned int excp_idx,
  unsigned int target_el,
@@ -725,7 +725,8 @@ static bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
 cc->tcg_ops->do_interrupt(cs);
 return true;
 }
-#endif /* !CONFIG_USER_ONLY */
+
+#endif /* CONFIG_TCG && !CONFIG_USER_ONLY */
 
 void arm_cpu_update_virq(ARMCPU *cpu)
 {
-- 
2.25.1




Re: migration qtest failure: "query-migrate shows failed migration: Unable to write to socket: Broken pipe"

2022-12-15 Thread Dr. David Alan Gilbert
* Peter Maydell (peter.mayd...@linaro.org) wrote:
> On Thu, 15 Dec 2022 at 11:40, Dr. David Alan Gilbert
>  wrote:
> >
> > * Peter Maydell (peter.mayd...@linaro.org) wrote:
> > > Hi; I see this migration qtest failure on my x86 macos box:
> > >
> > >
> > > ▶  32/591 
> > > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > > assertion failed: (!g_str_equal(status, "failed")) ERROR
> > >  32/591 qemu:qtest+qtest-aarch64 / qtest-aarch64/migration-test
> > >ERROR  152.27s   killed by signal 6 SIGABRT
> > > ― ✀  
> > > ―
> > > stderr:
> > > query-migrate shows failed migration: Unable to write to socket: Broken 
> > > pipe
> > > **
> > > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > > assertion failed: (!g_str_equal(status, "failed"))
> > > Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
> > > ../../tests/qtest/libqtest.c, line 207.
> > >
> > > (test program exited with status code -6)
> > > ――
> > >
> > > and similarly:
> > >
> > > ▶  34/591 
> > > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > > assertion failed: (!g_str_equal(status, "failed")) ERROR
> > >  34/591 qemu:qtest+qtest-i386 / qtest-i386/migration-test
> > >ERROR  169.44s   killed by signal 6 SIGABRT
> > > ― ✀  
> > > ―
> > > stderr:
> > > query-migrate shows failed migration: Unable to write to socket: Broken 
> > > pipe
> > > **
> > > ERROR:../../tests/qtest/migration-helpers.c:151:migrate_query_not_failed:
> > > assertion failed: (!g_str_equal(status, "failed"))
> > > Assertion failed: (pid == s->qemu_pid), function qtest_wait_qemu, file
> > > ../../tests/qtest/libqtest.c, line 207.
> > >
> > > (test program exited with status code -6)
> > > ――
> > >
> > > It seems to be fairly consistent. Any ideas what it might be?
> > > Maybe the QEMU process has already exited before the test binary
> > > gets round to querying the status ?
> >
> > Yes, it sounds like it, can you get a backtrace to figure out which test
> > it was in/where it was upto when it died?
> 
> The logfile says it had just done
> ok 23 /aarch64/migration/multifd/tcp/plain/none
> so I think the one it was in the middle of when it failed was
> /aarch64/migration/multifd/tcp/plain/cancel.
> Similarly the log suggests the x86 failure was for
> /i386/migration/multifd/tcp/plain/cancel.
> 
> It doesn't seem to repro running manually, my guess is that
> it happens because the machine is heavily loaded doing the
> whole build-and-test cycle.

Yeh; I think we'll still need a backtrace or better qmp log though to
figure it out.
If I read that correctly, what that test does is:

   start 'from'
   start 'to'
   slow migrate from->to
   
   start 'to2'
   migrate from->to2

I'd only expect the 'to' to quit by itself, and I don't think we should
be doing a 'failed' check on a destination (I don't think).
Even if the migration finished quickly 'from' shouldn't just quit - so
the QMP should still be working, so we shouldn't get the broken pipe on
the actual qmp pipe.
Assuming then it's a broken socket on the migration stream, that sounds
like a bug - a 'cancel' might cause the destination to get an error
but the source shouldn't - it should know it's cancelling.

Dave

> thanks
> -- PMM
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH Trivial] hw/cxl/cxl-cdat.c: spelling: missmatch

2022-12-15 Thread Philippe Mathieu-Daudé

On 15/12/22 13:37, Michael Tokarev wrote:

Introduced by: aba578bdace5303a441f8a37aad781b5cb06f38c

Signed-off-by: Michael Tokarev 
---
  hw/cxl/cxl-cdat.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé 





[PULL 17/29] hw/arm: Convert TYPE_ARM_SMMU to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_ARM_SMMU device to 3-phase reset.  The legacy method
doesn't do anything that's invalid in the hold phase, so the
conversion is simple and not a behaviour change.

Note that we must convert this base class before we can convert the
TYPE_ARM_SMMUV3 subclass -- transitional support in Resettable
handles "chain to parent class reset" when the base class is 3-phase
and the subclass is still using legacy reset, but not the other way
around.
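
Schematically, the conversion pattern used throughout this series is
the sketch below (MyDev* names are placeholders, not a real device):

    /* Before: legacy reset via DeviceClass */
    static void mydev_reset(DeviceState *dev)
    {
        MyDevState *s = MYDEV(dev);
        /* ... reset device state in s ... */
    }
    /* in class_init: dc->reset = mydev_reset; */

    /* After: 3-phase reset via ResettableClass, hold phase */
    static void mydev_reset_hold(Object *obj)
    {
        MyDevState *s = MYDEV(obj);
        /* ... same reset work on s, now in the hold phase ... */
    }
    /* in class_init: rc->phases.hold = mydev_reset_hold; */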

Signed-off-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Reviewed-by: Eric Auger 
Message-id: 20221109161444.3397405-2-peter.mayd...@linaro.org
---
 hw/arm/smmu-common.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index e09b9c13b74..220838525d4 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -526,9 +526,9 @@ static void smmu_base_realize(DeviceState *dev, Error 
**errp)
 }
 }
 
-static void smmu_base_reset(DeviceState *dev)
+static void smmu_base_reset_hold(Object *obj)
 {
-SMMUState *s = ARM_SMMU(dev);
+SMMUState *s = ARM_SMMU(obj);
 
 g_hash_table_remove_all(s->configs);
 g_hash_table_remove_all(s->iotlb);
@@ -543,12 +543,13 @@ static Property smmu_dev_properties[] = {
 static void smmu_base_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 SMMUBaseClass *sbc = ARM_SMMU_CLASS(klass);
 
 device_class_set_props(dc, smmu_dev_properties);
 device_class_set_parent_realize(dc, smmu_base_realize,
 &sbc->parent_realize);
-dc->reset = smmu_base_reset;
+rc->phases.hold = smmu_base_reset_hold;
 }
 
 static const TypeInfo smmu_base_info = {
-- 
2.25.1




Re: [PATCH] hw/riscv: Add support to change default RISCV hart memory region

2022-12-15 Thread Bin Meng
On Sun, Dec 11, 2022 at 1:29 PM Vysakh P Pillai
 wrote:
>
> Add support to optionally specify a memory region container
> to be used to override the default system memory used
> by the RISCV harts when they are realized. Additional
> memory regions can be added as sub-regions of this container
> to dynamically control the memory regions and mappings visible
> from the hart.

Could you please specify what user case are you trying to address with
this patch?

>
> Signed-off-by: Vysakh P Pillai 
> ---
>  hw/riscv/riscv_hart.c | 5 +
>  include/hw/riscv/riscv_hart.h | 1 +
>  2 files changed, 6 insertions(+)
>
> diff --git a/hw/riscv/riscv_hart.c b/hw/riscv/riscv_hart.c
> index 613ea2aaa0..7a8dcab7e7 100644
> --- a/hw/riscv/riscv_hart.c
> +++ b/hw/riscv/riscv_hart.c
> @@ -33,6 +33,8 @@ static Property riscv_harts_props[] = {
>  DEFINE_PROP_STRING("cpu-type", RISCVHartArrayState, cpu_type),
>  DEFINE_PROP_UINT64("resetvec", RISCVHartArrayState, resetvec,
> DEFAULT_RSTVEC),
> +DEFINE_PROP_UINT64("cpu-memory", RISCVHartArrayState,
> +   cpu_memory,NULL),
>  DEFINE_PROP_END_OF_LIST(),
>  };
>
> @@ -49,6 +51,9 @@ static bool riscv_hart_realize(RISCVHartArrayState *s, int 
> idx,
>  qdev_prop_set_uint64(DEVICE(&s->harts[idx]), "resetvec", s->resetvec);
>  s->harts[idx].env.mhartid = s->hartid_base + idx;
>  qemu_register_reset(riscv_harts_cpu_reset, &s->harts[idx]);
> +if (s->cpu_memory) {
> +object_property_set_link(OBJECT(&s->harts[idx].parent_obj), 
> "memory",OBJECT(s->cpu_memory), &error_abort);
> +}
>  return qdev_realize(DEVICE(&s->harts[idx]), NULL, errp);
>  }
>
> diff --git a/include/hw/riscv/riscv_hart.h b/include/hw/riscv/riscv_hart.h
> index bbc21cdc9a..3e5dfeeaae 100644
> --- a/include/hw/riscv/riscv_hart.h
> +++ b/include/hw/riscv/riscv_hart.h
> @@ -38,6 +38,7 @@ struct RISCVHartArrayState {
>  uint32_t hartid_base;
>  char *cpu_type;
>  uint64_t resetvec;
> +uint64_t cpu_memory;
>  RISCVCPU *harts;
>  };
>

Regards,
Bin



[PULL 01/29] hw/arm/virt: Introduce virt_set_high_memmap() helper

2022-12-15 Thread Peter Maydell
From: Gavin Shan 

This introduces virt_set_high_memmap() helper. The logic of high
memory region address assignment is moved to the helper. The intention
is to make the subsequent optimization for high memory region address
assignment easier.

No functional change intended.

Signed-off-by: Gavin Shan 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
Reviewed-by: Marc Zyngier 
Tested-by: Zhenyu Zhang 
Message-id: 20221029224307.138822-2-gs...@redhat.com
Signed-off-by: Peter Maydell 
---
 hw/arm/virt.c | 74 ---
 1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index b8713508561..ca300281939 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1690,6 +1690,46 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
 return arm_cpu_mp_affinity(idx, clustersz);
 }
 
+static void virt_set_high_memmap(VirtMachineState *vms,
+ hwaddr base, int pa_bits)
+{
+int i;
+
+for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
+hwaddr size = extended_memmap[i].size;
+bool fits;
+
+base = ROUND_UP(base, size);
+vms->memmap[i].base = base;
+vms->memmap[i].size = size;
+
+/*
+ * Check each device to see if they fit in the PA space,
+ * moving highest_gpa as we go.
+ *
+ * For each device that doesn't fit, disable it.
+ */
+fits = (base + size) <= BIT_ULL(pa_bits);
+if (fits) {
+vms->highest_gpa = base + size - 1;
+}
+
+switch (i) {
+case VIRT_HIGH_GIC_REDIST2:
+vms->highmem_redists &= fits;
+break;
+case VIRT_HIGH_PCIE_ECAM:
+vms->highmem_ecam &= fits;
+break;
+case VIRT_HIGH_PCIE_MMIO:
+vms->highmem_mmio &= fits;
+break;
+}
+
+base += size;
+}
+}
+
 static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
 {
 MachineState *ms = MACHINE(vms);
@@ -1745,39 +1785,7 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
 /* We know for sure that at least the memory fits in the PA space */
 vms->highest_gpa = memtop - 1;
 
-for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
-hwaddr size = extended_memmap[i].size;
-bool fits;
-
-base = ROUND_UP(base, size);
-vms->memmap[i].base = base;
-vms->memmap[i].size = size;
-
-/*
- * Check each device to see if they fit in the PA space,
- * moving highest_gpa as we go.
- *
- * For each device that doesn't fit, disable it.
- */
-fits = (base + size) <= BIT_ULL(pa_bits);
-if (fits) {
-vms->highest_gpa = base + size - 1;
-}
-
-switch (i) {
-case VIRT_HIGH_GIC_REDIST2:
-vms->highmem_redists &= fits;
-break;
-case VIRT_HIGH_PCIE_ECAM:
-vms->highmem_ecam &= fits;
-break;
-case VIRT_HIGH_PCIE_MMIO:
-vms->highmem_mmio &= fits;
-break;
-}
-
-base += size;
-}
+virt_set_high_memmap(vms, base, pa_bits);
 
 if (device_memory_size > 0) {
 ms->device_memory = g_malloc0(sizeof(*ms->device_memory));
-- 
2.25.1




[PULL 18/29] hw/arm: Convert TYPE_ARM_SMMUV3 to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_ARM_SMMUV3 device to 3-phase reset.  The legacy
reset method doesn't do anything that's invalid in the hold phase, so
the conversion only requires changing it to a hold phase method, and
using the 3-phase versions of the "save the parent reset method and
chain to it" code.
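
The "save the parent and chain" part, sketched (MyDev* names are
placeholders; resettable_class_set_parent_phases() stores the parent
class's phase methods before installing the subclass's own):

    struct MyDevClass {
        SomeParentClass parent_class;
        ResettablePhases parent_phases;       /* saved parent methods */
    };

    static void mydev_reset_hold(Object *obj)
    {
        MyDevClass *c = MYDEV_GET_CLASS(obj);

        if (c->parent_phases.hold) {
            c->parent_phases.hold(obj);       /* run the parent's hold first */
        }
        /* ... subclass-specific reset work ... */
    }

    /* in class_init, replacing device_class_set_parent_reset(): */
    resettable_class_set_parent_phases(rc, NULL, mydev_reset_hold, NULL,
                                       &c->parent_phases);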

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Eric Auger 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221109161444.3397405-3-peter.mayd...@linaro.org
---
 include/hw/arm/smmuv3.h |  2 +-
 hw/arm/smmuv3.c | 12 
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index c641e60735e..f1921fdf9e7 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -77,7 +77,7 @@ struct SMMUv3Class {
 /*< public >*/
 
 DeviceRealize parent_realize;
-DeviceReset   parent_reset;
+ResettablePhases parent_phases;
 };
 
 #define TYPE_ARM_SMMUV3   "arm-smmuv3"
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index daa80e9c7b6..955b89c8d59 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1431,12 +1431,14 @@ static void smmu_init_irq(SMMUv3State *s, SysBusDevice *dev)
 }
 }
 
-static void smmu_reset(DeviceState *dev)
+static void smmu_reset_hold(Object *obj)
 {
-SMMUv3State *s = ARM_SMMUV3(dev);
+SMMUv3State *s = ARM_SMMUV3(obj);
 SMMUv3Class *c = ARM_SMMUV3_GET_CLASS(s);
 
-c->parent_reset(dev);
+if (c->parent_phases.hold) {
+c->parent_phases.hold(obj);
+}
 
 smmuv3_init_regs(s);
 }
@@ -1520,10 +1522,12 @@ static void smmuv3_instance_init(Object *obj)
 static void smmuv3_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 SMMUv3Class *c = ARM_SMMUV3_CLASS(klass);
 
 dc->vmsd = &vmstate_smmuv3;
-device_class_set_parent_reset(dc, smmu_reset, &c->parent_reset);
+resettable_class_set_parent_phases(rc, NULL, smmu_reset_hold, NULL,
+   &c->parent_phases);
 c->parent_realize = dc->realize;
 dc->realize = smmu_realize;
 }
-- 
2.25.1




[PULL 23/29] hw/intc: Convert TYPE_ARM_GICV3_ITS_COMMON to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_ARM_GICV3_ITS_COMMON parent class to 3-phase reset.

Signed-off-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Message-id: 20221109161444.3397405-8-peter.mayd...@linaro.org
---
 hw/intc/arm_gicv3_its_common.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/intc/arm_gicv3_its_common.c b/hw/intc/arm_gicv3_its_common.c
index 90b85f1e25c..d7532a7a899 100644
--- a/hw/intc/arm_gicv3_its_common.c
+++ b/hw/intc/arm_gicv3_its_common.c
@@ -122,9 +122,9 @@ void gicv3_its_init_mmio(GICv3ITSState *s, const MemoryRegionOps *ops,
 msi_nonbroken = true;
 }
 
-static void gicv3_its_common_reset(DeviceState *dev)
+static void gicv3_its_common_reset_hold(Object *obj)
 {
-GICv3ITSState *s = ARM_GICV3_ITS_COMMON(dev);
+GICv3ITSState *s = ARM_GICV3_ITS_COMMON(obj);
 
 s->ctlr = 0;
 s->cbaser = 0;
@@ -137,8 +137,9 @@ static void gicv3_its_common_reset(DeviceState *dev)
 static void gicv3_its_common_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 
-dc->reset = gicv3_its_common_reset;
+rc->phases.hold = gicv3_its_common_reset_hold;
 dc->vmsd = &vmstate_its;
 }
 
-- 
2.25.1




[PULL 00/29] target-arm queue

2022-12-15 Thread Peter Maydell
First arm pullreq of the 8.0 series...

The following changes since commit ae2b87341b5ddb0dcb1b3f2d4f586ef18de75873:

  Merge tag 'pull-qapi-2022-12-14-v2' of https://repo.or.cz/qemu/armbru into 
staging (2022-12-14 22:42:14 +)

are available in the Git repository at:

  https://git.linaro.org/people/pmaydell/qemu-arm.git 
tags/pull-target-arm-20221215

for you to fetch changes up to 4f3ebdc33618e7c163f769047859d6f34373e3af:

  target/arm: Restrict arm_cpu_exec_interrupt() to TCG accelerator (2022-12-15 
11:18:20 +)


target-arm queue:
 * hw/arm/virt: Add properties to allow more granular
   configuration of use of highmem space
 * target/arm: Add Cortex-A55 CPU
 * hw/intc/arm_gicv3: Fix GICD_TYPER ITLinesNumber advertisement
 * Implement FEAT_EVT
 * Some 3-phase-reset conversions for Arm GIC, SMMU
 * hw/arm/boot: set initrd with #address-cells type in fdt
 * align user-mode exposed ID registers with Linux
 * hw/misc: Move some arm-related files from specific_ss into softmmu_ss
 * Restrict arm_cpu_exec_interrupt() to TCG accelerator


Gavin Shan (7):
  hw/arm/virt: Introduce virt_set_high_memmap() helper
  hw/arm/virt: Rename variable size to region_size in virt_set_high_memmap()
  hw/arm/virt: Introduce variable region_base in virt_set_high_memmap()
  hw/arm/virt: Introduce virt_get_high_memmap_enabled() helper
  hw/arm/virt: Improve high memory region address assignment
  hw/arm/virt: Add 'compact-highmem' property
  hw/arm/virt: Add properties to disable high memory regions

Luke Starrett (1):
  hw/intc/arm_gicv3: Fix GICD_TYPER ITLinesNumber advertisement

Mihai Carabas (1):
  hw/arm/virt: build SMBIOS 19 table

Peter Maydell (15):
  target/arm: Allow relevant HCR bits to be written for FEAT_EVT
  target/arm: Implement HCR_EL2.TTLBIS traps
  target/arm: Implement HCR_EL2.TTLBOS traps
  target/arm: Implement HCR_EL2.TICAB,TOCU traps
  target/arm: Implement HCR_EL2.TID4 traps
  target/arm: Report FEAT_EVT for TCG '-cpu max'
  hw/arm: Convert TYPE_ARM_SMMU to 3-phase reset
  hw/arm: Convert TYPE_ARM_SMMUV3 to 3-phase reset
  hw/intc: Convert TYPE_ARM_GIC_COMMON to 3-phase reset
  hw/intc: Convert TYPE_ARM_GIC_KVM to 3-phase reset
  hw/intc: Convert TYPE_ARM_GICV3_COMMON to 3-phase reset
  hw/intc: Convert TYPE_KVM_ARM_GICV3 to 3-phase reset
  hw/intc: Convert TYPE_ARM_GICV3_ITS_COMMON to 3-phase reset
  hw/intc: Convert TYPE_ARM_GICV3_ITS to 3-phase reset
  hw/intc: Convert TYPE_KVM_ARM_ITS to 3-phase reset

Philippe Mathieu-Daudé (1):
  target/arm: Restrict arm_cpu_exec_interrupt() to TCG accelerator

Schspa Shi (1):
  hw/arm/boot: set initrd with #address-cells type in fdt

Thomas Huth (1):
  hw/misc: Move some arm-related files from specific_ss into softmmu_ss

Timofey Kutergin (1):
  target/arm: Add Cortex-A55 CPU

Zhuojia Shen (1):
  target/arm: align exposed ID registers with Linux

 docs/system/arm/emulation.rst  |   1 +
 docs/system/arm/virt.rst   |  18 +++
 include/hw/arm/smmuv3.h|   2 +-
 include/hw/arm/virt.h  |   2 +
 include/hw/misc/xlnx-zynqmp-apu-ctrl.h |   2 +-
 target/arm/cpu.h   |  30 +
 target/arm/kvm-consts.h|   8 +-
 hw/arm/boot.c  |  10 +-
 hw/arm/smmu-common.c   |   7 +-
 hw/arm/smmuv3.c|  12 +-
 hw/arm/virt.c  | 202 +++-
 hw/intc/arm_gic_common.c   |   7 +-
 hw/intc/arm_gic_kvm.c  |  14 +-
 hw/intc/arm_gicv3_common.c |   7 +-
 hw/intc/arm_gicv3_dist.c   |   4 +-
 hw/intc/arm_gicv3_its.c|  14 +-
 hw/intc/arm_gicv3_its_common.c |   7 +-
 hw/intc/arm_gicv3_its_kvm.c|  14 +-
 hw/intc/arm_gicv3_kvm.c|  14 +-
 hw/misc/imx6_src.c |   2 +-
 hw/misc/iotkit-sysctl.c|   1 -
 target/arm/cpu.c   |   5 +-
 target/arm/cpu64.c |  70 ++
 target/arm/cpu_tcg.c   |   1 +
 target/arm/helper.c| 231 -
 hw/misc/meson.build|  11 +-
 26 files changed, 538 insertions(+), 158 deletions(-)



[PULL 21/29] hw/intc: Convert TYPE_ARM_GICV3_COMMON to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_ARM_GICV3_COMMON parent class to 3-phase reset.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221109161444.3397405-6-peter.mayd...@linaro.org
---
 hw/intc/arm_gicv3_common.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index 351843db4aa..642a8243ed4 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -450,9 +450,9 @@ static void arm_gicv3_finalize(Object *obj)
 g_free(s->redist_region_count);
 }
 
-static void arm_gicv3_common_reset(DeviceState *dev)
+static void arm_gicv3_common_reset_hold(Object *obj)
 {
-GICv3State *s = ARM_GICV3_COMMON(dev);
+GICv3State *s = ARM_GICV3_COMMON(obj);
 int i;
 
 for (i = 0; i < s->num_cpu; i++) {
@@ -578,9 +578,10 @@ static Property arm_gicv3_common_properties[] = {
 static void arm_gicv3_common_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 ARMLinuxBootIfClass *albifc = ARM_LINUX_BOOT_IF_CLASS(klass);
 
-dc->reset = arm_gicv3_common_reset;
+rc->phases.hold = arm_gicv3_common_reset_hold;
 dc->realize = arm_gicv3_common_realize;
 device_class_set_props(dc, arm_gicv3_common_properties);
 dc->vmsd = &vmstate_gicv3;
-- 
2.25.1




[PULL 10/29] hw/intc/arm_gicv3: Fix GICD_TYPER ITLinesNumber advertisement

2022-12-15 Thread Peter Maydell
From: Luke Starrett 

The ARM GICv3 TRM describes the ITLinesNumber field of the GICD_TYPER
register as follows:

"indicates the maximum SPI INTID that the GIC implementation supports"

As SPI #0 is absolute IRQ #32, the max SPI INTID should have accounted
for the internal 16x SGIs and 16x PPIs.  However, the original GICv3
model subtracted off the SGIs/PPIs.  Cosmetically this can be seen at OS
boot (Linux) reporting 32 fewer SPIs than should be there, i.e.:

[0.00] GICv3: 224 SPIs implemented

even though in hw/arm/virt.c the machine is configured for 256 SPIs.  The
ARM virt machine likely doesn't have a problem with this because the upper
32 IRQs don't actually have anything meaningful wired.  But this does
become a functional issue for a custom use case which wants to make use
of these IRQs.  Additionally, boot code (i.e. TF-A) will only initialize
up to the number of interrupt blocks (of 32) that it believes to actually
be there.
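
As a worked example of the arithmetic, assuming the virt machine's 256
SPIs plus the 32 internal SGIs/PPIs (num_irq = 288, which matches how
hw/arm/virt.c sizes its GIC):

/*
 * before: ((288 - 32) / 32) - 1 = 7  ->  guests derive 32 * (7 + 1)
 *         = 256 total INTIDs, i.e. only 224 usable SPIs ("32 shy")
 * after:  (288 / 32) - 1 = 8         ->  32 * (8 + 1) = 288 INTIDs,
 *         i.e. the full 256 SPIs
 */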

Signed-off-by: Luke Starrett 
Message-id: 
am9p193mb168473d99b761e204e032095d4...@am9p193mb1684.eurp193.prod.outlook.com
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 hw/intc/arm_gicv3_dist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/intc/arm_gicv3_dist.c b/hw/intc/arm_gicv3_dist.c
index eea03681187..d599fefcbcf 100644
--- a/hw/intc/arm_gicv3_dist.c
+++ b/hw/intc/arm_gicv3_dist.c
@@ -390,9 +390,9 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
  * MBIS == 0 (message-based SPIs not supported)
  * SecurityExtn == 1 if security extns supported
  * CPUNumber == 0 since for us ARE is always 1
- * ITLinesNumber == (num external irqs / 32) - 1
+ * ITLinesNumber == (((max SPI IntID + 1) / 32) - 1)
  */
-int itlinesnumber = ((s->num_irq - GIC_INTERNAL) / 32) - 1;
+int itlinesnumber = (s->num_irq / 32) - 1;
 /*
  * SecurityExtn must be RAZ if GICD_CTLR.DS == 1, and
  * "security extensions not supported" always implies DS == 1,
-- 
2.25.1




Re: [PATCH v1 22/24] vfio-user: add 'x-msg-timeout' option that specifies msg wait times

2022-12-15 Thread Cédric Le Goater

On 11/9/22 00:13, John Johnson wrote:

Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
---
  hw/vfio/pci.c  | 4 
  hw/vfio/pci.h  | 1 +
  hw/vfio/user.c | 7 +--
  hw/vfio/user.h | 1 +
  4 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 005fcf8..3ae3a13 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3729,6 +3729,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
  if (udev->no_post) {
  proxy->flags |= VFIO_PROXY_NO_POST;
  }
+if (udev->wait_time) {
+proxy->wait_time = udev->wait_time;
+}
  
  vfio_user_validate_version(proxy, &err);

  if (err != NULL) {
@@ -3848,6 +3851,7 @@ static Property vfio_user_pci_dev_properties[] = {
  DEFINE_PROP_BOOL("secure-dma", VFIOUserPCIDevice, secure_dma, false),
  DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
  DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
+DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time, 0),


I see that patch 9 introduced:

+static int wait_time = 5000;   /* wait up to 5 sec for busy servers */

Maybe use a define instead and assign "x-msg-timeout" to this default
value.
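
A sketch of that suggestion (DEFINE_PROP_UINT32 is the real qdev macro;
VFIO_USER_DFLT_MSG_TIMEOUT is a hypothetical name):

#define VFIO_USER_DFLT_MSG_TIMEOUT 5000  /* ms; wait up to 5 s for busy servers */

DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time,
                   VFIO_USER_DFLT_MSG_TIMEOUT),

That would also make the realize-time "if (udev->wait_time)" fallback
unnecessary, since the property would carry the default itself.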

How do you plan to use the "x-msg-timeout" property?

Thanks,

C.


  DEFINE_PROP_END_OF_LIST(),
  };
  
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h

index c4b8e5c..48b19ee 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -199,6 +199,7 @@ struct VFIOUserPCIDevice {
  bool secure_dma;/* disable shared mem for DMA */
  bool send_queued;   /* all sends are queued */
  bool no_post;   /* all regions write are sync */
+uint32_t wait_time; /* timeout for message replies */
  };
  
 /* Use uint32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */

diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index ddf9e13..a9e6cf5 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -717,7 +717,8 @@ static void vfio_user_send_wait(VFIOProxy *proxy, VFIOUserHdr *hdr,
  
  if (ret == 0) {

  while (!msg->complete) {
-if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
+if (!qemu_cond_timedwait(&msg->cv, &proxy->lock,
+ proxy->wait_time)) {
  VFIOUserMsgQ *list;
  
  list = msg->pending ? &proxy->pending : &proxy->outgoing;

@@ -759,7 +760,8 @@ static void vfio_user_wait_reqs(VFIOProxy *proxy)
  msg = proxy->last_nowait;
  msg->type = VFIO_MSG_WAIT;
  while (!msg->complete) {
-if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
+if (!qemu_cond_timedwait(&msg->cv, &proxy->lock,
+ proxy->wait_time)) {
  VFIOUserMsgQ *list;
  
  list = msg->pending ? &proxy->pending : &proxy->outgoing;

@@ -881,6 +883,7 @@ VFIOProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp)
  
  proxy->flags = VFIO_PROXY_CLIENT;

  proxy->state = VFIO_PROXY_CONNECTED;
+proxy->wait_time = wait_time;
  
  qemu_mutex_init(&proxy->lock);

  qemu_cond_init(&proxy->close_cv);
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
index d88ffe5..f711861 100644
--- a/hw/vfio/user.h
+++ b/hw/vfio/user.h
@@ -62,6 +62,7 @@ typedef struct VFIOProxy {
  uint64_t max_bitmap;
  uint64_t migr_pgsize;
  int flags;
+uint32_t wait_time;
  QemuCond close_cv;
  AioContext *ctx;
  QEMUBH *req_bh;





[PATCH] block: remove bdrv_coroutine_enter

2022-12-15 Thread Paolo Bonzini
It has only one caller; inline it and remove the function.

Signed-off-by: Paolo Bonzini 
---
 block.c  | 6 --
 block/block-backend.c| 2 +-
 include/block/block-io.h | 5 -
 3 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/block.c b/block.c
index 9c2ac757e495..3f2bd128570e 100644
--- a/block.c
+++ b/block.c
@@ -7177,12 +7177,6 @@ void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
 }
 }
 
-void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co)
-{
-IO_CODE();
-aio_co_enter(bdrv_get_aio_context(bs), co);
-}
-
 static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
 {
 GLOBAL_STATE_CODE();
diff --git a/block/block-backend.c b/block/block-backend.c
index 2852a892de6c..a3e7901f291e 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1555,7 +1555,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
 acb->has_returned = false;
 
 co = qemu_coroutine_create(co_entry, acb);
-bdrv_coroutine_enter(blk_bs(blk), co);
+aio_co_enter(blk_get_aio_context(blk), co);
 
 acb->has_returned = true;
 if (acb->rwco.ret != NOT_DONE) {
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 2ed6214909d8..1fa717a545a0 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -210,11 +210,6 @@ AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs);
  */
 void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);
 
-/**
- * Transfer control to @co in the aio context of @bs
- */
-void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co);
-
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
 
 void bdrv_io_plug(BlockDriverState *bs);
-- 
2.38.1




[PULL 27/29] target/arm: align exposed ID registers with Linux

2022-12-15 Thread Peter Maydell
From: Zhuojia Shen 

In CPUID registers exposed to userspace, some registers were missing
and some fields were not exposed.  This patch aligns exposed ID
registers and their fields with what the upstream kernel currently
exposes.

Specifically, the following new ID registers/fields are exposed to
userspace:

ID_AA64PFR1_EL1.BT:   bits 3-0
ID_AA64PFR1_EL1.MTE:  bits 11-8
ID_AA64PFR1_EL1.SME:  bits 27-24

ID_AA64ZFR0_EL1.SVEver:   bits 3-0
ID_AA64ZFR0_EL1.AES:  bits 7-4
ID_AA64ZFR0_EL1.BitPerm:  bits 19-16
ID_AA64ZFR0_EL1.BF16: bits 23-20
ID_AA64ZFR0_EL1.SHA3: bits 35-32
ID_AA64ZFR0_EL1.SM4:  bits 43-40
ID_AA64ZFR0_EL1.I8MM: bits 47-44
ID_AA64ZFR0_EL1.F32MM:bits 55-52
ID_AA64ZFR0_EL1.F64MM:bits 59-56

ID_AA64SMFR0_EL1.F32F32:  bit 32
ID_AA64SMFR0_EL1.B16F32:  bit 34
ID_AA64SMFR0_EL1.F16F32:  bit 35
ID_AA64SMFR0_EL1.I8I32:   bits 39-36
ID_AA64SMFR0_EL1.F64F64:  bit 48
ID_AA64SMFR0_EL1.I16I64:  bits 55-52
ID_AA64SMFR0_EL1.FA64:bit 63

ID_AA64MMFR0_EL1.ECV: bits 63-60

ID_AA64MMFR1_EL1.AFP: bits 47-44

ID_AA64MMFR2_EL1.AT:  bits 35-32

ID_AA64ISAR0_EL1.RNDR:bits 63-60

ID_AA64ISAR1_EL1.FRINTTS: bits 35-32
ID_AA64ISAR1_EL1.BF16:bits 47-44
ID_AA64ISAR1_EL1.DGH: bits 51-48
ID_AA64ISAR1_EL1.I8MM:bits 55-52

ID_AA64ISAR2_EL1.WFxT:bits 3-0
ID_AA64ISAR2_EL1.RPRES:   bits 7-4
ID_AA64ISAR2_EL1.GPA3:bits 11-8
ID_AA64ISAR2_EL1.APA3:bits 15-12

The code is also refactored to use symbolic names for ID register fields
for better readability and maintainability.
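
For context on where the R_*_MASK names come from, a minimal sketch of
the hw/registerfields.h idiom; the ID_AA64PFR0 field positions below
match target/arm/cpu.h:

/* FIELD(reg, field, shift, length) generates R_<reg>_<field>_SHIFT,
 * R_<reg>_<field>_LENGTH and R_<reg>_<field>_MASK constants. */
FIELD(ID_AA64PFR0, FP, 16, 4)       /* R_ID_AA64PFR0_FP_MASK == 0xf0000 */
FIELD(ID_AA64PFR0, ADVSIMD, 20, 4)
FIELD(ID_AA64PFR0, SVE, 32, 4)
FIELD(ID_AA64PFR0, DIT, 48, 4)

Masks built this way replace the opaque hex literals the old table used,
which is what makes the new .exported_bits entries self-describing.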

Signed-off-by: Zhuojia Shen 
Message-id: 
ds7pr12mb6309bc9133877bcc6fc419feac...@ds7pr12mb6309.namprd12.prod.outlook.com
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 target/arm/helper.c | 96 +
 1 file changed, 79 insertions(+), 17 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index bac2ea62c44..6efc632b20d 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -7864,31 +7864,89 @@ void register_cp_regs_for_features(ARMCPU *cpu)
 #ifdef CONFIG_USER_ONLY
 static const ARMCPRegUserSpaceInfo v8_user_idregs[] = {
 { .name = "ID_AA64PFR0_EL1",
-  .exported_bits = 0x000f000f00ff,
-  .fixed_bits= 0x0011 },
+  .exported_bits = R_ID_AA64PFR0_FP_MASK |
+   R_ID_AA64PFR0_ADVSIMD_MASK |
+   R_ID_AA64PFR0_SVE_MASK |
+   R_ID_AA64PFR0_DIT_MASK,
+  .fixed_bits = (0x1 << R_ID_AA64PFR0_EL0_SHIFT) |
+(0x1 << R_ID_AA64PFR0_EL1_SHIFT) },
 { .name = "ID_AA64PFR1_EL1",
-  .exported_bits = 0x00f0 },
+  .exported_bits = R_ID_AA64PFR1_BT_MASK |
+   R_ID_AA64PFR1_SSBS_MASK |
+   R_ID_AA64PFR1_MTE_MASK |
+   R_ID_AA64PFR1_SME_MASK },
 { .name = "ID_AA64PFR*_EL1_RESERVED",
-  .is_glob = true },
-{ .name = "ID_AA64ZFR0_EL1"   },
+  .is_glob = true },
+{ .name = "ID_AA64ZFR0_EL1",
+  .exported_bits = R_ID_AA64ZFR0_SVEVER_MASK |
+   R_ID_AA64ZFR0_AES_MASK |
+   R_ID_AA64ZFR0_BITPERM_MASK |
+   R_ID_AA64ZFR0_BFLOAT16_MASK |
+   R_ID_AA64ZFR0_SHA3_MASK |
+   R_ID_AA64ZFR0_SM4_MASK |
+   R_ID_AA64ZFR0_I8MM_MASK |
+   R_ID_AA64ZFR0_F32MM_MASK |
+   R_ID_AA64ZFR0_F64MM_MASK },
+{ .name = "ID_AA64SMFR0_EL1",
+  .exported_bits = R_ID_AA64SMFR0_F32F32_MASK |
+   R_ID_AA64SMFR0_B16F32_MASK |
+   R_ID_AA64SMFR0_F16F32_MASK |
+   R_ID_AA64SMFR0_I8I32_MASK |
+   R_ID_AA64SMFR0_F64F64_MASK |
+   R_ID_AA64SMFR0_I16I64_MASK |
+   R_ID_AA64SMFR0_FA64_MASK },
 { .name = "ID_AA64MMFR0_EL1",
-  .fixed_bits= 0xff00 },
-{ .name = "ID_AA64MMFR1_EL1"  },
+  .exported_bits = R_ID_AA64MMFR0_ECV_MASK,
+  .fixed_bits = (0xf << R_ID_AA64MMFR0_TGRAN64_SHIFT) |
+(0xf << R_ID_AA64MMFR0_TGRAN4_SHIFT) },
+{ .name = "ID_AA64MMFR1_EL1",
+  .exported_bits = R_ID_AA64MMFR1_AFP_MASK },
+{ .name = "ID_AA64MMFR2_EL1",
+  .exported_bits = R_ID_AA64MMFR2_AT_MASK },
 { .name = "ID_AA64MMFR*_EL1_RESERVED",
-  .is_glob = true },
+  .is_glob = true },
 { .name = "ID_AA64DFR0_EL1",
-  .fixed_bits= 0x0

[PULL 24/29] hw/intc: Convert TYPE_ARM_GICV3_ITS to 3-phase reset

2022-12-15 Thread Peter Maydell
Convert the TYPE_ARM_GICV3_ITS device to 3-phase reset.

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20221109161444.3397405-9-peter.mayd...@linaro.org
---
 hw/intc/arm_gicv3_its.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hw/intc/arm_gicv3_its.c b/hw/intc/arm_gicv3_its.c
index 2ff21ed6bbe..57c79da5c55 100644
--- a/hw/intc/arm_gicv3_its.c
+++ b/hw/intc/arm_gicv3_its.c
@@ -27,7 +27,7 @@ DECLARE_OBJ_CHECKERS(GICv3ITSState, GICv3ITSClass,
 
 struct GICv3ITSClass {
 GICv3ITSCommonClass parent_class;
-void (*parent_reset)(DeviceState *dev);
+ResettablePhases parent_phases;
 };
 
 /*
@@ -1953,12 +1953,14 @@ static void gicv3_arm_its_realize(DeviceState *dev, Error **errp)
 }
 }
 
-static void gicv3_its_reset(DeviceState *dev)
+static void gicv3_its_reset_hold(Object *obj)
 {
-GICv3ITSState *s = ARM_GICV3_ITS_COMMON(dev);
+GICv3ITSState *s = ARM_GICV3_ITS_COMMON(obj);
 GICv3ITSClass *c = ARM_GICV3_ITS_GET_CLASS(s);
 
-c->parent_reset(dev);
+if (c->parent_phases.hold) {
+c->parent_phases.hold(obj);
+}
 
 /* Quiescent bit reset to 1 */
 s->ctlr = FIELD_DP32(s->ctlr, GITS_CTLR, QUIESCENT, 1);
@@ -2012,12 +2014,14 @@ static Property gicv3_its_props[] = {
 static void gicv3_its_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ResettableClass *rc = RESETTABLE_CLASS(klass);
 GICv3ITSClass *ic = ARM_GICV3_ITS_CLASS(klass);
 GICv3ITSCommonClass *icc = ARM_GICV3_ITS_COMMON_CLASS(klass);
 
 dc->realize = gicv3_arm_its_realize;
 device_class_set_props(dc, gicv3_its_props);
-device_class_set_parent_reset(dc, gicv3_its_reset, &ic->parent_reset);
+resettable_class_set_parent_phases(rc, NULL, gicv3_its_reset_hold, NULL,
+   &ic->parent_phases);
 icc->post_load = gicv3_its_post_load;
 }
 
-- 
2.25.1




Re: [PULL 00/14] Miscellaneous patches for 2022-12-14

2022-12-15 Thread Peter Maydell
On Wed, 14 Dec 2022 at 16:46, Markus Armbruster  wrote:
>
> The following changes since commit ea3a008d2d9ced9c4f93871c823baee237047f93:
>
>   Update VERSION for v7.2.0-rc4 (2022-12-06 19:53:34 -0500)
>
> are available in the Git repository at:
>
>   https://repo.or.cz/qemu/armbru.git tags/pull-misc-2022-12-14
>
> for you to fetch changes up to 6c5aaee4b61eb8bf60c7c30365432710b4346421:
>
>   ppc4xx_sdram: Simplify sdram_ddr_size() to return (2022-12-14 16:19:35 
> +0100)
>
> 
> Miscellaneous patches for 2022-12-14
>
> 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/7.1
for any user-visible changes.

-- PMM



Re: [PULL 00/14] Miscellaneous patches for 2022-12-14

2022-12-15 Thread Peter Maydell
On Thu, 15 Dec 2022 at 13:06, Peter Maydell  wrote:
>
> On Wed, 14 Dec 2022 at 16:46, Markus Armbruster  wrote:
> >
> > The following changes since commit ea3a008d2d9ced9c4f93871c823baee237047f93:
> >
> >   Update VERSION for v7.2.0-rc4 (2022-12-06 19:53:34 -0500)
> >
> > are available in the Git repository at:
> >
> >   https://repo.or.cz/qemu/armbru.git tags/pull-misc-2022-12-14
> >
> > for you to fetch changes up to 6c5aaee4b61eb8bf60c7c30365432710b4346421:
> >
> >   ppc4xx_sdram: Simplify sdram_ddr_size() to return (2022-12-14 16:19:35 
> > +0100)
> >
> > 
> > Miscellaneous patches for 2022-12-14
> >
> > 
>
>
> Applied, thanks.
>
> Please update the changelog at https://wiki.qemu.org/ChangeLog/7.1
> for any user-visible changes.

Should be https://wiki.qemu.org/ChangeLog/8.0 of course -- I forgot
I hadn't updated my canned email text...

-- PMM



[PULL 16/29] target/arm: Report FEAT_EVT for TCG '-cpu max'

2022-12-15 Thread Peter Maydell
Update the ID registers for TCG's '-cpu max' to report the
FEAT_EVT Enhanced Virtualization Traps support.
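
The FIELD_DP64() lines in the hunks below deposit a value into a named
bitfield of a 64-bit register image.  A minimal illustration, assuming
the EVT field sits at ID_AA64MMFR2[59:56] as declared in target/arm/cpu.h:

uint64_t t = cpu->isar.id_aa64mmfr2;

/*
 * Deposit 2 into bits [59:56]; equivalent to
 *   t = (t & ~MAKE_64BIT_MASK(56, 4)) | ((uint64_t)2 << 56);
 * EVT == 1 covers the TTLBIS/TICAB/TOCU/TID4 traps, EVT == 2 adds TTLBOS.
 */
t = FIELD_DP64(t, ID_AA64MMFR2, EVT, 2);
cpu->isar.id_aa64mmfr2 = t;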

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
---
 docs/system/arm/emulation.rst | 1 +
 target/arm/cpu64.c| 1 +
 target/arm/cpu_tcg.c  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
index e3af79bb8c9..b33d7c28dc1 100644
--- a/docs/system/arm/emulation.rst
+++ b/docs/system/arm/emulation.rst
@@ -26,6 +26,7 @@ the following architecture extensions:
 - FEAT_DoubleFault (Double Fault Extension)
 - FEAT_E0PD (Preventing EL0 access to halves of address maps)
 - FEAT_ETS (Enhanced Translation Synchronization)
+- FEAT_EVT (Enhanced Virtualization Traps)
 - FEAT_FCMA (Floating-point complex number instructions)
 - FEAT_FHM (Floating-point half-precision multiplication instructions)
 - FEAT_FP16 (Half-precision floating-point data processing)
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index cec64471b4e..2cf2ca4ce5a 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -1254,6 +1254,7 @@ static void aarch64_max_initfn(Object *obj)
 t = FIELD_DP64(t, ID_AA64MMFR2, FWB, 1);  /* FEAT_S2FWB */
 t = FIELD_DP64(t, ID_AA64MMFR2, TTL, 1);  /* FEAT_TTL */
 t = FIELD_DP64(t, ID_AA64MMFR2, BBM, 2);  /* FEAT_BBM at level 2 */
+t = FIELD_DP64(t, ID_AA64MMFR2, EVT, 2);  /* FEAT_EVT */
 t = FIELD_DP64(t, ID_AA64MMFR2, E0PD, 1); /* FEAT_E0PD */
 cpu->isar.id_aa64mmfr2 = t;
 
diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
index 9a2cef7d05a..568cbcfc524 100644
--- a/target/arm/cpu_tcg.c
+++ b/target/arm/cpu_tcg.c
@@ -65,6 +65,7 @@ void aa32_max_features(ARMCPU *cpu)
 t = FIELD_DP32(t, ID_MMFR4, AC2, 1);  /* ACTLR2, HACTLR2 */
 t = FIELD_DP32(t, ID_MMFR4, CNP, 1);  /* FEAT_TTCNP */
 t = FIELD_DP32(t, ID_MMFR4, XNX, 1);  /* FEAT_XNX */
+t = FIELD_DP32(t, ID_MMFR4, EVT, 2);  /* FEAT_EVT */
 cpu->isar.id_mmfr4 = t;
 
 t = cpu->isar.id_mmfr5;
-- 
2.25.1




Re: migration qtest failure: "query-migrate shows failed migration: Unable to write to socket: Broken pipe"

2022-12-15 Thread Peter Maydell
On Thu, 15 Dec 2022 at 12:52, Dr. David Alan Gilbert
 wrote:
>
> * Peter Maydell (peter.mayd...@linaro.org) wrote:
> > It doesn't seem to repro running manually, my guess is that
> > it happens because the machine is heavily loaded doing the
> > whole build-and-test cycle.
>
> Yeh; I think we'll still need a backtrace or better qmp log though to
> figure it out.

Unfortunately, often all you get is "what does 'make check' output".
Is there a way we can improve the test so it outputs something
more useful when it detects a failure ?

thanks
-- PMM



[PATCH] blkdebug: ignore invalid rules in non-coroutine context

2022-12-15 Thread Paolo Bonzini
blkdebug events can be called from either non-coroutine or coroutine
contexts.  However, suspend actions only make sense from within
a coroutine.  Currently, using those actions would lead to an abort() in
qemu_coroutine_yield() ("Co-routine is yielding to no one").  Catch them
and print an error instead.

Signed-off-by: Paolo Bonzini 
---
 block.c  |  2 +-
 block/blkdebug.c | 10 --
 include/block/block-io.h |  2 +-
 include/block/block_int-common.h |  3 ++-
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/block.c b/block.c
index 3f2bd128570e..49c66475c73e 100644
--- a/block.c
+++ b/block.c
@@ -6334,7 +6334,7 @@ BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
 return drv->bdrv_get_specific_stats(bs);
 }
 
-void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
+void coroutine_mixed_fn bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 {
 IO_CODE();
 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 4265ca125e25..ce297961b7db 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -31,6 +31,7 @@
 #include "block/qdict.h"
 #include "qemu/module.h"
 #include "qemu/option.h"
+#include "qemu/error-report.h"
 #include "qapi/qapi-visit-block-core.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qlist.h"
@@ -837,7 +838,7 @@ static void process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
 }
 }
 
-static void blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event)
+static void coroutine_mixed_fn blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 {
 BDRVBlkdebugState *s = bs->opaque;
 struct BlkdebugRule *rule, *next;
@@ -855,7 +856,12 @@ static void blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 }
 
 while (actions_count[ACTION_SUSPEND] > 0) {
-qemu_coroutine_yield();
+if (qemu_in_coroutine()) {
+qemu_coroutine_yield();
+} else {
+error_report("Non-coroutine event %s cannot suspend\n",
+ BlkdebugEvent_lookup.array[event]);
+}
 actions_count[ACTION_SUSPEND]--;
 }
 }
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 1fa717a545a0..0e7032a23936 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -175,7 +175,7 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t 
size);
 void bdrv_enable_copy_on_read(BlockDriverState *bs);
 void bdrv_disable_copy_on_read(BlockDriverState *bs);
 
-void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);
+void coroutine_mixed_fn bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);
 
 #define BLKDBG_EVENT(child, evt) \
 do { \
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index c34c525fa6ba..1d4fd5094a5b 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -726,7 +726,8 @@ struct BlockDriver {
 int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_check)(
 BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix);
 
-void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
+void coroutine_mixed_fn (*bdrv_debug_event)(BlockDriverState *bs,
+BlkdebugEvent event);
 
 /* io queue for linux-aio */
 void (*bdrv_io_plug)(BlockDriverState *bs);
-- 
2.38.1




Re: [PATCH v3 1/3] update-linux-headers: Version 6.1-rc8

2022-12-15 Thread Bin Meng
On Thu, Dec 8, 2022 at 10:54 PM Mayuresh Chitale
 wrote:
>
> Sync headers with kernel commit 76dcd734eca2
>
> Signed-off-by: Mayuresh Chitale 
> Reviewed-by: Andrew Jones 
> ---
>  include/standard-headers/drm/drm_fourcc.h |  34 -
>  include/standard-headers/linux/ethtool.h  |  63 +++-
>  include/standard-headers/linux/fuse.h |   6 +-
>  .../linux/input-event-codes.h |   1 +
>  include/standard-headers/linux/virtio_blk.h   |  19 +++
>  linux-headers/asm-generic/hugetlb_encode.h|  26 ++--
>  linux-headers/asm-generic/mman-common.h   |   2 +
>  linux-headers/asm-mips/mman.h |   2 +
>  linux-headers/asm-riscv/kvm.h |   4 +
>  linux-headers/linux/kvm.h |   1 +
>  linux-headers/linux/psci.h|  14 ++
>  linux-headers/linux/userfaultfd.h |   4 +
>  linux-headers/linux/vfio.h| 142 ++
>  13 files changed, 298 insertions(+), 20 deletions(-)
>

Acked-by: Bin Meng 



Re: [PATCH v1 24/24] vfio-user: add trace points

2022-12-15 Thread Cédric Le Goater

On 11/9/22 00:13, John Johnson wrote:

Signed-off-by: John G Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
---
  hw/vfio/trace-events | 15 +++
  hw/vfio/user.c   | 26 ++
  2 files changed, 41 insertions(+)



I would introduce the traces progressively in the patchset with the
routine being traced.

Thanks,

C.



diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 73dffe9..c27cec7 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -166,3 +166,18 @@ vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t dat
  vfio_load_cleanup(const char *name) " (%s)"
  vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
  vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+
+# user.c
+vfio_user_recv_hdr(const char *name, uint16_t id, uint16_t cmd, uint32_t size, uint32_t flags) " (%s) id 0x%x cmd 0x%x size 0x%x flags 0x%x"
+vfio_user_recv_read(uint16_t id, int read) " id 0x%x read 0x%x"
+vfio_user_recv_request(uint16_t cmd) " command 0x%x"
+vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x"
+vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s"
+vfio_user_dma_map(uint64_t iova, uint64_t size, uint64_t off, uint32_t flags, bool will_commit) " iova 0x%"PRIx64" size 0x%"PRIx64" off 0x%"PRIx64" flags 0x%x will_commit %d"
+vfio_user_dma_unmap(uint64_t iova, uint64_t size, uint32_t flags, bool dirty, bool will_commit) " iova 0x%"PRIx64" size 0x%"PRIx64" flags 0x%x dirty %d will_commit %d"
+vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d"
+vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index %d flags 0x%x size 0x%"PRIx64
+vfio_user_get_irq_info(uint32_t index, uint32_t flags, uint32_t count) " index %d flags 0x%x count %d"
+vfio_user_set_irqs(uint32_t index, uint32_t start, uint32_t count, uint32_t flags) " index %d start %d count %d flags 0x%x"
+vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d"
+vfio_user_wrmulti(const char *s, uint64_t wr_cnt) " %s count 0x%"PRIx64
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
index 4ed305b..74e1714 100644
--- a/hw/vfio/user.c
+++ b/hw/vfio/user.c
@@ -30,6 +30,8 @@
  #include "qapi/qmp/qnum.h"
  #include "qapi/qmp/qbool.h"
  #include "user.h"
+#include "trace.h"
+
  
  
  /*

@@ -108,6 +110,8 @@ static int vfio_user_send_qio(VFIOProxy *proxy, VFIOUserMsg *msg)
  vfio_user_shutdown(proxy);
  error_report_err(local_err);
  }
+trace_vfio_user_send_write(msg->hdr->id, ret);
+
  return ret;
  }
  
@@ -225,6 +229,7 @@ static int vfio_user_complete(VFIOProxy *proxy, Error **errp)

  }
  return ret;
  }
+trace_vfio_user_recv_read(msg->hdr->id, ret);
  
  msgleft -= ret;

  data += ret;
@@ -332,6 +337,8 @@ static int vfio_user_recv_one(VFIOProxy *proxy)
  error_setg(&local_err, "unknown message type");
  goto fatal;
  }
+trace_vfio_user_recv_hdr(proxy->sockname, hdr.id, hdr.command, hdr.size,
+ hdr.flags);
  
  /*

   * For replies, find the matching pending request.
@@ -408,6 +415,7 @@ static int vfio_user_recv_one(VFIOProxy *proxy)
  if (ret <= 0) {
  goto fatal;
  }
+trace_vfio_user_recv_read(hdr.id, ret);
  
  msgleft -= ret;

  data += ret;
@@ -546,6 +554,7 @@ static void vfio_user_request(void *opaque)
  QTAILQ_INIT(&free);
  QTAILQ_FOREACH_SAFE(msg, &new, next, m1) {
  QTAILQ_REMOVE(&new, msg, next);
+trace_vfio_user_recv_request(msg->hdr->command);
  proxy->request(proxy->req_arg, msg);
  QTAILQ_INSERT_HEAD(&free, msg, next);
  }
@@ -1265,6 +1274,7 @@ int vfio_user_validate_version(VFIOProxy *proxy, Error **errp)
  msgp->minor = VFIO_USER_MINOR_VER;
  memcpy(&msgp->capabilities, caps->str, caplen);
  g_string_free(caps, true);
+trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities);
  
  vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, false);

  if (msgp->hdr.flags & VFIO_USER_ERROR) {
@@ -1288,6 +1298,7 @@ int vfio_user_validate_version(VFIOProxy *proxy, Error **errp)
  return -1;
  }
  
+trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities);

  return 0;
  }
  
@@ -1305,6 +1316,8 @@ static int vfio_user_dma_map(VFIOProxy *proxy,

  msgp->offset = map->vaddr;
  msgp->iova = map->iova;
  msgp->size = map->size;
+trace_vfio_user_dma_map(msgp->iova, msgp->size, msgp->offset, msgp->flags,
+will_commit);
  
  /*

   * The will_commit case sends withou

[PATCH] qemu-io: do not reinvent the blk_pwrite_zeroes wheel

2022-12-15 Thread Paolo Bonzini
qemu-io's do_co_pwrite_zeroes is reinventing the coroutine wrapper
blk_pwrite_zeroes.  Just use the real thing directly.

Signed-off-by: Paolo Bonzini 
---
 qemu-io-cmds.c | 55 +-
 1 file changed, 9 insertions(+), 46 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 952dc940f1df..7a412d6512fb 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -572,54 +572,17 @@ static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset,
 return 1;
 }
 
-typedef struct {
-BlockBackend *blk;
-int64_t offset;
-int64_t bytes;
-int64_t *total;
-int flags;
-int ret;
-bool done;
-} CoWriteZeroes;
-
-static void coroutine_fn co_pwrite_zeroes_entry(void *opaque)
-{
-CoWriteZeroes *data = opaque;
-
-data->ret = blk_co_pwrite_zeroes(data->blk, data->offset, data->bytes,
- data->flags);
-data->done = true;
-if (data->ret < 0) {
-*data->total = data->ret;
-return;
-}
-
-*data->total = data->bytes;
-}
-
-static int do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+static int do_pwrite_zeroes(BlockBackend *blk, int64_t offset,
int64_t bytes, int flags, int64_t *total)
 {
-Coroutine *co;
-CoWriteZeroes data = {
-.blk= blk,
-.offset = offset,
-.bytes  = bytes,
-.total  = total,
-.flags  = flags,
-.done   = false,
-};
+int ret = blk_pwrite_zeroes(blk, offset, bytes,
+flags | BDRV_REQ_ZERO_WRITE);
 
-co = qemu_coroutine_create(co_pwrite_zeroes_entry, &data);
-bdrv_coroutine_enter(blk_bs(blk), co);
-while (!data.done) {
-aio_poll(blk_get_aio_context(blk), true);
-}
-if (data.ret < 0) {
-return data.ret;
-} else {
-return 1;
+if (ret < 0) {
+return ret;
 }
+*total = bytes;
+return 1;
 }
 
 static int do_write_compressed(BlockBackend *blk, char *buf, int64_t offset,
@@ -1042,7 +1005,7 @@ static void write_help(void)
 " -C, -- report statistics in a machine parsable format\n"
 " -q, -- quiet mode, do not show I/O statistics\n"
 " -u, -- with -z, allow unmapping\n"
-" -z, -- write zeroes using blk_co_pwrite_zeroes\n"
+" -z, -- write zeroes using blk_pwrite_zeroes\n"
 "\n");
 }
 
@@ -1199,7 +1162,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 if (bflag) {
 ret = do_save_vmstate(blk, buf, offset, count, &total);
 } else if (zflag) {
-ret = do_co_pwrite_zeroes(blk, offset, count, flags, &total);
+ret = do_pwrite_zeroes(blk, offset, count, flags, &total);
 } else if (cflag) {
 ret = do_write_compressed(blk, buf, offset, count, &total);
 } else {
-- 
2.38.1




QEMU PPC VLE support

2022-12-15 Thread Stefan Hajnoczi
Hi,
I came across this post where Ralf-Philipp is looking for a freelancer
to implement PPC VLE support in QEMU:
https://chaos.social/@rpw/109516326028642262

It mentions upstreaming the code and I've included QEMU PPC
maintainers in this email so they can discuss the project with
Ralf-Philipp. That way the chances of a mergable result will be
maximized.

The Rust aspect is interesting, but QEMU does not have any existing
targets implemented in Rust. It might be a major effort to create the
necessary C<->Rust interfacing, so I'm not sure whether that's
realistic given the timeframe for the project.

Does anyone have time to take on this freelancing project or know
someone who is available?

Thanks,
Stefan



Re: [PULL 00/23] First batch of s390x, qtest, CI and misc patches for 8.0

2022-12-15 Thread Peter Maydell
On Wed, 14 Dec 2022 at 10:16, Thomas Huth  wrote:
>
>  Hi!
>
> The following changes since commit 5204b499a6cae4dfd9fe762d5e6e82224892383b:
>
>   mailmap: Fix Stefan Weil author email (2022-12-13 15:56:57 -0500)
>
> are available in the Git repository at:
>
>   https://gitlab.com/thuth/qemu.git tags/pull-request-2022-12-14
>
> for you to fetch changes up to 8eeb98e2ea03639e743fdae82ae69d571d8ef0a3:
>
>   tests/qtest/vhost-user-blk-test: don't abort all qtests on missing envar 
> (2022-12-14 08:55:37 +0100)
>
> 
> * s390x PCI fixes and improvements (for the ISM device)
> * Fix emulated MVCP and MVCS s390x instructions
> * Clean-ups for the e1000e qtest
> * Enable qtests on Windows
> * Update FreeBSD CI to version 12.4
> * Check --disable-tcg for ppc64 in the CI
> * Improve scripts/make-releases a little bit
> * Many other misc small clean-ups and fixes here and there

Hi; this fails to compile:

../../util/qemu-config.c: In function 'objprop_to_cmdline_prop':
../../util/qemu-config.c:165:13: error: 'CommandLineParameterInfo' has
no member named 'has_help'
  165 | info->has_help = true;
  | ^~
../../util/qemu-config.c: In function 'query_all_machine_properties':
../../util/qemu-config.c:217:9: error: 'CommandLineParameterInfo' has
no member named 'has_help'
  217 | info->has_help = true;
  | ^~

I'm afraid your pullreq has crossed in the mail with Markus'
QAPI one, which refactored things so these QAPI structs
no longer have has_* fields.

thanks
-- PMM



Re: migration qtest failure: "query-migrate shows failed migration: Unable to write to socket: Broken pipe"

2022-12-15 Thread Dr. David Alan Gilbert
* Peter Maydell (peter.mayd...@linaro.org) wrote:
> On Thu, 15 Dec 2022 at 12:52, Dr. David Alan Gilbert
>  wrote:
> >
> > * Peter Maydell (peter.mayd...@linaro.org) wrote:
> > > It doesn't seem to repro running manually, my guess is that
> > > it happens because the machine is heavily loaded doing the
> > > whole build-and-test cycle.
> >
> > Yeh; I think we'll still need a backtrace or better qmp log though to
> > figure it out.
> 
> Unfortunately, often all you get is "what does 'make check' output".
> Is there a way we can improve the test so it outputs something
> more useful when it detects a failure ?

I can't think how to improve the test itself like that; could you bind
assertion failures to call/print a backtrace:

https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/backtrace.3.html

?

Dave

> thanks
> -- PMM
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
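
As a minimal, glibc-specific sketch of that idea (hypothetical helper
names, not an existing qtest API; backtrace() and backtrace_symbols_fd()
are the real <execinfo.h> calls):

#include <execinfo.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>

static void abort_backtrace_handler(int sig)
{
    void *frames[64];
    int n = backtrace(frames, 64);

    /* backtrace_symbols_fd() writes straight to the fd and, unlike
     * backtrace_symbols(), does not allocate, so it is usable here. */
    backtrace_symbols_fd(frames, n, STDERR_FILENO);
    _exit(EXIT_FAILURE);
}

static void install_abort_backtrace(void)
{
    signal(SIGABRT, abort_backtrace_handler);
}

Installing this early in a qtest would at least turn a bare g_assert()
abort into a symbolized stack in the 'make check' log (build with
-rdynamic for readable symbol names).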




Re: [PATCH] blkdebug: ignore invalid rules in non-coroutine context

2022-12-15 Thread Kevin Wolf
Am 15.12.2022 um 14:02 hat Paolo Bonzini geschrieben:
> blkdebug events can be called from either non-coroutine or coroutine
> contexts.  However, suspend actions only make sense from within
> a coroutine.  Currently, using those action would lead to an abort() in
> qemu_coroutine_yield() ("Co-routine is yielding to no one").  Catch them
> and print an error instead.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block.c  |  2 +-
>  block/blkdebug.c | 10 --
>  include/block/block-io.h |  2 +-
>  include/block/block_int-common.h |  3 ++-
>  4 files changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 3f2bd128570e..49c66475c73e 100644
> --- a/block.c
> +++ b/block.c
> @@ -6334,7 +6334,7 @@ BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
>  return drv->bdrv_get_specific_stats(bs);
>  }
>  
> -void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
> +void coroutine_mixed_fn bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)

coroutine_mixed_fn isn't a thing. I assume this depends on some patch
you haven't sent yet?

Kevin




[PATCH v11 2/5] vdpa: add vdpa-dev support

2022-12-15 Thread Longpeng(Mike)
From: Longpeng 

Support vdpa-dev, so we can use the device directly:

-M microvm -m 512m -smp 2 -kernel ... -initrd ... -device \
vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-x
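
As a standalone illustration of the probing done at realize time, a
hedged sketch; VHOST_VDPA_GET_DEVICE_ID is the real ioctl from
<linux/vhost.h>, and the device path is only an example:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vhost.h>

int main(void)
{
    int fd = open("/dev/vhost-vdpa-0", O_RDWR);   /* example path */
    __u32 dev_id = 0;

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, VHOST_VDPA_GET_DEVICE_ID, &dev_id) < 0) {
        perror("VHOST_VDPA_GET_DEVICE_ID");
        close(fd);
        return 1;
    }
    printf("virtio device id: %u\n", dev_id);     /* 1 = net, 2 = block */
    close(fd);
    return 0;
}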

Reviewed-by: Stefano Garzarella 
Acked-by: Jason Wang 
Signed-off-by: Longpeng 
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/meson.build|   1 +
 hw/virtio/vdpa-dev.c | 376 +++
 include/hw/virtio/vdpa-dev.h |  43 
 4 files changed, 425 insertions(+)
 create mode 100644 hw/virtio/vdpa-dev.c
 create mode 100644 include/hw/virtio/vdpa-dev.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index cbfd8c7173..89e9e426d8 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -85,3 +85,8 @@ config VHOST_USER_GPIO
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_VDPA_DEV
+bool
+default y
+depends on VIRTIO && VHOST_VDPA && LINUX
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index dfed1e7af5..54d6d29af7 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -31,6 +31,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: 
files('vhost-user-gpio.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: 
files('vhost-user-gpio-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
new file mode 100644
index 00..dbc4f8001d
--- /dev/null
+++ b/hw/virtio/vdpa-dev.c
@@ -0,0 +1,376 @@
+/*
+ * Vhost Vdpa Device
+ *
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All Rights Reserved.
+ *
+ * Authors:
+ *   Longpeng 
+ *
+ * Largely based on the "vhost-user-blk-pci.c" and "vhost-user-blk.c"
+ * implemented by:
+ *   Changpeng Liu 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include 
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "hw/qdev-core.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vdpa-dev.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/runstate.h"
+
+static void
+vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+/* Nothing to do */
+}
+
+static uint32_t
+vhost_vdpa_device_get_u32(int fd, unsigned long int cmd, Error **errp)
+{
+uint32_t val = (uint32_t)-1;
+
+if (ioctl(fd, cmd, &val) < 0) {
+error_setg(errp, "vhost-vdpa-device: cmd 0x%lx failed: %s",
+   cmd, strerror(errno));
+}
+
+return val;
+}
+
+static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VhostVdpaDevice *v = VHOST_VDPA_DEVICE(vdev);
+uint16_t max_queue_size;
+struct vhost_virtqueue *vqs;
+int i, ret;
+
+if (!v->vhostdev) {
+error_setg(errp, "vhost-vdpa-device: vhostdev are missing");
+return;
+}
+
+v->vhostfd = qemu_open(v->vhostdev, O_RDWR, errp);
+if (*errp) {
+return;
+}
+v->vdpa.device_fd = v->vhostfd;
+
+v->vdev_id = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_DEVICE_ID, errp);
+if (*errp) {
+goto out;
+}
+
+max_queue_size = vhost_vdpa_device_get_u32(v->vhostfd,
+   VHOST_VDPA_GET_VRING_NUM, errp);
+if (*errp) {
+goto out;
+}
+
+if (v->queue_size > max_queue_size) {
+error_setg(errp, "vhost-vdpa-device: invalid queue_size: %u (max:%u)",
+   v->queue_size, max_queue_size);
+goto out;
+} else if (!v->queue_size) {
+v->queue_size = max_queue_size;
+}
+
+v->num_queues = vhost_vdpa_device_get_u32(v->vhostfd,
+  VHOST_VDPA_GET_VQS_COUNT, errp);
+if (*errp) {
+goto out;
+}
+
+if (!v->num_queues || v->num_queues > VIRTIO_QUEUE_MAX) {
+error_setg(errp, "invalid number of virtqueues: %u (max:%u)",
+   v->num_queues, VIRTIO_QUEUE_MAX);
+goto out;
+}
+
+v->dev.nvqs = v->num_queues;
+vqs = g_new0(struct vhost_virtqueue, v->dev.nvqs);
+v->dev.vqs = vqs;
+v->dev.vq_index = 0;
+v->dev.vq_index_end = v->dev.nvqs;
+v->dev.backend_features = 0;
+v->started = false;
+
+ret = vhost_dev_init(&v->dev, &v->vdpa, VHOST_BACKEND_TYPE_VDPA, 0, NULL);
+if (ret < 0) {
+error_s
