date:20250203

On Mon, Feb 03, 2025 at 01:25:29PM -0500, Stefan Hajnoczi wrote:
> BLOCK_OP_TYPE_DATAPLANE prevents BlockDriverState from being used by
> virtio-blk/virtio-scsi with IOThread. Commit b112a65c52aa ("block:
> declare blockjobs and dataplane friends!") eliminated the main reason
> for this blocker in 2014.

Wow, that's a long time.

> 
> Nowadays the block layer supports I/O from multiple AioContexts, so
> there is even less reason to block IOThread users. Any legitimate
> reasons related to interference would probably also apply to
> non-IOThread users.
> 
> The only remaining users are bdrv_op_unblock(BLOCK_OP_TYPE_DATAPLANE)
> calls after bdrv_op_block_all(). If we remove BLOCK_OP_TYPE_DATAPLANE
> their behavior doesn't change.
> 
> Existing bdrv_op_block_all() callers that don't explicitly unblock
> BLOCK_OP_TYPE_DATAPLANE seem to do so simply because no one bothered to
> rather than because it is necessary to keep BLOCK_OP_TYPE_DATAPLANE
> blocked.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  include/block/block-common.h | 1 -
>  block/replication.c  | 1 -
>  blockjob.c   | 2 --
>  hw/block/virtio-blk.c| 9 -
>  hw/scsi/virtio-scsi.c| 3 ---
>  5 files changed, 16 deletions(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

Re: [PATCH v3 09/26] target/arm/kvm-rme: Initialize Realm memory

2025-02-03 Thread Gavin Shan


On 11/26/24 5:56 AM, Jean-Philippe Brucker wrote:

Initialize the IPA state of RAM. Collect the images copied into guest
RAM into a sorted list, and issue POPULATE_REALM KVM ioctls once we've
created the Realm Descriptor. The images are part of the Realm Initial
Measurement.

Signed-off-by: Jean-Philippe Brucker 
---
v2->v3: RIPAS is now initialized separately
---
  target/arm/kvm_arm.h |  14 +
  target/arm/kvm-rme.c | 128 +++
  2 files changed, 142 insertions(+)

diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h
index 8b52a881b0..67db09a424 100644
--- a/target/arm/kvm_arm.h
+++ b/target/arm/kvm_arm.h
@@ -255,6 +255,16 @@ int kvm_arm_rme_vm_type(MachineState *ms);
   */
  int kvm_arm_rme_vcpu_init(CPUState *cs);
  
+/*

+ * kvm_arm_rme_init_guest_ram
+ * @base: base address of RAM
+ * @size: size of RAM
+ *
+ * If the user requested a Realm, set the base and size of guest RAM, in order
+ * to initialize the Realm IPA space.
+ */
+void kvm_arm_rme_init_guest_ram(hwaddr base, size_t size);
+
  #else
  
  /*

@@ -281,6 +291,10 @@ static inline bool kvm_arm_mte_supported(void)
  return false;
  }
  
+static inline void kvm_arm_rme_init_guest_ram(hwaddr base, size_t size)

+{
+}
+
  /*
   * These functions should never actually be called without KVM support.
   */
diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c
index e3cc37538a..83a29421df 100644
--- a/target/arm/kvm-rme.c
+++ b/target/arm/kvm-rme.c
@@ -9,6 +9,7 @@
  #include "exec/confidential-guest-support.h"
  #include "hw/boards.h"
  #include "hw/core/cpu.h"
+#include "hw/loader.h"
  #include "kvm_arm.h"
  #include "migration/blocker.h"
  #include "qapi/error.h"
@@ -20,16 +21,85 @@
  #define TYPE_RME_GUEST "rme-guest"
  OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST)
  
+#define RME_PAGE_SIZE qemu_real_host_page_size()

+
  struct RmeGuest {
  ConfidentialGuestSupport parent_obj;
+Notifier rom_load_notifier;
+GSList *ram_regions;
+
+hwaddr ram_base;
+size_t ram_size;
  };
  


s/size_t/hwaddr. To be consistent with RmeRamRegion, we may reuse
it like below.

struct RmeGuest {
:
GSList *populate_ram_regions;
RmeRamRegion init_ram_region;
};


  OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(RmeGuest, rme_guest, RME_GUEST,
CONFIDENTIAL_GUEST_SUPPORT,
{ TYPE_USER_CREATABLE }, { })
  
+typedef struct {

+hwaddr base;
+hwaddr size;
+} RmeRamRegion;
+
  static RmeGuest *rme_guest;
  
+static int rme_init_ram(hwaddr base, size_t size, Error **errp)

+{
+int ret;
+uint64_t start = QEMU_ALIGN_DOWN(base, RME_PAGE_SIZE);
+uint64_t end = QEMU_ALIGN_UP(base + size, RME_PAGE_SIZE);
+struct kvm_cap_arm_rme_init_ipa_args init_args = {
+.init_ipa_base = start,
+.init_ipa_size = end - start,
+};
+
+ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0,
+KVM_CAP_ARM_RME_INIT_IPA_REALM,
+(intptr_t)&init_args);
+if (ret) {
+error_setg_errno(errp, -ret,
+ "failed to init RAM [0x%"HWADDR_PRIx", 
0x%"HWADDR_PRIx")",

 ^^
^^^
The type for 'start' and 'end' would be 'hwaddr'.


+ start, end);
+}
+
+return ret;
+}
+
+static int rme_populate_range(hwaddr base, size_t size, bool measure,
+  Error **errp)
+{
+int ret;
+uint64_t start = QEMU_ALIGN_DOWN(base, RME_PAGE_SIZE);
+uint64_t end = QEMU_ALIGN_UP(base + size, RME_PAGE_SIZE);
+struct kvm_cap_arm_rme_populate_realm_args populate_args = {
+.populate_ipa_base = start,
+.populate_ipa_size = end - start,
+.flags = measure ? KVM_ARM_RME_POPULATE_FLAGS_MEASURE : 0,
+};
+
+ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0,
+KVM_CAP_ARM_RME_POPULATE_REALM,
+(intptr_t)&populate_args);
+if (ret) {
+error_setg_errno(errp, -ret,
+   "failed to populate realm [0x%"HWADDR_PRIx", 
0x%"HWADDR_PRIx")",
+   start, end);
+}
+return ret;
+}
+
+static void rme_populate_ram_region(gpointer data, gpointer err)
+{
+Error **errp = err;
+const RmeRamRegion *region = data;
+
+if (*errp) {
+return;
+}
+
+rme_populate_range(region->base, region->size, /* measure */ true, errp);
+}
+
  static int rme_init_cpus(Error **errp)
  {
  int ret;
@@ -60,6 +130,16 @@ static int rme_create_realm(Error **errp)
  return -1;
  }
  
+if (rme_init_ram(rme_guest->ram_base, rme_guest->ram_size, errp)) {

+return -1;
+}
+
+g_slist_foreach(rme_guest->ram_regions, rme_populate_ram_region, errp);
+g_slist_free_full(g_steal_pointer(&rme_guest->ram_regions), g_free);
+if (*errp) {
+return -1;
+}
+

Re: [PATCH v3 08/26] hw/core/loader: Add ROM loader notifier

2025-02-03 Thread Gavin Shan


On 11/26/24 5:56 AM, Jean-Philippe Brucker wrote:

Add a function to register a notifier, that is invoked after a ROM gets
loaded into guest memory.

It will be used by Arm confidential guest support, in order to register
all blobs loaded into memory with KVM, so that their content is moved
into Realm state and measured into the initial VM state.

Signed-off-by: Jean-Philippe Brucker 
---
  include/hw/loader.h | 15 +++
  hw/core/loader.c| 15 +++
  2 files changed, 30 insertions(+)

diff --git a/include/hw/loader.h b/include/hw/loader.h
index 7f6d06b956..0cd9905f97 100644
--- a/include/hw/loader.h
+++ b/include/hw/loader.h
@@ -353,6 +353,21 @@ void *rom_ptr_for_as(AddressSpace *as, hwaddr addr, size_t 
size);
  ssize_t rom_add_vga(const char *file);
  ssize_t rom_add_option(const char *file, int32_t bootindex);
  
+typedef struct RomLoaderNotify {

+/* Parameters passed to rom_add_blob() */
+hwaddr addr;
+size_t len;
+size_t max_len;
+} RomLoaderNotify;
+


I would suggest renaming it to RomLoaderNotifyData, since it's the
data passed to the notifier.


+/**
+ * rom_add_load_notifier - Add a notifier for loaded images
+ *
+ * Add a notifier that will be invoked with a RomLoaderNotify structure for 
each
+ * blob loaded into guest memory, after the blob is loaded.
+ */
+void rom_add_load_notifier(Notifier *notifier);
+
  /* This is the usual maximum in uboot, so if a uImage overflows this, it would
   * overflow on real hardware too. */
  #define UBOOT_MAX_GUNZIP_BYTES (64 << 20)
diff --git a/hw/core/loader.c b/hw/core/loader.c
index 31593a1171..759a62cf58 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -67,6 +67,8 @@
  #include 
  
  static int roms_loaded;

+static NotifierList rom_loader_notifier =
+NOTIFIER_LIST_INITIALIZER(rom_loader_notifier);
  
  /* return the size or -1 if error */

  int64_t get_image_size(const char *filename)
@@ -1179,6 +1181,11 @@ MemoryRegion *rom_add_blob(const char *name, const void 
*blob, size_t len,
  return mr;
  }
  
+void rom_add_load_notifier(Notifier *notifier)

+{
+notifier_list_add(&rom_loader_notifier, notifier);
+}
+
  /* This function is specific for elf program because we don't need to allocate
   * all the rom. We just allocate the first part and the rest is just zeros. 
This
   * is why romsize and datasize are different. Also, this function takes its 
own
@@ -1220,6 +1227,7 @@ ssize_t rom_add_option(const char *file, int32_t 
bootindex)
  static void rom_reset(void *unused)
  {
  Rom *rom;
+RomLoaderNotify notify;
  
  QTAILQ_FOREACH(rom, &roms, next) {

  if (rom->fw_file) {
@@ -1268,6 +1276,13 @@ static void rom_reset(void *unused)
  cpu_flush_icache_range(rom->addr, rom->datasize);
  
  trace_loader_write_rom(rom->name, rom->addr, rom->datasize, rom->isrom);

+
+notify = (RomLoaderNotify) {
+.addr = rom->addr,
+.len = rom->datasize,
+.max_len = rom->romsize,
+};
+notifier_list_notify(&rom_loader_notifier, &notify);
  }
  }
  


Thanks,
Gavin

Re: [PATCH 0/1] meson: Deprecate 32-bit host systems


On 3/2/25 10:10, Alex Bennée wrote:

Peter Maydell  writes:


On Wed, 29 Jan 2025 at 06:23, Thomas Huth  wrote:

So unless someone complains immediately with a good reason, I'm also in
favor of marking it as deprecated now. If then someone complains during the
deprecation period, we still can reconsider and remove the deprecation note
again.


Well, I mean the reason would be that I suspect we do still have
users who are using QEMU for some purposes on 32-bit arm hosts.
That doesn't mean they're trying to run massively complex or
high memory guests or that they care that our whole test suite
doesn't run.

I'm not really strongly opposed to dropping 32-bit host support,
but I don't think a thread on qemu-devel is exactly likely to
get the attention of the people who might be using this
functionality. (You could argue that functionality without
representation among the developer community is fair game
for being dumped even if it has users, of course.)


FWIW random internet poll:

   https://mastodon.org.uk/deck/@stsquad/113905257703721811

26% 32 bit
74% 64 bit

with 41 respondents.


Note that some respondents who voted to maintain 32-bit support
mixed 32-bit host with 32-bit guests.

Re: [PATCH 0/7] hw/arm/raspi4b: Add models with 4GB and 8GB of DRAM

On 3/2/25 15:50, Daniel P. Berrangé wrote:

On Mon, Feb 03, 2025 at 02:45:06PM +, Peter Maydell wrote:

On Mon, 3 Feb 2025 at 14:33, Daniel P. Berrangé wrote:

On Mon, Feb 03, 2025 at 02:29:49PM +, Alex Bennée wrote:

Peter Maydell writes:

On Sat, 1 Feb 2025 at 12:57, BALATON Zoltan wrote:

On Sat, 1 Feb 2025, Philippe Mathieu-Daudé wrote:

- Deprecate the 'raspi4b' machine name, renaming it as
'raspi4b-1g' on 32-bit hosts, 'raspi4b-2g' otherwise.
- Add the 'raspi4b-4g' and 'raspi4b-8g' machines, with
respectively 4GB and 8GB of DRAM.

IMHO (meaning you can ignore it, just my opinion) if the only difference
is the memory size -machine raspi4b -memory 4g would be better user
experience than having a lot of different machines.

Yes, I think I agree. We have a way for users to specify
how much memory they want, and I think it makes more sense
to use that than to have lots of different machine types.

I guess for the Pi we should validate the -memory supplied is one of the
supported grid of devices rather than an arbitrary value?

If the user wants to create a rpi4 with 6 GB RAM why should we stop
them ? It is their choice if they want to precisely replicate RAM
size from a physical model, or use something different when virtualized.

The board revision code (reported to the guest via the emulated
firmware interface) only supports reporting 256MB, 512MB,
1GB, 2GB, 4GB or 8GB:

https://www.raspberrypi.com/documentation/computers/raspberry-pi.html#new-style-revision-codes

I think it would be valid to report the revision code for the memory
size that doesn't exceed what QEMU has configured. eg if configured
with 6 GB, then report code for 4 GB.

We need to distinguish between physical machines and virtual ones.

Guests on virtual machines have some way to figure the virtual
hardware (ACPI tables, DeviceTree blob, fw-cfg, ...).

Guests for physical machines usually expect fixed hardware (not
considering devices on busses).

For the particular case of the Raspberry Pi machines, their
bootloader gets the board layout by reading the
RPI_FWREQ_GET_BOARD_REVISION constant value.

What would be the point of emulating a raspi machine with 6GB
if the FW is not going to consider anything beyond 4GB?
Besides, if someone modifies a guest to work with 6GB, it won't work
on real HW.

For Arm embedded boards we mostly tend to "restrict the user
to what you can actually do", except for older boards where
we tended not to write any kind of sanity checking on CPU
type, memory size, etc.

If we're going to strictly limit the memory size that's accepted, I wonder
how we could inform users/mgmt apps about what's permitted?

Expressing valid combinations of configs across different args gets
pretty complicated quickly :-(

I'll try to address Zoltan's and Peter's request to have a dynamic raspi
machine. It is a bit unfortunate we didn't insist on that when we
decided to expose a fixed set of existing boards in order to not be
bothered by inconsistent bug reports, back in 2019.

Regards,

Phil.

[PATCH qemu 4/5] hw/mem/cxl_type3: Ensure errp is set on realization failure

From: Li Zhijian 

Simply pass the errp to its callee which will set errp if needed, to
enhance error reporting for CXL Type 3 device initialization by setting
the errp when realization functions fail.

Previously, failing to set `errp` could result in errors being overlooked,
causing the system to mistakenly treat failure scenarios as successful and
potentially leading to redundant cleanup operations in ct3_exit().

Signed-off-by: Li Zhijian 
Signed-off-by: Jonathan Cameron 
---
 hw/mem/cxl_type3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index ff6861889b..d8b45f9bd1 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -891,7 +891,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
  &ct3d->cxl_dstate.device_registers);
 
 /* MSI(-X) Initialization */
-rc = msix_init_exclusive_bar(pci_dev, CXL_T3_MSIX_VECTOR_NR, 4, NULL);
+rc = msix_init_exclusive_bar(pci_dev, CXL_T3_MSIX_VECTOR_NR, 4, errp);
 if (rc) {
 goto err_free_special_ops;
 }
@@ -912,7 +912,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 
 pcie_cap_deverr_init(pci_dev);
 /* Leave a bit of room for expansion */
-rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, NULL);
+rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, errp);
 if (rc) {
 goto err_release_cdat;
 }
-- 
2.43.0

[PATCH qemu 5/5] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways

From: Yao Xingtao 

Since the kernel does not check the interleave capability, a
3-way, 6-way, 12-way or 16-way region can be created normally.

Applications can access the memory of a 16-way region normally because
QEMU converts HPA to DPA correctly for power-of-2 interleave
ways. Once the kernel implements the check, this kind of region will
no longer be created.

For non-power-of-2 interleave ways, applications cannot access the
memory normally and may hit unexpected behavior, such as a
segmentation fault.

So implementing this feature is needed.

Link: 
https://lore.kernel.org/linux-cxl/3e84b919-7631-d1db-3e1d-33000f3f3...@fujitsu.com/
Signed-off-by: Yao Xingtao 
Signed-off-by: Jonathan Cameron 
---
 hw/cxl/cxl-component-utils.c |  9 +++--
 hw/mem/cxl_type3.c   | 15 +++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index cd116c0401..473895948b 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -243,8 +243,13 @@ static void hdm_init_common(uint32_t *reg_state, uint32_t 
*write_msk,
 ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_4K, 1);
 ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY,
  POISON_ON_ERR_CAP, 0);
-ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 0);
-ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 0);
+if (type == CXL2_TYPE3_DEVICE) {
+ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 1);
+ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 1);
+} else {
+ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 0);
+ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 0);
+}
 ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, UIO, 0);
 ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY,
  UIO_DECODER_COUNT, 0);
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index d8b45f9bd1..6fffa21ead 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -1100,10 +1100,17 @@ static bool cxl_type3_dpa(CXLType3Dev *ct3d, hwaddr 
host_addr, uint64_t *dpa)
 continue;
 }
 
-*dpa = dpa_base +
-((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
- ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset)
-  >> iw));
+if (iw < 8) {
+*dpa = dpa_base +
+((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
+ ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset)
+  >> iw));
+} else {
+*dpa = dpa_base +
+((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
+ ((((MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset)
+   >> (ig + iw)) / 3) << (ig + 8)));
+}
 
 return true;
 }
-- 
2.43.0

Re: [PATCH V1 02/26] migration: lower handler priority

2025-02-03 Thread Fabiano Rosas

Steve Sistare  writes:

> Define a vmstate priority that is lower than the default, so its handlers
> run after all default priority handlers.  Since 0 is no longer the default
> priority, translate an uninitialized priority of 0 to MIG_PRI_DEFAULT.
>
> CPR for vfio will use this to install handlers for containers that run
> after handlers for the devices that they contain.
>
> Signed-off-by: Steve Sistare 

Reviewed-by: Fabiano Rosas

[PATCH qemu 3/5] hw/mem/cxl_type3: Fix special_ops memory leak on msix_init_exclusive_bar() failure

From: Li Zhijian 

Address a memory leak issue by ensuring `regs->special_ops` is freed when
`msix_init_exclusive_bar()` encounters an error during CXL Type3 device
initialization.

Additionally, this patch renames err_address_space_free to err_msix_uninit
for better clarity and logical flow

Signed-off-by: Li Zhijian 
Signed-off-by: Jonathan Cameron 
---
 hw/mem/cxl_type3.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 4775aab0d6..ff6861889b 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -893,7 +893,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 /* MSI(-X) Initialization */
 rc = msix_init_exclusive_bar(pci_dev, CXL_T3_MSIX_VECTOR_NR, 4, NULL);
 if (rc) {
-goto err_address_space_free;
+goto err_free_special_ops;
 }
 for (i = 0; i < CXL_T3_MSIX_VECTOR_NR; i++) {
 msix_vector_use(pci_dev, i);
@@ -907,7 +907,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 cxl_cstate->cdat.free_cdat_table = ct3_free_cdat_table;
 cxl_cstate->cdat.private = ct3d;
 if (!cxl_doe_cdat_init(cxl_cstate, errp)) {
-goto err_free_special_ops;
+goto err_msix_uninit;
 }
 
 pcie_cap_deverr_init(pci_dev);
@@ -943,10 +943,10 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 
 err_release_cdat:
 cxl_doe_cdat_release(cxl_cstate);
-err_free_special_ops:
+err_msix_uninit:
 msix_uninit_exclusive_bar(pci_dev);
+err_free_special_ops:
 g_free(regs->special_ops);
-err_address_space_free:
 if (ct3d->dc.host_dc) {
 cxl_destroy_dc_regions(ct3d);
 address_space_destroy(&ct3d->dc.host_dc_as);
-- 
2.43.0

[PATCH qemu 2/5] hw/mem/cxl_type3: Add paired msix_uninit_exclusive_bar() call

From: Li Zhijian 

msix_uninit_exclusive_bar() should be paired with msix_init_exclusive_bar()

Ensure proper resource cleanup by adding the missing
`msix_uninit_exclusive_bar()` call for the Type3 CXL device.

Signed-off-by: Li Zhijian 
Signed-off-by: Jonathan Cameron 
---
 hw/mem/cxl_type3.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index ebc0ec536e..4775aab0d6 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -944,6 +944,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 err_release_cdat:
 cxl_doe_cdat_release(cxl_cstate);
 err_free_special_ops:
+msix_uninit_exclusive_bar(pci_dev);
 g_free(regs->special_ops);
 err_address_space_free:
 if (ct3d->dc.host_dc) {
@@ -967,6 +968,7 @@ static void ct3_exit(PCIDevice *pci_dev)
 
 pcie_aer_exit(pci_dev);
 cxl_doe_cdat_release(cxl_cstate);
+msix_uninit_exclusive_bar(pci_dev);
 g_free(regs->special_ops);
 if (ct3d->dc.host_dc) {
 cxl_destroy_dc_regions(ct3d);
-- 
2.43.0

[PATCH qemu 0/5] hw/cxl: Cleanups and interleave support.

First set of CXL updates for the 10.0 cycle.
- Mixture of cleanup and hardening against a repeat of recent MSI-X
  numbering bug.
- Expanded interleave support (been on my tree a long time)

Whilst I think these are in a good state, review always welcome.

Li Zhijian (4):
  hw/cxl: Introduce CXL_T3_MSIX_VECTOR enumeration
  hw/mem/cxl_type3: Add paired msix_uninit_exclusive_bar() call
  hw/mem/cxl_type3: Fix special_ops memory leak on
msix_init_exclusive_bar() failure
  hw/mem/cxl_type3: Ensure errp is set on realization failure

Yao Xingtao (1):
  mem/cxl_type3: support 3, 6, 12 and 16 interleave ways

 include/hw/cxl/cxl_device.h  |  4 ++--
 hw/cxl/cxl-component-utils.c |  9 ++--
 hw/cxl/cxl-device-utils.c| 12 --
 hw/cxl/switch-mailbox-cci.c  |  4 +++-
 hw/mem/cxl_type3.c   | 45 +---
 5 files changed, 48 insertions(+), 26 deletions(-)

-- 
2.43.0

[PATCH qemu 1/5] hw/cxl: Introduce CXL_T3_MSIX_VECTOR enumeration

From: Li Zhijian 

Introduce the `CXL_T3_MSIX_VECTOR` enumeration to specify MSIX vector
assignments specific to the Type 3 (T3) CXL device.

The primary goal of this change is to encapsulate the MSIX vector uses
that are unique to the T3 device within an enumeration, improving code
readability and maintenance by avoiding magic numbers. This organizational
change allows for more explicit references to each vector’s role, thereby
reducing the potential for misconfiguration.

It also modifies `mailbox_reg_init_common` to accept the `msi_n` parameter,
reflecting the new MSIX vector setup.

This pertains to the T3 device privately; other endpoints should refrain from
using it, despite its public accessibility to all of them.

Signed-off-by: Li Zhijian 
Signed-off-by: Jonathan Cameron 
---
 include/hw/cxl/cxl_device.h |  4 ++--
 hw/cxl/cxl-device-utils.c   | 12 +---
 hw/cxl/switch-mailbox-cci.c |  4 +++-
 hw/mem/cxl_type3.c  | 20 ++--
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index 561b375dc8..3a0ee7e8e7 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -264,8 +264,8 @@ void cxl_device_register_block_init(Object *obj, 
CXLDeviceState *dev,
 typedef struct CXLType3Dev CXLType3Dev;
 typedef struct CSWMBCCIDev CSWMBCCIDev;
 /* Set up default values for the register block */
-void cxl_device_register_init_t3(CXLType3Dev *ct3d);
-void cxl_device_register_init_swcci(CSWMBCCIDev *sw);
+void cxl_device_register_init_t3(CXLType3Dev *ct3d, int msi_n);
+void cxl_device_register_init_swcci(CSWMBCCIDev *sw, int msi_n);
 
 /*
  * CXL r3.1 Section 8.2.8.1: CXL Device Capabilities Array Register
diff --git a/hw/cxl/cxl-device-utils.c b/hw/cxl/cxl-device-utils.c
index 035d034f6d..52ad1e4c3f 100644
--- a/hw/cxl/cxl-device-utils.c
+++ b/hw/cxl/cxl-device-utils.c
@@ -352,10 +352,8 @@ static void device_reg_init_common(CXLDeviceState 
*cxl_dstate)
 }
 }
 
-static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate)
+static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate, int msi_n)
 {
-const uint8_t msi_n = 9;
-
 /* 2048 payload size */
 ARRAY_FIELD_DP32(cxl_dstate->mbox_reg_state32, CXL_DEV_MAILBOX_CAP,
  PAYLOAD_SIZE, CXL_MAILBOX_PAYLOAD_SHIFT);
@@ -382,7 +380,7 @@ static void memdev_reg_init_common(CXLDeviceState 
*cxl_dstate)
 cxl_dstate->memdev_status = memdev_status_reg;
 }
 
-void cxl_device_register_init_t3(CXLType3Dev *ct3d)
+void cxl_device_register_init_t3(CXLType3Dev *ct3d, int msi_n)
 {
 CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate;
 uint64_t *cap_h = cxl_dstate->caps_reg_state64;
@@ -398,7 +396,7 @@ void cxl_device_register_init_t3(CXLType3Dev *ct3d)
 device_reg_init_common(cxl_dstate);
 
 cxl_device_cap_init(cxl_dstate, MAILBOX, 2, CXL_DEV_MAILBOX_VERSION);
-mailbox_reg_init_common(cxl_dstate);
+mailbox_reg_init_common(cxl_dstate, msi_n);
 
 cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000,
 CXL_MEM_DEV_STATUS_VERSION);
@@ -408,7 +406,7 @@ void cxl_device_register_init_t3(CXLType3Dev *ct3d)
   CXL_MAILBOX_MAX_PAYLOAD_SIZE);
 }
 
-void cxl_device_register_init_swcci(CSWMBCCIDev *sw)
+void cxl_device_register_init_swcci(CSWMBCCIDev *sw, int msi_n)
 {
 CXLDeviceState *cxl_dstate = &sw->cxl_dstate;
 uint64_t *cap_h = cxl_dstate->caps_reg_state64;
@@ -423,7 +421,7 @@ void cxl_device_register_init_swcci(CSWMBCCIDev *sw)
 device_reg_init_common(cxl_dstate);
 
 cxl_device_cap_init(cxl_dstate, MAILBOX, 2, 1);
-mailbox_reg_init_common(cxl_dstate);
+mailbox_reg_init_common(cxl_dstate, msi_n);
 
 cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000, 1);
 memdev_reg_init_common(cxl_dstate);
diff --git a/hw/cxl/switch-mailbox-cci.c b/hw/cxl/switch-mailbox-cci.c
index 65cdac6cc1..833b824619 100644
--- a/hw/cxl/switch-mailbox-cci.c
+++ b/hw/cxl/switch-mailbox-cci.c
@@ -17,10 +17,12 @@
 #include "hw/qdev-properties.h"
 #include "hw/cxl/cxl.h"
 
+#define CXL_SWCCI_MSIX_MBOX 3
+
 static void cswmbcci_reset(DeviceState *dev)
 {
 CSWMBCCIDev *cswmb = CXL_SWITCH_MAILBOX_CCI(dev);
-cxl_device_register_init_swcci(cswmb);
+cxl_device_register_init_swcci(cswmb, CXL_SWCCI_MSIX_MBOX);
 }
 
 static void cswbcci_realize(PCIDevice *pci_dev, Error **errp)
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 0ae1704a34..ebc0ec536e 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -30,6 +30,14 @@
 #include "hw/cxl/cxl.h"
 #include "hw/pci/msix.h"
 
+/* type3 device private */
+enum CXL_T3_MSIX_VECTOR {
+CXL_T3_MSIX_PCIE_DOE_TABLE_ACCESS = 0,
+CXL_T3_MSIX_EVENT_START = 2,
+CXL_T3_MSIX_MBOX = CXL_T3_MSIX_EVENT_START + CXL_EVENT_TYPE_MAX,
+CXL_T3_MSIX_VECTOR_NR
+};
+
 #define DWORD_BYTE 4
 #define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
 
@@ -843,7 +851,6 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)

Re: [PATCH v2 10/15] block/export: Don't ignore image activation error in blk_exp_add()

On Fri, Jan 31, 2025 at 10:50:46AM +0100, Kevin Wolf wrote:
> Currently, block jobs can't handle inactive images correctly. Incoming
> write requests would run into assertion failures. Make sure that we
> return an error when creating an export can't activate the image.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/export/export.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

[PATCH v1 1/1] aspeed/soc: Support Non-maskable Interrupt for AST2700

2025-02-03 Thread Jamin Lin via

QEMU supports the GICv3 Non-maskable Interrupt; add support for the
Non-maskable Interrupt on AST2700.

Reference:
https://github.com/qemu/qemu/commit/b36a32ead

Signed-off-by: Jamin Lin 
---
 hw/arm/aspeed_ast27x0.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/arm/aspeed_ast27x0.c b/hw/arm/aspeed_ast27x0.c
index 4114e15ddd..361a054d46 100644
--- a/hw/arm/aspeed_ast27x0.c
+++ b/hw/arm/aspeed_ast27x0.c
@@ -470,6 +470,10 @@ static bool aspeed_soc_ast2700_gic_realize(DeviceState 
*dev, Error **errp)
qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ));
 sysbus_connect_irq(gicbusdev, i + 3 * sc->num_cpus,
qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ));
+sysbus_connect_irq(gicbusdev, i + 4 * sc->num_cpus,
+   qdev_get_gpio_in(cpudev, ARM_CPU_NMI));
+sysbus_connect_irq(gicbusdev, i + 5 * sc->num_cpus,
+   qdev_get_gpio_in(cpudev, ARM_CPU_VINMI));
 }
 
 return true;
-- 
2.25.1

Re: [PATCH v2 10/14] configure: Define TARGET_LONG_BITS in configs/targets/*.mak

2025-02-03 Thread Richard Henderson


On 2/3/25 10:30, Philippe Mathieu-Daudé wrote:

On 3/2/25 04:18, Richard Henderson wrote:

Define TARGET_LONG_BITS in each target's configure fragment.
Do this without removing the define in target/*/cpu-param.h
so that errors are caught like so:

In file included from .../src/include/exec/cpu-defs.h:26,
  from ../src/target/hppa/cpu.h:24,
  from ../src/linux-user/qemu.h:4,
  from ../src/linux-user/hppa/cpu_loop.c:21:
../src/target/hppa/cpu-param.h:11: error: "TARGET_LONG_BITS" redefined [-Werror]
    11 | #define TARGET_LONG_BITS  64
   |
In file included from .../src/include/qemu/osdep.h:36,
  from ../src/linux-user/hppa/cpu_loop.c:20:
./hppa-linux-user-config-target.h:32: note: this is the location of the 
previous definition
    32 | #define TARGET_LONG_BITS 32
   |
cc1: all warnings being treated as errors

Signed-off-by: Richard Henderson 
---


Orthogonal to this series, what about the other definitions,
like TARGET_PHYS_ADDR_SPACE_BITS / TARGET_VIRT_ADDR_SPACE_BITS
and possibly TARGET_PAGE_BITS?


We don't need those at configure time, so there's no need to move them.


r~

Re: [PATCH v2 13/15] iotests: Add filter_qtest()

On Fri, Jan 31, 2025 at 10:50:49AM +0100, Kevin Wolf wrote:
> The open-coded form of this filter has been copied into enough tests
> that it's better to move it into iotests.py.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  tests/qemu-iotests/iotests.py | 4 
>  tests/qemu-iotests/041| 4 +---
>  tests/qemu-iotests/165| 4 +---
>  tests/qemu-iotests/tests/copy-before-write| 3 +--
>  tests/qemu-iotests/tests/migrate-bitmaps-test | 7 +++
>  5 files changed, 10 insertions(+), 12 deletions(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

Re: [PATCH v2 14/15] iotests: Add qsd-migrate case

On Fri, Jan 31, 2025 at 10:50:50AM +0100, Kevin Wolf wrote:
> Test that it's possible to migrate a VM that uses an image on shared
> storage through qemu-storage-daemon.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  tests/qemu-iotests/tests/qsd-migrate | 132 +++
>  tests/qemu-iotests/tests/qsd-migrate.out |  51 +
>  2 files changed, 183 insertions(+)
>  create mode 100755 tests/qemu-iotests/tests/qsd-migrate
>  create mode 100644 tests/qemu-iotests/tests/qsd-migrate.out
> 
> diff --git a/tests/qemu-iotests/tests/qsd-migrate 
> b/tests/qemu-iotests/tests/qsd-migrate
> new file mode 100755
> index 00..687bda6f93
> --- /dev/null
> +++ b/tests/qemu-iotests/tests/qsd-migrate
> @@ -0,0 +1,132 @@
> +#!/usr/bin/env python3
> +# group: rw quick

> +
> +with iotests.FilePath('disk.img') as path, \
> + iotests.FilePath('nbd-src.sock', base_dir=iotests.sock_dir) as nbd_src, 
> \
> + iotests.FilePath('nbd-dst.sock', base_dir=iotests.sock_dir) as nbd_dst, 
> \
> + iotests.FilePath('migrate.sock', base_dir=iotests.sock_dir) as 
> mig_sock, \
> + iotests.VM(path_suffix="-src") as vm_src, \
> + iotests.VM(path_suffix="-dst") as vm_dst:
> +

> +
> +iotests.log('\nTest I/O on the source')
> +vm_src.hmp_qemu_io('virtio0/virtio-backend', 'write -P 0x11 0 4k',
> +   use_log=True, qdev=True)
> +vm_src.hmp_qemu_io('virtio0/virtio-backend', 'read -P 0x11 0 4k',
> +   use_log=True, qdev=True)
> +
> +iotests.log('\nStarting migration...')


Is it worth adding a test that qemu_io fails to write on the
destination while it is inactive (to ensure we are properly rejecting
modification of an inactive image)?

> +
> +mig_caps = [
> +{'capability': 'events', 'state': True},
> +{'capability': 'pause-before-switchover', 'state': True},
> +]
> +vm_src.qmp_log('migrate-set-capabilities', capabilities=mig_caps)
> +vm_dst.qmp_log('migrate-set-capabilities', capabilities=mig_caps)
> +vm_src.qmp_log('migrate', uri=f'unix:{mig_sock}',
> +   filters=[iotests.filter_qmp_testfiles])
> +
> +vm_src.event_wait('MIGRATION',
> +  match={'data': {'status': 'pre-switchover'}})
> +
> +iotests.log('\nPre-switchover: Reconfigure QSD instances')
> +
> +iotests.log(qsd_src.qmp('blockdev-set-active', {'active': False}))
> +iotests.log(qsd_dst.qmp('blockdev-set-active', {'active': True}))

Also, should you attempt a read on both src and dst while both sides
are inactive, to prove that reads can take a snapshot in the middle of
the handover?

Overall a nice test.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

Re: [PATCH v2 12/15] nbd/server: Support inactive nodes

On Fri, Jan 31, 2025 at 10:50:48AM +0100, Kevin Wolf wrote:
> In order to support running an NBD export on inactive nodes, we must
> make sure to return errors for any operations that aren't allowed on
> inactive nodes. Reads are the only operation we know we need for
> inactive images, so to err on the side of caution, return errors for
> everything else, even if some operations could possibly be okay.

We may still find a use case for block status on an inactive node
(especially if that helps us take more accurate snapshots, which is
the whole point of wanting to read pre-activation).  But I'm okay if
we defer that to a separate patch only if it actually proves to be
needed.

> 
> Signed-off-by: Kevin Wolf 
> ---
>  nbd/server.c | 17 +
>  1 file changed, 17 insertions(+)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

Re: [PATCH v4 09/33] migration: postcopy_ram_listen_thread() needs to take BQL for some calls


On 3.02.2025 20:58, Peter Xu wrote:

On Mon, Feb 03, 2025 at 02:57:36PM +0100, Maciej S. Szmigiero wrote:

On 2.02.2025 13:45, Dr. David Alan Gilbert wrote:

* Maciej S. Szmigiero (m...@maciej.szmigiero.name) wrote:

On 2.02.2025 03:06, Dr. David Alan Gilbert wrote:

* Maciej S. Szmigiero (m...@maciej.szmigiero.name) wrote:

From: "Maciej S. Szmigiero" 

postcopy_ram_listen_thread() is a free running thread, so it needs to
take BQL around function calls to migration methods requiring BQL.

qemu_loadvm_state_main() needs BQL held since it ultimately calls
"load_state" SaveVMHandlers.

migration_incoming_state_destroy() needs BQL held since it ultimately calls
"load_cleanup" SaveVMHandlers.

Signed-off-by: Maciej S. Szmigiero 
---
migration/savevm.c | 4 
1 file changed, 4 insertions(+)

diff --git a/migration/savevm.c b/migration/savevm.c
index b0b74140daea..0ceea9638cc1 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2013,7 +2013,9 @@ static void *postcopy_ram_listen_thread(void *opaque)
 * in qemu_file, and thus we must be blocking now.
 */
qemu_file_set_blocking(f, true);
+bql_lock();
load_res = qemu_loadvm_state_main(f, mis);
+bql_unlock();


Doesn't that leave that held for a heck of a long time?


Yes, and it effectively broke "postcopy recover" test but I
think the reason for that is qemu_loadvm_state_main() and
its children don't drop BQL while waiting for I/O.

I've described this case in more detail in my reply to Fabiano here:
https://lore.kernel.org/qemu-devel/0a09e627-955e-4f26-8d08-0192ecd25...@maciej.szmigiero.name/


While it might be the cause in this case, my feeling is it's more fundamental
here - it's the whole reason that postcopy has a separate ram listen
thread.  As the destination is running, after it loads its devices
and as it starts up the destination will be still loading RAM
(and other postcopiable devices) potentially for quite a while.
Holding the bql around the ram listen thread means that the
execution of the destination won't be able to take that lock
until the postcopy load has finished; so while that might apparently
complete, it'll lead to the destination stalling until that's finished
which defeats the whole point of postcopy.
That last one probably won't fail a test but it will lead to a long stall
if you give it a nice big guest with lots of RAM that it's rapidly
changing.


Okay, I understand the postcopy case/flow now.
Thanks for explaining it clearly.


I still think that "load_state" SaveVMHandlers need to be called
with BQL held since implementations apparently expect it that way:
for example, I think PCI device configuration restore calls
address space manipulation methods which abort() if called
without BQL held.


However, the only devices that *should* be arriving on the channel
that the postcopy_ram_listen_thread is reading from are those
that are postcopiable (i.e. RAM and hmm block's dirty_bitmap).
Those load handlers are safe to be run while the other devices
are being changed.   Note the *should* - you could add a check
to fail if any other device arrives on that channel.


I think ultimately there should be either an explicit check, or,
as you suggest in the paragraph below, a separate SaveVMHandler
that runs without BQL held.


To me those are bugs happening during postcopy, so those abort()s in
memory.c are indeed for catching these issues too.


Since the current state of just running these SaveVMHandlers
without BQL in this case and hoping that nothing breaks is
clearly sub-optimal.


I have previously even submitted a patch to explicitly document
"load_state" SaveVMHandler as requiring BQL (which was also
included in the previous version of this patch set) and it
received a "Reviewed-by:" tag:
https://lore.kernel.org/qemu-devel/6976f129df610c8207da4e531c8c0475ec204fa4.1730203967.git.maciej.szmigi...@oracle.com/
https://lore.kernel.org/qemu-devel/e1949839932efaa531e2fe63ac13324e5787439c.1731773021.git.maciej.szmigi...@oracle.com/
https://lore.kernel.org/qemu-devel/87o732bti7@suse.de/


It happens!
You could make this safer by having a load_state and a load_state_postcopy
member, and only mark the load_state as requiring the lock.


To not digress too much from the subject of this patch set
(multifd VFIO device state transfer) for now I've just updated the
TODO comment around that qemu_loadvm_state_main(), so hopefully this
discussion won't get forgotten:
https://gitlab.com/maciejsszmigiero/qemu/-/commit/046e3deac5b1dbc406b3e9571f62468bd6743e79


The commit message may still need some touch ups, e.g.:

   postcopy_ram_listen_thread() is a free running thread, so it needs to
   take BQL around function calls to migration methods requiring BQL.


This sentence is still not correct, IMHO. As Dave explained, the ram load
thread is designed to run without BQL at least for the major workloads it
runs.


So what's your proposed wording of this commit then?


I don't worry on src sending s

[PATCH v2 02/12] hw/arm/raspi: Merge model 4B with other models

Apart from altering the device tree blob, the 4B
is just another raspi model.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 114 -
 hw/arm/raspi4b.c   | 136 -
 hw/arm/meson.build |   2 +-
 3 files changed, 114 insertions(+), 138 deletions(-)
 delete mode 100644 hw/arm/raspi4b.c

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 508f90479e2..3fa382d62ce 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -8,6 +8,10 @@
  * Raspberry Pi 3 emulation Copyright (c) 2018 Zoltán Baldaszti
  * Upstream code cleanup (c) 2018 Pekka Enberg
  *
+ * Raspberry Pi 4 emulation Copyright (C) 2022 Ovchinnikov Vitalii
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
@@ -16,20 +20,27 @@
 #include "qemu/units.h"
 #include "qemu/cutils.h"
 #include "qapi/error.h"
+#include "qapi/visitor.h"
 #include "hw/arm/boot.h"
 #include "hw/arm/bcm2836.h"
 #include "hw/arm/bcm2838.h"
 #include "hw/arm/raspi_platform.h"
+#include "hw/display/bcm2835_fb.h"
 #include "hw/registerfields.h"
 #include "qemu/error-report.h"
 #include "hw/boards.h"
 #include "hw/loader.h"
 #include "hw/arm/boot.h"
 #include "qom/object.h"
+#include "system/device_tree.h"
+#include 
 
 #define TYPE_RASPI_MACHINE  MACHINE_TYPE_NAME("raspi-common")
 OBJECT_DECLARE_SIMPLE_TYPE(RaspiMachineState, RASPI_MACHINE)
 
+#define TYPE_RASPI4B_MACHINE MACHINE_TYPE_NAME("raspi4b")
+OBJECT_DECLARE_SIMPLE_TYPE(Raspi4bMachineState, RASPI4B_MACHINE)
+
 #define SMPBOOT_ADDR0x300 /* this should leave enough space for ATAGS */
 #define MVBAR_ADDR  0x400 /* secure vectors */
 #define BOARDSETUP_ADDR (MVBAR_ADDR + 0x20) /* board setup code */
@@ -44,6 +55,11 @@ struct RaspiMachineState {
 BCM283XState soc;
 };
 
+struct Raspi4bMachineState {
+RaspiBaseMachineState parent_obj;
+BCM2838State soc;
+};
+
 /*
  * Board revision codes:
  * www.raspberrypi.org/documentation/hardware/raspberrypi/revision-codes/
@@ -301,6 +317,83 @@ void raspi_base_machine_init(MachineState *machine,
boot_ram_size);
 }
 
+#ifdef TARGET_AARCH64
+/*
+ * Add second memory region if board RAM amount exceeds VC base address
+ * (see https://datasheets.raspberrypi.com/bcm2711/bcm2711-peripherals.pdf
+ * 1.2 Address Map)
+ */
+static int raspi4_add_memory_node(void *fdt, hwaddr mem_base, hwaddr mem_len)
+{
+int ret;
+uint32_t acells, scells;
+char *nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
+
+acells = qemu_fdt_getprop_cell(fdt, "/", "#address-cells",
+   NULL, &error_fatal);
+scells = qemu_fdt_getprop_cell(fdt, "/", "#size-cells",
+   NULL, &error_fatal);
+if (acells == 0 || scells == 0) {
+fprintf(stderr, "dtb file invalid (#address-cells or #size-cells 
0)\n");
+ret = -1;
+} else {
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
+ret = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg",
+   acells, mem_base,
+   scells, mem_len);
+}
+
+g_free(nodename);
+return ret;
+}
+
+static void raspi4_modify_dtb(const struct arm_boot_info *info, void *fdt)
+{
+uint64_t ram_size;
+
+/* Temporarily disable following devices until they are implemented */
+const char *nodes_to_remove[] = {
+"brcm,bcm2711-pcie",
+"brcm,bcm2711-rng200",
+"brcm,bcm2711-thermal",
+"brcm,bcm2711-genet-v5",
+};
+
+for (int i = 0; i < ARRAY_SIZE(nodes_to_remove); i++) {
+const char *dev_str = nodes_to_remove[i];
+
+int offset = fdt_node_offset_by_compatible(fdt, -1, dev_str);
+if (offset >= 0) {
+if (!fdt_nop_node(fdt, offset)) {
+warn_report("bcm2711 dtc: %s has been disabled!", dev_str);
+}
+}
+}
+
+ram_size = board_ram_size(info->board_id);
+
+if (info->ram_size > UPPER_RAM_BASE) {
+raspi4_add_memory_node(fdt, UPPER_RAM_BASE, ram_size - UPPER_RAM_BASE);
+}
+}
+
+static void raspi4b_machine_init(MachineState *machine)
+{
+Raspi4bMachineState *s = RASPI4B_MACHINE(machine);
+RaspiBaseMachineState *s_base = RASPI_BASE_MACHINE(machine);
+RaspiBaseMachineClass *mc = RASPI_BASE_MACHINE_GET_CLASS(machine);
+BCM2838State *soc = &s->soc;
+
+s_base->binfo.modify_dtb = raspi4_modify_dtb;
+s_base->binfo.board_id = mc->board_rev;
+
+object_initialize_child(OBJECT(machine), "soc", soc,
+board_soc_type(mc->board_rev));
+raspi_base_machine_init(machine, BCM283X_BASE(soc));
+}
+#endif /* TARGET_AARCH64 */
+
 void raspi_machine_init(MachineState *machine)
 {
 RaspiMachineState *s = RASPI_MACHINE(machine);
@@

[PATCH v2 00/12] hw/arm/raspi: Allow creating any Raspberry Pi machine

Full rewrite of v1 [1], addressing Zoltan's and Peter's suggestions.

Introduce a generic 'raspi' machine, which takes 'model'
and 'revision' properties, and any memory size. The 'board_rev'
register is filled appropriately.

Before that, merge raspi4b.c into raspi.c (more is planned here
with the MPCore refactor [2]).

Regards,

Phil.

[1] https://lore.kernel.org/qemu-devel/20250201091528.1177-1-phi...@linaro.org/
[2] https://lore.kernel.org/qemu-devel/20231212162935.42910-1-phi...@linaro.org/

Philippe Mathieu-Daudé (12):
  hw/arm/raspi: Access SoC parent object using  BCM283X_BASE() macro
  hw/arm/raspi: Merge model 4B with other models
  hw/arm/raspi: Unify RASPI_MACHINE types
  hw/arm/raspi: Pass board_rev as argument to raspi_base_machine_init()
  hw/arm/raspi: Consider processor id in types[] array
  hw/arm/raspi: Consider network interface for B models
  hw/arm/raspi: Check ramsize is within chipset aperture
  hw/arm/raspi: Introduce generic Raspberry Pi machine
  hw/arm/raspi: Have the generic machine take a 'revision' property
  hw/arm/raspi: List models creatable by the generic 'raspi' machine
  hw/arm/raspi: Deprecate old raspiX machine names
  hw/arm/raspi: Support more models

 docs/about/deprecated.rst   |  13 +
 include/hw/arm/raspi_platform.h |   5 +-
 hw/arm/raspi.c  | 383 ++--
 hw/arm/raspi4b.c| 136 -
 tests/qtest/bcm2835-dma-test.c  |   2 +-
 tests/qtest/bcm2835-i2c-test.c  |   2 +-
 tests/qtest/boot-serial-test.c  |   3 +-
 hw/arm/meson.build  |   2 +-
 tests/functional/test_aarch64_raspi3.py |   5 +-
 tests/functional/test_aarch64_raspi4.py |   4 +-
 tests/functional/test_arm_raspi2.py |   4 +-
 11 files changed, 385 insertions(+), 174 deletions(-)
 delete mode 100644 hw/arm/raspi4b.c

-- 
2.47.1

[PATCH v2 08/12] hw/arm/raspi: Introduce generic Raspberry Pi machine

The generic 'raspi' machine takes a 'model' argument and
creates the machine associated with that model, with the
requested RAM size (defaulting to the minimum of 256MB
if not specified).

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2797
Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/arm/raspi_platform.h |   3 +-
 hw/arm/raspi.c  | 127 
 2 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/include/hw/arm/raspi_platform.h b/include/hw/arm/raspi_platform.h
index defb786153b..14cb91e153c 100644
--- a/include/hw/arm/raspi_platform.h
+++ b/include/hw/arm/raspi_platform.h
@@ -41,7 +41,8 @@ OBJECT_DECLARE_TYPE(RaspiBaseMachineState, 
RaspiBaseMachineClass,
 struct RaspiBaseMachineState {
 /*< private >*/
 MachineState parent_obj;
-/*< public >*/
+
+uint32_t board_rev;
 struct arm_boot_info binfo;
 };
 
diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index d44277001ee..1dc41701efe 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -300,20 +300,6 @@ void raspi_base_machine_init(MachineState *machine,
 BlockBackend *blk;
 BusState *bus;
 DeviceState *carddev;
-uint64_t max_ramsize;
-
-if (machine->ram_size != ram_size) {
-char *size_str = size_to_str(ram_size);
-error_report("Invalid RAM size, should be %s", size_str);
-g_free(size_str);
-exit(1);
-}
-max_ramsize = ramsize_max(board_rev);
-if (ram_size > max_ramsize) {
-g_autofree char *max_ramsize_str = size_to_str(max_ramsize);
-error_report("At most %s of RAM can be used", max_ramsize_str);
- exit(1);
-}
 
 /* FIXME: Remove when we have custom CPU address space support */
 memory_region_add_subregion_overlap(get_system_memory(), 0,
@@ -448,6 +434,115 @@ void raspi_machine_init(MachineState *machine)
 raspi_base_machine_init(machine, BCM283X_BASE(soc), mc->board_rev);
 }
 
+static void raspi_generic_machine_init(MachineState *ms)
+{
+RaspiMachineState *s = RASPI_MACHINE(ms);
+RaspiBaseMachineState *s_base = RASPI_BASE_MACHINE(ms);
+uint32_t board_rev = s_base->board_rev;
+const char *soc_type = board_soc_type(board_rev);
+BCM283XBaseState *bsoc;
+uint64_t ram_size;
+uint64_t max_ramsize;
+
+if (!board_rev) {
+error_report("Missing model");
+exit(1);
+}
+
+ram_size = ROUND_UP(ms->ram_size, 256 * MiB);
+if (ram_size != ms->ram_size) {
+g_autofree char *ram_size_str = size_to_str(ms->ram_size);
+g_autofree char *rounded_size_str = size_to_str(ram_size);
+warn_report("Invalid RAM size %s, rounding to %s",
+ram_size_str, rounded_size_str);
+}
+max_ramsize = ramsize_max(board_rev);
+if (ram_size > max_ramsize) {
+g_autofree char *max_ramsize_str = size_to_str(max_ramsize);
+error_report("At most %s of RAM can be used with BCM%s",
+ max_ramsize_str, soc_type + 3);
+exit(1);
+}
+board_rev = FIELD_DP32(board_rev, REV_CODE, MEMORY_SIZE,
+   ctz64(ms->ram_size) - 28);
+
+ms->ram = g_new(MemoryRegion, 1);
+memory_region_init(ms->ram, OBJECT(ms), "DRAM", ram_size);
+
+if (board_processor_id(board_rev) == PROCESSOR_ID_BCM2838) {
+BCM2838State *soc = &s->soc4;
+bsoc = BCM283X_BASE(soc);
+object_initialize_child(OBJECT(ms), "soc", soc, soc_type);
+} else {
+BCM283XState *soc = &s->soc;
+bsoc = BCM283X_BASE(soc);
+object_initialize_child(OBJECT(ms), "soc", soc, soc_type);
+}
+raspi_base_machine_init(ms, bsoc, board_rev);
+}
+
+static void raspi_update_board_rev(RaspiBaseMachineState *s)
+{
+MachineState *ms = MACHINE(s);
+RaspiProcessorId proc;
+unsigned model_index;
+
+s->board_rev = FIELD_DP32(s->board_rev, REV_CODE, STYLE, 1);
+
+model_index = FIELD_EX32(s->board_rev, REV_CODE, TYPE);
+proc = types[model_index].proc_id;
+s->board_rev = FIELD_DP32(s->board_rev, REV_CODE, PROCESSOR, proc);
+
+ms->smp.max_cpus = soc_property[proc].cores_count;
+}
+
+static void raspi_set_machine_model(Object *obj, const char *value, Error 
**errp)
+{
+for (unsigned i = 0; i < ARRAY_SIZE(types); i++) {
+if (types[i].model && !strcmp(value, types[i].model)) {
+RaspiBaseMachineState *s = RASPI_BASE_MACHINE(obj);
+
+s->board_rev = FIELD_DP32(s->board_rev, REV_CODE, TYPE, i);
+
+return raspi_update_board_rev(s);
+}
+}
+error_setg(errp, "Invalid model");
+}
+
+static char *raspi_get_machine_model(Object *obj, Error **errp)
+{
+RaspiBaseMachineState *s = RASPI_BASE_MACHINE(obj);
+
+return g_strdup(types[FIELD_EX32(s->board_rev, REV_CODE, TYPE)].model);
+}
+
+static void raspi_generic_machine_class_init(ObjectClass *oc, void *data)
+{
+MachineClass *mc = MACHINE_CLASS(oc);
+RaspiBaseMachineClass *rmc = RASPI_BASE_MACHINE_CLASS(oc);
+
+r

[PATCH v2 06/12] hw/arm/raspi: Consider network interface for B models

Raspberry Pi 'B' models have an ethernet chipset (the LAN9512).
Since we don't yet model it, add a /* TODO */ comment.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 1a6a1f8ff22..68332fba027 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -143,6 +143,16 @@ static const char *board_type(uint32_t board_rev)
 return types[bt].model;
 }
 
+static bool is_model_b(uint32_t board_rev)
+{
+return !!strchr(board_type(board_rev), 'B');
+}
+
+static bool has_enet(uint32_t board_rev)
+{
+return is_model_b(board_rev);
+}
+
 static void write_smpboot(ARMCPU *cpu, const struct arm_boot_info *info)
 {
 static const ARMInsnFixup smpboot[] = {
@@ -304,6 +314,10 @@ void raspi_base_machine_init(MachineState *machine,
 machine->kernel_cmdline, &error_abort);
 qdev_realize(DEVICE(soc), NULL, &error_fatal);
 
+if (has_enet(board_rev)) {
+/* TODO: model LAN9512 and wire over USB2 */
+}
+
 /* Create and plug in the SD cards */
 di = drive_get(IF_SD, 0, 0);
 blk = di ? blk_by_legacy_dinfo(di) : NULL;
-- 
2.47.1

[PATCH v2 03/12] hw/arm/raspi: Unify RASPI_MACHINE types

Merge Raspi4bMachineState into RaspiMachineState by
using an unnamed union.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 3fa382d62ce..ef94d57dab5 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -38,9 +38,6 @@
 #define TYPE_RASPI_MACHINE  MACHINE_TYPE_NAME("raspi-common")
 OBJECT_DECLARE_SIMPLE_TYPE(RaspiMachineState, RASPI_MACHINE)
 
-#define TYPE_RASPI4B_MACHINE MACHINE_TYPE_NAME("raspi4b")
-OBJECT_DECLARE_SIMPLE_TYPE(Raspi4bMachineState, RASPI4B_MACHINE)
-
 #define SMPBOOT_ADDR0x300 /* this should leave enough space for ATAGS */
 #define MVBAR_ADDR  0x400 /* secure vectors */
 #define BOARDSETUP_ADDR (MVBAR_ADDR + 0x20) /* board setup code */
@@ -49,15 +46,12 @@ OBJECT_DECLARE_SIMPLE_TYPE(Raspi4bMachineState, 
RASPI4B_MACHINE)
 #define SPINTABLE_ADDR  0xd8 /* Pi 3 bootloader spintable */
 
 struct RaspiMachineState {
-/*< private >*/
 RaspiBaseMachineState parent_obj;
-/*< public >*/
-BCM283XState soc;
-};
 
-struct Raspi4bMachineState {
-RaspiBaseMachineState parent_obj;
-BCM2838State soc;
+union {
+BCM283XState soc;
+BCM2838State soc4;
+};
 };
 
 /*
@@ -380,10 +374,10 @@ static void raspi4_modify_dtb(const struct arm_boot_info 
*info, void *fdt)
 
 static void raspi4b_machine_init(MachineState *machine)
 {
-Raspi4bMachineState *s = RASPI4B_MACHINE(machine);
+RaspiMachineState *s = RASPI_MACHINE(machine);
 RaspiBaseMachineState *s_base = RASPI_BASE_MACHINE(machine);
 RaspiBaseMachineClass *mc = RASPI_BASE_MACHINE_GET_CLASS(machine);
-BCM2838State *soc = &s->soc;
+BCM2838State *soc = &s->soc4;
 
 s_base->binfo.modify_dtb = raspi4_modify_dtb;
 s_base->binfo.board_id = mc->board_rev;
@@ -515,8 +509,7 @@ static const TypeInfo raspi_machine_types[] = {
 .class_init = raspi3b_machine_class_init,
 }, {
 .name   = MACHINE_TYPE_NAME("raspi4"),
-.parent = TYPE_RASPI_BASE_MACHINE,
-.instance_size  = sizeof(Raspi4bMachineState),
+.parent = TYPE_RASPI_MACHINE,
 .class_init = raspi4b_machine_class_init,
 #endif /* TARGET_AARCH64 */
 }, {
-- 
2.47.1

[PATCH v2 05/12] hw/arm/raspi: Consider processor id in types[] array

Expand the current type2model array to include the processor id.

Since the BCM2838 is also commonly referred to as the BCM2711
(within the Linux community), add it as an alias in RaspiProcessorId.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 571b50bef7e..1a6a1f8ff22 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -70,6 +70,7 @@ typedef enum RaspiProcessorId {
 PROCESSOR_ID_BCM2836 = 1,
 PROCESSOR_ID_BCM2837 = 2,
 PROCESSOR_ID_BCM2838 = 3,
+PROCESSOR_ID_BCM2711 = 3,
 } RaspiProcessorId;
 
 static const struct {
@@ -82,6 +83,30 @@ static const struct {
 [PROCESSOR_ID_BCM2838] = {TYPE_BCM2838, BCM283X_NCPUS},
 };
 
+static const struct {
+RaspiProcessorId proc_id;
+const char *model;
+} types[] = {
+{PROCESSOR_ID_BCM2835, "A"},
+{PROCESSOR_ID_BCM2835, "B"},
+{PROCESSOR_ID_BCM2835, "A+"},
+{PROCESSOR_ID_BCM2835, "B+"},
+{PROCESSOR_ID_BCM2836, "2B"},
+{ },
+{PROCESSOR_ID_BCM2835, "CM1"},
+{ },
+{PROCESSOR_ID_BCM2837, "3B"},
+{PROCESSOR_ID_BCM2835, "Zero"},
+{PROCESSOR_ID_BCM2837, "CM3"},
+{ },
+{PROCESSOR_ID_BCM2835, "ZeroW"},
+{PROCESSOR_ID_BCM2837, "3B+"},
+{PROCESSOR_ID_BCM2837, "3A+"},
+{ },
+{PROCESSOR_ID_BCM2837, "CM3+"},
+{PROCESSOR_ID_BCM2711, "4B"},
+};
+
 uint64_t board_ram_size(uint32_t board_rev)
 {
 assert(FIELD_EX32(board_rev, REV_CODE, STYLE)); /* Only new style */
@@ -110,16 +135,12 @@ static int cores_count(uint32_t board_rev)
 
 static const char *board_type(uint32_t board_rev)
 {
-static const char *types[] = {
-"A", "B", "A+", "B+", "2B", "Alpha", "CM1", NULL, "3B", "Zero",
-"CM3", NULL, "Zero W", "3B+", "3A+", NULL, "CM3+", "4B",
-};
 assert(FIELD_EX32(board_rev, REV_CODE, STYLE)); /* Only new style */
 int bt = FIELD_EX32(board_rev, REV_CODE, TYPE);
-if (bt >= ARRAY_SIZE(types) || !types[bt]) {
+if (bt >= ARRAY_SIZE(types) || !types[bt].model) {
 return "Unknown";
 }
-return types[bt];
+return types[bt].model;
 }
 
 static void write_smpboot(ARMCPU *cpu, const struct arm_boot_info *info)
-- 
2.47.1

[PATCH v2 10/12] hw/arm/raspi: List models creatable by the generic 'raspi' machine

All the following models can be created (with different RAM size):

  $ qemu-system-aarch64 -M raspi
  qemu-system-aarch64: Missing model, try -M raspi,model=help
  $ qemu-system-aarch64 -M raspi,model=help
  Available models (processor):
  - A  (BCM2835)
  - B  (BCM2835)
  - A+ (BCM2835)
  - B+ (BCM2835)
  - 2B (BCM2836)
  - CM1(BCM2835)
  - 3B (BCM2837)
  - Zero   (BCM2835)
  - CM3(BCM2837)
  - ZeroW  (BCM2835)
  - 3B+(BCM2837)
  - 3A+(BCM2837)
  - CM3+   (BCM2837)
  - 4B (BCM2838)

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index b184ac3c446..8cae1ff6f93 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -445,7 +445,7 @@ static void raspi_generic_machine_init(MachineState *ms)
 uint64_t max_ramsize;
 
 if (!board_rev) {
-error_report("Missing model");
+error_report("Missing model, try -M raspi,model=help");
 exit(1);
 }
 
@@ -500,8 +500,33 @@ static void raspi_update_board_rev(RaspiBaseMachineState 
*s)
 ms->smp.max_cpus = soc_property[proc].cores_count;
 }
 
+static void raspi_list_machine_models(void)
+{
+printf("Available models (processor):\n");
+
+for (unsigned i = 0; i < ARRAY_SIZE(types); i++) {
+const char *soc_type;
+
+if (!types[i].model) {
+continue;
+}
+
+soc_type = soc_property[types[i].proc_id].type;
+if (!soc_type) {
+continue;
+}
+printf("- %-10s (BCM%s)\n",
+   types[i].model,
+   soc_property[types[i].proc_id].type + 3);
+}
+}
+
 static void raspi_set_machine_model(Object *obj, const char *value, Error 
**errp)
 {
+if (!strcmp(value, "help")) {
+raspi_list_machine_models();
+exit(0);
+}
 for (unsigned i = 0; i < ARRAY_SIZE(types); i++) {
 if (types[i].model && !strcmp(value, types[i].model)) {
 RaspiBaseMachineState *s = RASPI_BASE_MACHINE(obj);
@@ -512,6 +537,7 @@ static void raspi_set_machine_model(Object *obj, const char 
*value, Error **errp
 }
 }
 error_setg(errp, "Invalid model");
+error_append_hint(errp, "Use model=help to list models.\n");
 }
 
 static char *raspi_get_machine_model(Object *obj, Error **errp)
-- 
2.47.1

[PATCH v2 09/12] hw/arm/raspi: Have the generic machine take a 'revision' property

Add a property to specify the board revision. This allows
creating a Raspberry Pi 2B with the BCM2836 SoC (rev 1.0 and 1.1)
or the BCM2837 SoC (rev 1.2 up to 1.5).

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 1dc41701efe..b184ac3c446 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -491,6 +491,10 @@ static void raspi_update_board_rev(RaspiBaseMachineState 
*s)
 
 model_index = FIELD_EX32(s->board_rev, REV_CODE, TYPE);
 proc = types[model_index].proc_id;
+if (model_index == 4 && FIELD_EX32(s->board_rev, REV_CODE, REVISION) > 1) {
+/* 2B rev 1.0 and 1.1 have BCM2836, 1.2+ have BCM2837 */
+proc = PROCESSOR_ID_BCM2837;
+}
 s->board_rev = FIELD_DP32(s->board_rev, REV_CODE, PROCESSOR, proc);
 
 ms->smp.max_cpus = soc_property[proc].cores_count;
@@ -517,6 +521,35 @@ static char *raspi_get_machine_model(Object *obj, Error 
**errp)
 return g_strdup(types[FIELD_EX32(s->board_rev, REV_CODE, TYPE)].model);
 }
 
+static void raspi_set_machine_rev(Object *obj, const char *value, Error **errp)
+{
+RaspiBaseMachineState *s;
+int rev;
+
+if (strlen(value) != 3 || value[0] != '1' || value[1] != '.') {
+error_setg(errp, "Invalid revision");
+return;
+}
+rev = value[2] - '0';
+if (rev < 0 || rev > 5) {
+error_setg(errp, "Invalid revision");
+return;
+}
+
+s = RASPI_BASE_MACHINE(obj);
+s->board_rev = FIELD_DP32(s->board_rev, REV_CODE, REVISION, rev);
+
+return raspi_update_board_rev(s);
+}
+
+static char *raspi_get_machine_rev(Object *obj, Error **errp)
+{
+RaspiBaseMachineState *s = RASPI_BASE_MACHINE(obj);
+
+return g_strdup_printf("1.%u",
+   FIELD_EX32(s->board_rev, REV_CODE, REVISION));
+}
+
 static void raspi_generic_machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
@@ -540,6 +573,12 @@ static void raspi_generic_machine_class_init(ObjectClass 
*oc, void *data)
   raspi_get_machine_model,
   raspi_set_machine_model);
 object_class_property_set_description(oc, "model", "Set machine model.");
+object_class_property_add_str(oc, "revision",
+  raspi_get_machine_rev,
+  raspi_set_machine_rev);
+object_class_property_set_description(oc, "revision",
+  "Set machine revision. "
+  "Valid values are 1.0 to 1.5");
 };
 
 
-- 
2.47.1

[PATCH v2 01/12] hw/arm/raspi: Access SoC parent object using BCM283X_BASE() macro

We shouldn't access a QOM parent object directly.
Use the appropriate type-cast macro.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c   | 2 +-
 hw/arm/raspi4b.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index a7a662f40db..508f90479e2 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -312,7 +312,7 @@ void raspi_machine_init(MachineState *machine)
 
 object_initialize_child(OBJECT(machine), "soc", soc,
 board_soc_type(mc->board_rev));
-raspi_base_machine_init(machine, &soc->parent_obj);
+raspi_base_machine_init(machine, BCM283X_BASE(soc));
 }
 
 void raspi_machine_class_common_init(MachineClass *mc,
diff --git a/hw/arm/raspi4b.c b/hw/arm/raspi4b.c
index 1264e0d6eed..9b08a598f39 100644
--- a/hw/arm/raspi4b.c
+++ b/hw/arm/raspi4b.c
@@ -104,7 +104,7 @@ static void raspi4b_machine_init(MachineState *machine)
 object_initialize_child(OBJECT(machine), "soc", soc,
 board_soc_type(mc->board_rev));
 
-raspi_base_machine_init(machine, &soc->parent_obj);
+raspi_base_machine_init(machine, BCM283X_BASE(soc));
 }
 
 static void raspi4b_machine_class_init(ObjectClass *oc, void *data)
-- 
2.47.1

[PATCH v2 07/12] hw/arm/raspi: Check ramsize is within chipset aperture

Add the 'max_ramsize' field to the soc_property[] array,
corresponding to the maximum DRAM size a SoC can map.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 68332fba027..d44277001ee 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -76,11 +76,12 @@ typedef enum RaspiProcessorId {
 static const struct {
 const char *type;
 int cores_count;
+uint64_t max_ramsize;
 } soc_property[] = {
-[PROCESSOR_ID_BCM2835] = {TYPE_BCM2835, 1},
-[PROCESSOR_ID_BCM2836] = {TYPE_BCM2836, BCM283X_NCPUS},
-[PROCESSOR_ID_BCM2837] = {TYPE_BCM2837, BCM283X_NCPUS},
-[PROCESSOR_ID_BCM2838] = {TYPE_BCM2838, BCM283X_NCPUS},
+[PROCESSOR_ID_BCM2835] = {TYPE_BCM2835, 1,  512 * MiB},
+[PROCESSOR_ID_BCM2836] = {TYPE_BCM2836, BCM283X_NCPUS,  1 * GiB},
+[PROCESSOR_ID_BCM2837] = {TYPE_BCM2837, BCM283X_NCPUS,  1 * GiB},
+[PROCESSOR_ID_BCM2838] = {TYPE_BCM2838, BCM283X_NCPUS,  8 * GiB},
 };
 
 static const struct {
@@ -133,6 +134,11 @@ static int cores_count(uint32_t board_rev)
 return soc_property[board_processor_id(board_rev)].cores_count;
 }
 
+static uint64_t ramsize_max(uint32_t board_rev)
+{
+return soc_property[board_processor_id(board_rev)].max_ramsize;
+}
+
 static const char *board_type(uint32_t board_rev)
 {
 assert(FIELD_EX32(board_rev, REV_CODE, STYLE)); /* Only new style */
@@ -294,6 +300,7 @@ void raspi_base_machine_init(MachineState *machine,
 BlockBackend *blk;
 BusState *bus;
 DeviceState *carddev;
+uint64_t max_ramsize;
 
 if (machine->ram_size != ram_size) {
 char *size_str = size_to_str(ram_size);
@@ -301,6 +308,12 @@ void raspi_base_machine_init(MachineState *machine,
 g_free(size_str);
 exit(1);
 }
+max_ramsize = ramsize_max(board_rev);
+if (ram_size > max_ramsize) {
+g_autofree char *max_ramsize_str = size_to_str(max_ramsize);
+error_report("At most %s of RAM can be used", max_ramsize_str);
+ exit(1);
+}
 
 /* FIXME: Remove when we have custom CPU address space support */
 memory_region_add_subregion_overlap(get_system_memory(), 0,
-- 
2.47.1

[PATCH v2 04/12] hw/arm/raspi: Pass board_rev as argument to raspi_base_machine_init()

Since callers already have a reference to the RaspiBaseMachineClass,
directly pass 'board_rev' as argument to raspi_base_machine_init().

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/arm/raspi_platform.h | 2 +-
 hw/arm/raspi.c  | 8 +++-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/hw/arm/raspi_platform.h b/include/hw/arm/raspi_platform.h
index 7bc4807fa51..defb786153b 100644
--- a/include/hw/arm/raspi_platform.h
+++ b/include/hw/arm/raspi_platform.h
@@ -58,7 +58,7 @@ void raspi_machine_init(MachineState *machine);
 
 typedef struct BCM283XBaseState BCM283XBaseState;
 void raspi_base_machine_init(MachineState *machine,
- BCM283XBaseState *soc);
+ BCM283XBaseState *soc, const uint32_t board_rev);
 
 void raspi_machine_class_common_init(MachineClass *mc,
  uint32_t board_rev);
diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index ef94d57dab5..571b50bef7e 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -254,10 +254,8 @@ static void setup_boot(MachineState *machine, ARMCPU *cpu,
 }
 
 void raspi_base_machine_init(MachineState *machine,
- BCM283XBaseState *soc)
+ BCM283XBaseState *soc, const uint32_t board_rev)
 {
-RaspiBaseMachineClass *mc = RASPI_BASE_MACHINE_GET_CLASS(machine);
-uint32_t board_rev = mc->board_rev;
 uint64_t ram_size = board_ram_size(board_rev);
 uint32_t vcram_base, vcram_size;
 size_t boot_ram_size;
@@ -384,7 +382,7 @@ static void raspi4b_machine_init(MachineState *machine)
 
 object_initialize_child(OBJECT(machine), "soc", soc,
 board_soc_type(mc->board_rev));
-raspi_base_machine_init(machine, BCM283X_BASE(soc));
+raspi_base_machine_init(machine, BCM283X_BASE(soc), mc->board_rev);
 }
 #endif /* TARGET_AARCH64 */
 
@@ -399,7 +397,7 @@ void raspi_machine_init(MachineState *machine)
 
 object_initialize_child(OBJECT(machine), "soc", soc,
 board_soc_type(mc->board_rev));
-raspi_base_machine_init(machine, BCM283X_BASE(soc));
+raspi_base_machine_init(machine, BCM283X_BASE(soc), mc->board_rev);
 }
 
 void raspi_machine_class_common_init(MachineClass *mc,
-- 
2.47.1

[PATCH v2 12/12] hw/arm/raspi: Support more models

Allow creating the following machines:

  - Zero2W
  - 400
  - CM4 and CM4S

Fill the arrays with the BCM2712-based machines (raspi5),
but since we don't model the SoC, these machines can't
be created (and aren't listed in the 'help' output).

List taken from:
https://github.com/raspberrypi/documentation/blob/9b126446a5/documentation/asciidoc/computers/raspberry-pi/revision-codes.adoc

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/arm/raspi.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 86ecc988e06..2346550eec5 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -71,6 +71,7 @@ typedef enum RaspiProcessorId {
 PROCESSOR_ID_BCM2837 = 2,
 PROCESSOR_ID_BCM2838 = 3,
 PROCESSOR_ID_BCM2711 = 3,
+PROCESSOR_ID_BCM2712 = 4,
 } RaspiProcessorId;
 
 static const struct {
@@ -82,6 +83,7 @@ static const struct {
 [PROCESSOR_ID_BCM2836] = {TYPE_BCM2836, BCM283X_NCPUS,  1 * GiB},
 [PROCESSOR_ID_BCM2837] = {TYPE_BCM2837, BCM283X_NCPUS,  1 * GiB},
 [PROCESSOR_ID_BCM2838] = {TYPE_BCM2838, BCM283X_NCPUS,  8 * GiB},
+[PROCESSOR_ID_BCM2712] = {NULL, BCM283X_NCPUS,  16 * GiB},
 };
 
 static const struct {
@@ -106,6 +108,17 @@ static const struct {
 { },
 {PROCESSOR_ID_BCM2837, "CM3+"},
 {PROCESSOR_ID_BCM2711, "4B"},
+{PROCESSOR_ID_BCM2837, "Zero2W"},
+{PROCESSOR_ID_BCM2711, "400"},
+
+{PROCESSOR_ID_BCM2711, "CM4"},
+{PROCESSOR_ID_BCM2711, "CM4S"},
+{ },
+{PROCESSOR_ID_BCM2712, "5"},
+{PROCESSOR_ID_BCM2712, "CM5"},
+{PROCESSOR_ID_BCM2712, "500"},
+{PROCESSOR_ID_BCM2712, "CM5lite"},
+{ },
 };
 
 uint64_t board_ram_size(uint32_t board_rev)
-- 
2.47.1

[PATCH v2 11/12] hw/arm/raspi: Deprecate old raspiX machine names

All previous raspi machines can be created using the
generic machine. Deprecate the old names to maintain
a single one. Update the tests.

Signed-off-by: Philippe Mathieu-Daudé 
---
QOM HMP introspection test fails because without the 'model'
argument set, no machine is created...

  $ qemu-system-aarch64 -M raspi
  qemu-system-aarch64: Missing model, try -M raspi,model=help
---
 docs/about/deprecated.rst   | 13 +
 hw/arm/raspi.c  |  5 +
 tests/qtest/bcm2835-dma-test.c  |  2 +-
 tests/qtest/bcm2835-i2c-test.c  |  2 +-
 tests/qtest/boot-serial-test.c  |  3 ++-
 tests/functional/test_aarch64_raspi3.py |  5 ++---
 tests/functional/test_aarch64_raspi4.py |  4 ++--
 tests/functional/test_arm_raspi2.py |  4 ++--
 8 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 4a3c302962a..c9a11a52f78 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -257,6 +257,19 @@ Big-Endian variants of MicroBlaze ``petalogix-ml605`` and 
``xlnx-zynqmp-pmu`` ma
 Both ``petalogix-ml605`` and ``xlnx-zynqmp-pmu`` were added for little endian
 CPUs. Big endian support is not tested.
 
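+ARM ``raspi0``, ``raspi1ap``, ``raspi2b``, ``raspi3ap``, ``raspi3b`` and ``raspi4b`` machines (since 10.0)
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''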
+
+The Raspberry Pi machines have been unified under the generic ``raspi`` 
machine,
+which takes the model as argument.
+
+- ``raspi0`` is now an alias for ``raspi,model=Zero``
+- ``raspi1ap`` is now an alias for ``raspi,model=1A+``
+- ``raspi2b`` is now an alias for ``raspi,model=2B``
+- ``raspi3ap`` is now an alias for ``raspi,model=3A+``
+- ``raspi3b`` is now an alias for ``raspi,model=3B``
+- ``raspi4b`` is now an alias for ``raspi,model=4B``
+
 Backend options
 ---------------
 
diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 8cae1ff6f93..86ecc988e06 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -637,6 +637,7 @@ static void raspi0_machine_class_init(ObjectClass *oc, void 
*data)
 
 rmc->board_rev = 0x920092; /* Revision 1.2 */
 raspi_machine_class_init(mc, rmc->board_rev);
+mc->deprecation_reason = "-M raspi,model=Zero";
 };
 
 static void raspi1ap_machine_class_init(ObjectClass *oc, void *data)
@@ -646,6 +647,7 @@ static void raspi1ap_machine_class_init(ObjectClass *oc, 
void *data)
 
 rmc->board_rev = 0x900021; /* Revision 1.1 */
 raspi_machine_class_init(mc, rmc->board_rev);
+mc->deprecation_reason = "-M raspi,model=A+ (-m 512m)";
 };
 
 static void raspi2b_machine_class_init(ObjectClass *oc, void *data)
@@ -655,6 +657,7 @@ static void raspi2b_machine_class_init(ObjectClass *oc, 
void *data)
 
 rmc->board_rev = 0xa21041;
 raspi_machine_class_init(mc, rmc->board_rev);
+mc->deprecation_reason = "-M raspi,model=2B -m 1g";
 };
 
 #ifdef TARGET_AARCH64
@@ -665,6 +668,7 @@ static void raspi3ap_machine_class_init(ObjectClass *oc, 
void *data)
 
 rmc->board_rev = 0x9020e0; /* Revision 1.0 */
 raspi_machine_class_init(mc, rmc->board_rev);
+mc->deprecation_reason = "-M raspi,model=3A+ -m 512m";
 };
 
 static void raspi3b_machine_class_init(ObjectClass *oc, void *data)
@@ -674,6 +678,7 @@ static void raspi3b_machine_class_init(ObjectClass *oc, 
void *data)
 
 rmc->board_rev = 0xa02082;
 raspi_machine_class_init(mc, rmc->board_rev);
+mc->deprecation_reason = "-M raspi,model=3B -m 1g";
 };
 
 static void raspi4b_machine_class_init(ObjectClass *oc, void *data)
diff --git a/tests/qtest/bcm2835-dma-test.c b/tests/qtest/bcm2835-dma-test.c
index 18901b76d21..705e6b2362b 100644
--- a/tests/qtest/bcm2835-dma-test.c
+++ b/tests/qtest/bcm2835-dma-test.c
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 g_test_init(&argc, &argv, NULL);
 qtest_add_func("/bcm2835/dma/test_interrupts",
bcm2835_dma_test_interrupts);
-qtest_start("-machine raspi3b");
+qtest_start("-machine raspi,model=3B -m 1g");
 ret = g_test_run();
 qtest_end();
 return ret;
diff --git a/tests/qtest/bcm2835-i2c-test.c b/tests/qtest/bcm2835-i2c-test.c
index 15991949260..15904abf393 100644
--- a/tests/qtest/bcm2835-i2c-test.c
+++ b/tests/qtest/bcm2835-i2c-test.c
@@ -104,7 +104,7 @@ int main(int argc, char **argv)
 }
 
 /* Run I2C tests with TMP105 slaves on all three buses */
-qtest_start("-M raspi3b "
+qtest_start("-M raspi,model=3B -m 1g "
 "-device tmp105,address=0x50,bus=i2c-bus.0 "
 "-device tmp105,address=0x50,bus=i2c-bus.1 "
 "-device tmp105,address=0x50,bus=i2c-bus.2");
diff --git a/tests/qtest/boot-serial-test.c b/tests/qtest/boot-serial-test.c
index a05d26ee996..fbafd73facb 100644
--- a/tests/qtest/boot-serial-test.c
+++ b/tests/qtest/boot-serial-test.c
@@ -188,7 +188,8 @@ static const testdef_t tests[] = {
   size

Re: [RFC PATCH v12 qemu 2/2] qtest/cxl: Add aarch64 virt test for CXL

2025-02-03 Thread Itaru Kitayama

Jonathan,

> On Feb 4, 2025, at 2:30, Jonathan Cameron  wrote:
> 
> Add a single complex case for aarch64 virt machine.
> Given existing much more comprehensive tests for x86 cover the
> common functionality, a single test should be enough to verify
> that the aarch64 part continues to work.
> 
> Signed-off-by: Jonathan Cameron 
> ---
> tests/qtest/cxl-test.c  | 59 -
> tests/qtest/meson.build |  1 +
> 2 files changed, 47 insertions(+), 13 deletions(-)
> 
> diff --git a/tests/qtest/cxl-test.c b/tests/qtest/cxl-test.c
> index a600331843..c7189d6222 100644
> --- a/tests/qtest/cxl-test.c
> +++ b/tests/qtest/cxl-test.c
> @@ -19,6 +19,12 @@
> "-device pxb-cxl,id=cxl.1,bus=pcie.0,bus_nr=53 " \
> "-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=4G 
> "
> 
> +#define QEMU_VIRT_2PXB_CMD \
> +"-machine virt,cxl=on -cpu max " \
> +"-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 " \
> +"-device pxb-cxl,id=cxl.1,bus=pcie.0,bus_nr=53 " \
> +"-M 
> cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=4G "
> +
> #define QEMU_RP \
> "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 "
> 
> @@ -197,25 +203,52 @@ static void cxl_2pxb_4rp_4t3d(void)
> qtest_end();
> rmdir(tmpfs);
> }
> +
> +static void cxl_virt_2pxb_4rp_4t3d(void)
> +{
> +g_autoptr(GString) cmdline = g_string_new(NULL);
> +char template[] = "/tmp/cxl-test-XX";
> +const char *tmpfs;
> +
> +tmpfs = mkdtemp(template);
> +
> +g_string_printf(cmdline, QEMU_VIRT_2PXB_CMD QEMU_4RP QEMU_4T3D,
> +tmpfs, tmpfs, tmpfs, tmpfs, tmpfs, tmpfs,
> +tmpfs, tmpfs);
> +
> +qtest_start(cmdline->str);
> +qtest_end();
> +rmdir(tmpfs);
> +}
> #endif /* CONFIG_POSIX */
> 
> int main(int argc, char **argv)
> {
> -g_test_init(&argc, &argv, NULL);
> +const char *arch = qtest_get_arch();
> 
> -qtest_add_func("/pci/cxl/basic_hostbridge", cxl_basic_hb);
> -qtest_add_func("/pci/cxl/basic_pxb", cxl_basic_pxb);
> -qtest_add_func("/pci/cxl/pxb_with_window", cxl_pxb_with_window);
> -qtest_add_func("/pci/cxl/pxb_x2_with_window", cxl_2pxb_with_window);
> -qtest_add_func("/pci/cxl/rp", cxl_root_port);
> -qtest_add_func("/pci/cxl/rp_x2", cxl_2root_port);
> +g_test_init(&argc, &argv, NULL);
> +if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
> +qtest_add_func("/pci/cxl/basic_hostbridge", cxl_basic_hb);
> +qtest_add_func("/pci/cxl/basic_pxb", cxl_basic_pxb);
> +qtest_add_func("/pci/cxl/pxb_with_window", cxl_pxb_with_window);
> +qtest_add_func("/pci/cxl/pxb_x2_with_window", cxl_2pxb_with_window);
> +qtest_add_func("/pci/cxl/rp", cxl_root_port);
> +qtest_add_func("/pci/cxl/rp_x2", cxl_2root_port);
> #ifdef CONFIG_POSIX
> -qtest_add_func("/pci/cxl/type3_device", cxl_t3d_deprecated);
> -qtest_add_func("/pci/cxl/type3_device_pmem", cxl_t3d_persistent);
> -qtest_add_func("/pci/cxl/type3_device_vmem", cxl_t3d_volatile);
> -qtest_add_func("/pci/cxl/type3_device_vmem_lsa", cxl_t3d_volatile_lsa);
> -qtest_add_func("/pci/cxl/rp_x2_type3_x2", cxl_1pxb_2rp_2t3d);
> -qtest_add_func("/pci/cxl/pxb_x2_root_port_x4_type3_x4", 
> cxl_2pxb_4rp_4t3d);
> +qtest_add_func("/pci/cxl/type3_device", cxl_t3d_deprecated);
> +qtest_add_func("/pci/cxl/type3_device_pmem", cxl_t3d_persistent);
> +qtest_add_func("/pci/cxl/type3_device_vmem", cxl_t3d_volatile);
> +qtest_add_func("/pci/cxl/type3_device_vmem_lsa", 
> cxl_t3d_volatile_lsa);
> +qtest_add_func("/pci/cxl/rp_x2_type3_x2", cxl_1pxb_2rp_2t3d);
> +qtest_add_func("/pci/cxl/pxb_x2_root_port_x4_type3_x4",
> +   cxl_2pxb_4rp_4t3d);
> #endif
> +} else if (strcmp(arch, "aarch64") == 0) {
> +#ifdef CONFIG_POSIX
> +qtest_add_func("/pci/cxl/virt/pxb_x2_root_port_x4_type3_x4",
> +   cxl_virt_2pxb_4rp_4t3d);
> +#endif
> +}
> +
> return g_test_run();
> }
> diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
> index e60e92fe9d..f5e7fb060e 100644
> --- a/tests/qtest/meson.build
> +++ b/tests/qtest/meson.build
> @@ -257,6 +257,7 @@ qtests_aarch64 = \
>   (config_all_accel.has_key('CONFIG_TCG') and 
>\
>config_all_devices.has_key('CONFIG_TPM_TIS_I2C') ? ['tpm-tis-i2c-test'] : 
> []) + \
>   (config_all_devices.has_key('CONFIG_ASPEED_SOC') ? qtests_aspeed64 : []) + \
> +  qtests_cxl +   
>\
>   ['arm-cpu-features',
>'numa-test',
>'boot-serial-test',
> -- 
> 2.43.0
> 

On Ubuntu 22.04 LTS, cxl-test applied on top of today’s QEMU upstream master
branch fails:

$ ./tests/qtest/cxl-test
# random seed: R02S2a8b02df7b32b79d086ce22f7f8ebeab
1..1
# Start of aarch64 tests
# Start of pci tests
# Start of cxl tests
# Start of virt tests
# s

Re: [PATCH v3 06/26] target/arm/kvm-rme: Initialize vCPU

2025-02-03 Thread Gavin Shan


On 11/26/24 5:56 AM, Jean-Philippe Brucker wrote:

The target code calls kvm_arm_vcpu_init() to mark the vCPU as part of a
Realm. For a Realm vCPU, only x0-x7 can be set at runtime. Before boot,
the PC can also be set, and is ignored at runtime. KVM also accepts a
few system register changes during initial configuration, as returned by
KVM_GET_REG_LIST.

Signed-off-by: Jean-Philippe Brucker 
---
  target/arm/cpu.h |  3 +++
  target/arm/kvm_arm.h | 15 +++
  target/arm/kvm-rme.c | 10 
  target/arm/kvm.c | 61 
  4 files changed, 89 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index d86e641280..f617591921 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -961,6 +961,9 @@ struct ArchCPU {
  OnOffAuto kvm_steal_time;
  #endif /* CONFIG_KVM */
  
+/* Realm Management Extension */

+bool kvm_rme;
+
  /* Uniprocessor system with MP extensions */
  bool mp_is_up;
  
diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h

index 9d6a89f9b1..8b52a881b0 100644
--- a/target/arm/kvm_arm.h
+++ b/target/arm/kvm_arm.h
@@ -245,6 +245,16 @@ int kvm_arm_rme_init(MachineState *ms);
   */
  int kvm_arm_rme_vm_type(MachineState *ms);
  
+/**

+ * kvm_arm_rme_vcpu_init
+ * @cs: the CPU
+ *
+ * If the user requested a Realm, setup the given vCPU accordingly. Realm vCPUs
+ * behave a little differently, for example most of their register state is
+ * hidden from the host.
+ */
+int kvm_arm_rme_vcpu_init(CPUState *cs);
+
  #else
  
  /*

@@ -339,6 +349,11 @@ static inline int kvm_arm_rme_vm_type(MachineState *ms)
  g_assert_not_reached();
  }
  
+static inline int kvm_arm_rme_vcpu_init(CPUState *cs)

+{
+g_assert_not_reached();
+}
+
  #endif
  
  #endif

diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c
index 60d967a842..e3cc37538a 100644
--- a/target/arm/kvm-rme.c
+++ b/target/arm/kvm-rme.c
@@ -137,6 +137,16 @@ int kvm_arm_rme_init(MachineState *ms)
  return 0;
  }
  
+int kvm_arm_rme_vcpu_init(CPUState *cs)

+{
+ARMCPU *cpu = ARM_CPU(cs);
+
+if (rme_guest) {
+cpu->kvm_rme = true;
+}
+return 0;
+}
+
  int kvm_arm_rme_vm_type(MachineState *ms)
  {
  if (rme_guest) {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 0c80992f7c..a0de2efc41 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -1926,6 +1926,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
  return ret;
  }
  
+ret = kvm_arm_rme_vcpu_init(cs);

+if (ret) {
+return ret;
+}
+
  if (cpu_isar_feature(aa64_sve, cpu)) {
  ret = kvm_arm_sve_set_vls(cpu);
  if (ret) {
@@ -2062,6 +2067,35 @@ static int kvm_arch_put_sve(CPUState *cs)
  return 0;
  }
  
+static int kvm_arm_rme_put_core_regs(CPUState *cs, Error **errp)

+{
+int i, ret;
+struct kvm_one_reg reg;
+ARMCPU *cpu = ARM_CPU(cs);
+CPUARMState *env = &cpu->env;
+
+/*
+ * The RME ABI only allows us to set 8 GPRs and the PC
+ */


This comment doesn't need to span multiple lines.


+for (i = 0; i < 8; i++) {
+reg.id = AARCH64_CORE_REG(regs.regs[i]);
+reg.addr = (uintptr_t) &env->xregs[i];
+ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+if (ret) {
+return ret;
+}
+}
+
+reg.id = AARCH64_CORE_REG(regs.pc);
+reg.addr = (uintptr_t) &env->pc;
+ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+if (ret) {
+return ret;
+}
+
+return 0;
+}
+


Nice place to use kvm_set_one_reg(). With it, @reg can be dropped.


  static int kvm_arm_put_core_regs(CPUState *cs, int level, Error **errp)
  {
  uint64_t val;
@@ -2072,6 +2106,10 @@ static int kvm_arm_put_core_regs(CPUState *cs, int 
level, Error **errp)
  ARMCPU *cpu = ARM_CPU(cs);
  CPUARMState *env = &cpu->env;
  
+if (cpu->kvm_rme) {

+return kvm_arm_rme_put_core_regs(cs, errp);
+}
+
  /* If we are in AArch32 mode then we need to copy the AArch32 regs to the
   * AArch64 registers before pushing them out to 64-bit KVM.
   */
@@ -2259,6 +2297,25 @@ static int kvm_arch_get_sve(CPUState *cs)
  return 0;
  }
  
+static int kvm_arm_rme_get_core_regs(CPUState *cs, Error **errp)

+{
+int i, ret;
+struct kvm_one_reg reg;
+ARMCPU *cpu = ARM_CPU(cs);
+CPUARMState *env = &cpu->env;
+
+for (i = 0; i < 8; i++) {
+reg.id = AARCH64_CORE_REG(regs.regs[i]);
+reg.addr = (uintptr_t) &env->xregs[i];
+ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+if (ret) {
+return ret;
+}
+}
+
+return 0;
+}
+


Similarly, kvm_get_one_reg() can be used.


  static int kvm_arm_get_core_regs(CPUState *cs, Error **errp)
  {
  uint64_t val;
@@ -2269,6 +2326,10 @@ static int kvm_arm_get_core_regs(CPUState *cs, Error 
**errp)
  ARMCPU *cpu = ARM_CPU(cs);
  CPUARMState *env = &cpu->env;
  
+if (cpu->kvm_rme) {

+return kvm_arm_rme_get_core_regs(cs, errp);
+}
+
  fo

Re: [PATCH v2 11/15] block/export: Add option to allow export of inactive nodes

2025-02-03 Thread Stefan Hajnoczi

On Fri, Jan 31, 2025 at 10:50:47AM +0100, Kevin Wolf wrote:
> Add an option in BlockExportOptions to allow creating an export on an
> inactive node without activating the node. This mode needs to be
> explicitly supported by the export type (so that it doesn't perform any
> operations that are forbidden for inactive nodes), so this patch alone
> doesn't allow this option to be successfully used yet.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  qapi/block-export.json | 10 +-
>  include/block/block-global-state.h |  3 +++
>  include/block/export.h |  3 +++
>  block.c|  4 
>  block/export/export.c  | 31 --
>  5 files changed, 40 insertions(+), 11 deletions(-)
> 
> diff --git a/qapi/block-export.json b/qapi/block-export.json
> index ce33fe378d..117b05d13c 100644
> --- a/qapi/block-export.json
> +++ b/qapi/block-export.json
> @@ -372,6 +372,13 @@
>  # cannot be moved to the iothread.  The default is false.
>  # (since: 5.2)
>  #
> +# @allow-inactive: If true, the export allows the exported node to be 
> inactive.
> +# If it is created for an inactive block node, the node remains 
> inactive. If
> +# the export type doesn't support running on an inactive node, an error 
> is
> +# returned. If false, inactive block nodes are automatically activated 
> before
> +# creating the export and trying to inactivate them later fails.
> +# (since: 10.0; default: false)

Exposing activation in the API is ugly but I don't see a cleaner option
given that we cannot change block-export-add's existing behavior of
activating the node by default. :(

Ideally block-export-add would not modify the active/inactive state and would
leave it up to the user to provide a node in the desired state.

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature

Re: [PATCH v4 09/33] migration: postcopy_ram_listen_thread() needs to take BQL for some calls

On Mon, Feb 03, 2025 at 02:57:36PM +0100, Maciej S. Szmigiero wrote:
> On 2.02.2025 13:45, Dr. David Alan Gilbert wrote:
> > * Maciej S. Szmigiero (m...@maciej.szmigiero.name) wrote:
> > > On 2.02.2025 03:06, Dr. David Alan Gilbert wrote:
> > > > * Maciej S. Szmigiero (m...@maciej.szmigiero.name) wrote:
> > > > > From: "Maciej S. Szmigiero" 
> > > > > 
> > > > > postcopy_ram_listen_thread() is a free running thread, so it needs to
> > > > > take BQL around function calls to migration methods requiring BQL.
> > > > > 
> > > > > qemu_loadvm_state_main() needs BQL held since it ultimately calls
> > > > > "load_state" SaveVMHandlers.
> > > > > 
> > > > > migration_incoming_state_destroy() needs BQL held since it ultimately 
> > > > > calls
> > > > > "load_cleanup" SaveVMHandlers.
> > > > > 
> > > > > Signed-off-by: Maciej S. Szmigiero 
> > > > > ---
> > > > >migration/savevm.c | 4 
> > > > >1 file changed, 4 insertions(+)
> > > > > 
> > > > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > > > index b0b74140daea..0ceea9638cc1 100644
> > > > > --- a/migration/savevm.c
> > > > > +++ b/migration/savevm.c
> > > > > @@ -2013,7 +2013,9 @@ static void *postcopy_ram_listen_thread(void 
> > > > > *opaque)
> > > > > * in qemu_file, and thus we must be blocking now.
> > > > > */
> > > > >qemu_file_set_blocking(f, true);
> > > > > +bql_lock();
> > > > >load_res = qemu_loadvm_state_main(f, mis);
> > > > > +bql_unlock();
> > > > 
> > > > Doesn't that leave that held for a heck of a long time?
> > > 
> > > Yes, and it effectively broke "postcopy recover" test but I
> > > think the reason for that is qemu_loadvm_state_main() and
> > > its children don't drop BQL while waiting for I/O.
> > > 
> > > I've described this case in more detail in my reply to Fabiano here:
> > > https://lore.kernel.org/qemu-devel/0a09e627-955e-4f26-8d08-0192ecd25...@maciej.szmigiero.name/
> > 
> > While it might be the cause in this case, my feeling is it's more 
> > fundamental
> > here - it's the whole reason that postcopy has a separate ram listen
> > thread.  As the destination is running, after it loads it's devices
> > and as it starts up the destination will be still loading RAM
> > (and other postcopiable devices) potentially for quite a while.
> > Holding the bql around the ram listen thread means that the
> > execution of the destination won't be able to take that lock
> > until the postcopy load has finished; so while that might apparently
> > complete, it'll lead to the destination stalling until that's finished
> > which defeats the whole point of postcopy.
> > That last one probably won't fail a test but it will lead to a long stall
> > if you give it a nice big guest with lots of RAM that it's rapidly
> > changing.
> 
> Okay, I understand the postcopy case/flow now.
> Thanks for explaining it clearly.
> 
> > > I still think that "load_state" SaveVMHandlers need to be called
> > > with BQL held since implementations apparently expect it that way:
> > > for example, I think PCI device configuration restore calls
> > > address space manipulation methods which abort() if called
> > > without BQL held.
> > 
> > However, the only devices that *should* be arriving on the channel
> > that the postcopy_ram_listen_thread is reading from are those
> > that are postcopiable (i.e. RAM and hmm block's dirty_bitmap).
> > Those load handlers are safe to be run while the other devices
> > are being changed.   Note the *should* - you could add a check
> > to fail if any other device arrives on that channel.
> 
> I think ultimately there should be either an explicit check, or,
> as you suggest in the paragraph below, a separate SaveVMHandler
> that runs without BQL held.

To me those are bugs happening during postcopy, so those abort()s in
memory.c are indeed for catching these issues too.

> Since the current state of just running these SaveVMHandlers
> without BQL in this case and hoping that nothing breaks is
> clearly sub-optimal.
> 
> > > I have previously even submitted a patch to explicitly document
> > > "load_state" SaveVMHandler as requiring BQL (which was also
> > > included in the previous version of this patch set) and it
> > > received a "Reviewed-by:" tag:
> > > https://lore.kernel.org/qemu-devel/6976f129df610c8207da4e531c8c0475ec204fa4.1730203967.git.maciej.szmigi...@oracle.com/
> > > https://lore.kernel.org/qemu-devel/e1949839932efaa531e2fe63ac13324e5787439c.1731773021.git.maciej.szmigi...@oracle.com/
> > > https://lore.kernel.org/qemu-devel/87o732bti7@suse.de/
> > 
> > It happens!
> > You could make this safer by having a load_state and a load_state_postcopy
> > member, and only mark the load_state as requiring the lock.
> 
> To not digress too much from the subject of this patch set
> (multifd VFIO device state transfer) for now I've just updated the
> TODO comment around that qemu_loadvm_state_main(), so hopefully this
> discussion won't get forgotten:
> https://gitlab

Re: [PATCH v4 13/33] migration/multifd: Device state transfer support - receive side

On Thu, Jan 30, 2025 at 11:08:34AM +0100, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" 
> 
> Add basic support for receiving device state via multifd channels -
> channels that are shared with RAM transfers.
> 
> Depending on whether the MULTIFD_FLAG_DEVICE_STATE flag is present or not in the
> packet header either device state (MultiFDPacketDeviceState_t) or RAM
> data (existing MultiFDPacket_t) is read.
> 
> The received device state data is provided to
> qemu_loadvm_load_state_buffer() function for processing in the
> device's load_state_buffer handler.
> 
> Signed-off-by: Maciej S. Szmigiero 

I think I acked this one.  You could keep my R-b if...

[...]

> diff --git a/migration/multifd.h b/migration/multifd.h
> index 9e4baa066312..abf3acdcee40 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -62,6 +62,12 @@ MultiFDRecvData *multifd_get_recv_data(void);
>  #define MULTIFD_FLAG_UADK (8 << 1)
>  #define MULTIFD_FLAG_QATZIP (16 << 1)
>  
> +/*
> + * If set it means that this packet contains device state
> + * (MultiFDPacketDeviceState_t), not RAM data (MultiFDPacket_t).
> + */
> +#define MULTIFD_FLAG_DEVICE_STATE (1 << 6)

... if this won't conflict with MULTIFD_FLAG_QATZIP.

I think we should stick with one way to write it, so that when rebasing you can
see such conflicts - either your patch uses 32 << 1, or perhaps we should
start switching to BIT() for all of the above instead.

-- 
Peter Xu

RE: [PATCH v5 13/17] aspeed/soc: Add AST2700 support

2025-02-03 Thread Jamin Lin

Hi Philippe,

> From: Philippe Mathieu-Daudé 
> Sent: Tuesday, February 4, 2025 12:41 AM
> To: Jamin Lin ; Cédric Le Goater ;
> Peter Maydell ; Andrew Jeffery
> ; Joel Stanley ; Alistair
> Francis ; Cleber Rosa ; Wainer
> dos Santos Moschetta ; Beraldo Leal
> ; open list:ASPEED BMCs ; open
> list:All patches CC here ; Jinjie Ruan
> 
> Cc: Troy Lee ; Yunlin Tang
> 
> Subject: Re: [PATCH v5 13/17] aspeed/soc: Add AST2700 support
> 
> On 3/2/25 08:43, Jamin Lin wrote:
> > Hi Philippe,
> >
> >> From: Jamin Lin
> >> Sent: Monday, February 3, 2025 3:29 PM
> >> To: Philippe Mathieu-Daudé ; Cédric Le Goater
> >> ; Peter Maydell ; Andrew
> >> Jeffery ; Joel Stanley ;
> >> Alistair Francis ; Cleber Rosa
> >> ; Wainer dos Santos Moschetta
> >> ; Beraldo Leal ; open
> >> list:ASPEED BMCs ; open list:All patches CC here
> >> ; Jinjie Ruan 
> >> Cc: Troy Lee ; Yunlin Tang
> >> 
> >> Subject: RE: [PATCH v5 13/17] aspeed/soc: Add AST2700 support
> >>
> >> Hi Philippe,
> >>
> >>> From: Philippe Mathieu-Daudé 
> >>> Sent: Thursday, January 30, 2025 11:14 PM
> >>> To: Jamin Lin ; Cédric Le Goater
> >>> ; Peter Maydell ; Andrew
> >>> Jeffery ; Joel Stanley
> >>> ; Alistair Francis ; Cleber
> >>> Rosa ; Wainer dos Santos Moschetta
> >> ;
> >>> Beraldo Leal ; open list:ASPEED BMCs
> >>> ; open list:All patches CC here
> >>> ; Jinjie Ruan 
> >>> Cc: Troy Lee ; Yunlin Tang
> >>> 
> >>> Subject: Re: [PATCH v5 13/17] aspeed/soc: Add AST2700 support
> >>>
> >>> Hi Jamin,
> >>>
> >>> On 4/6/24 07:44, Jamin Lin wrote:
>  Initial definitions for a simple machine using an AST2700 SOC
>  (Cortex-a35
> >>> CPU).
> 
>  AST2700 SOC and its interrupt controller are too complex to handle
>  in the common Aspeed SoC framework. We introduce a new ast2700
>  class with instance_init and realize handlers.
> 
>  AST2700 is a 64-bit quad-core CPU and supports 8 watchdogs.
>  Update maximum ASPEED_CPUS_NUM to 4 and ASPEED_WDTS_NUM to
> 8.
>  In addition, update AspeedSocState to support scuio, sli, sliio and intc.
> 
>  Add TYPE_ASPEED27X0_SOC machine type.
> 
>  The SDMC controller is unlocked at SPL stage.
>  At present, only emulation of booting from the u-boot stage is supported.
>  Set SDMC controller unlocked by default.
> 
>  In INTC, each interrupt of INT 128 to INT 136 combines 32 interrupts.
>  It connects GICINT IRQ GPIO-OUTPUT pins to GIC device with irq 128 to
> 136.
>  And, if a device irq is 128 to 136, its irq GPIO-OUTPUT pin is
>  connected to GICINT or-gates instead of GIC device.
> 
>  Signed-off-by: Troy Lee 
>  Signed-off-by: Jamin Lin 
>  ---
> hw/arm/aspeed_ast27x0.c | 563
> >>> 
> hw/arm/meson.build  |   1 +
> include/hw/arm/aspeed_soc.h |  28 +-
> 3 files changed, 590 insertions(+), 2 deletions(-)
> create mode 100644 hw/arm/aspeed_ast27x0.c
> >>>
> >>>
>  +static bool aspeed_soc_ast2700_gic_realize(DeviceState *dev, Error
>  +**errp) {
>  +Aspeed27x0SoCState *a = ASPEED27X0_SOC(dev);
>  +AspeedSoCState *s = ASPEED_SOC(dev);
>  +AspeedSoCClass *sc = ASPEED_SOC_GET_CLASS(s);
>  +SysBusDevice *gicbusdev;
>  +DeviceState *gicdev;
>  +QList *redist_region_count;
>  +int i;
>  +
>  +gicbusdev = SYS_BUS_DEVICE(&a->gic);
>  +gicdev = DEVICE(&a->gic);
>  +qdev_prop_set_uint32(gicdev, "revision", 3);
>  +qdev_prop_set_uint32(gicdev, "num-cpu", sc->num_cpus);
>  +qdev_prop_set_uint32(gicdev, "num-irq", AST2700_MAX_IRQ);
>  +
>  +redist_region_count = qlist_new();
>  +qlist_append_int(redist_region_count, sc->num_cpus);
>  +qdev_prop_set_array(gicdev, "redist-region-count",
>  + redist_region_count);
>  +
>  +if (!sysbus_realize(gicbusdev, errp)) {
>  +return false;
>  +}
>  +sysbus_mmio_map(gicbusdev, 0,
> sc->memmap[ASPEED_GIC_DIST]);
>  +sysbus_mmio_map(gicbusdev, 1,
> >> sc->memmap[ASPEED_GIC_REDIST]);
>  +
>  +for (i = 0; i < sc->num_cpus; i++) {
>  +DeviceState *cpudev = DEVICE(&a->cpu[i]);
>  +int NUM_IRQS = 256, ARCH_GIC_MAINT_IRQ = 9,
> >>> VIRTUAL_PMU_IRQ = 7;
>  +int ppibase = NUM_IRQS + i * GIC_INTERNAL + GIC_NR_SGIS;
>  +
>  +const int timer_irq[] = {
>  +[GTIMER_PHYS] = 14,
>  +[GTIMER_VIRT] = 11,
>  +[GTIMER_HYP]  = 10,
>  +[GTIMER_SEC]  = 13,
>  +};
>  +int j;
>  +
>  +for (j = 0; j < ARRAY_SIZE(timer_irq); j++) {
>  +qdev_connect_gpio_out(cpudev, j,
>  +qdev_get_gpio_in(gicdev, ppibase +
> >> timer_irq[j]));
>  +}
>  +
>  +qemu_irq irq = qdev_get_gpio_in(gicdev,
>  +ppibase +
> >>

Re: [PATCH v4 09/33] migration: postcopy_ram_listen_thread() needs to take BQL for some calls


On 3.02.2025 21:36, Peter Xu wrote:

On Mon, Feb 03, 2025 at 09:15:52PM +0100, Maciej S. Szmigiero wrote:

On 3.02.2025 20:58, Peter Xu wrote:

On Mon, Feb 03, 2025 at 02:57:36PM +0100, Maciej S. Szmigiero wrote:

On 2.02.2025 13:45, Dr. David Alan Gilbert wrote:

* Maciej S. Szmigiero (m...@maciej.szmigiero.name) wrote:

On 2.02.2025 03:06, Dr. David Alan Gilbert wrote:

* Maciej S. Szmigiero (m...@maciej.szmigiero.name) wrote:

From: "Maciej S. Szmigiero" 

postcopy_ram_listen_thread() is a free running thread, so it needs to
take BQL around function calls to migration methods requiring BQL.

qemu_loadvm_state_main() needs BQL held since it ultimately calls
"load_state" SaveVMHandlers.

migration_incoming_state_destroy() needs BQL held since it ultimately calls
"load_cleanup" SaveVMHandlers.

Signed-off-by: Maciej S. Szmigiero 
---
 migration/savevm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/migration/savevm.c b/migration/savevm.c
index b0b74140daea..0ceea9638cc1 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2013,7 +2013,9 @@ static void *postcopy_ram_listen_thread(void *opaque)
  * in qemu_file, and thus we must be blocking now.
  */
 qemu_file_set_blocking(f, true);
+bql_lock();
 load_res = qemu_loadvm_state_main(f, mis);
+bql_unlock();


Doesn't that leave that held for a heck of a long time?


Yes, and it effectively broke "postcopy recover" test but I
think the reason for that is qemu_loadvm_state_main() and
its children don't drop BQL while waiting for I/O.

I've described this case in more detail in my reply to Fabiano here:
https://lore.kernel.org/qemu-devel/0a09e627-955e-4f26-8d08-0192ecd25...@maciej.szmigiero.name/


While it might be the cause in this case, my feeling is it's more fundamental
here - it's the whole reason that postcopy has a separate ram listen
thread.  As the destination is running, after it loads its devices
and as it starts up the destination will be still loading RAM
(and other postcopiable devices) potentially for quite a while.
Holding the bql around the ram listen thread means that the
execution of the destination won't be able to take that lock
until the postcopy load has finished; so while that might apparently
complete, it'll lead to the destination stalling until that's finished
which defeats the whole point of postcopy.
That last one probably won't fail a test but it will lead to a long stall
if you give it a nice big guest with lots of RAM that it's rapidly
changing.


Okay, I understand the postcopy case/flow now.
Thanks for explaining it clearly.


I still think that "load_state" SaveVMHandlers need to be called
with BQL held since implementations apparently expect it that way:
for example, I think PCI device configuration restore calls
address space manipulation methods which abort() if called
without BQL held.


However, the only devices that *should* be arriving on the channel
that the postcopy_ram_listen_thread is reading from are those
that are postcopiable (i.e. RAM and hmm block's dirty_bitmap).
Those load handlers are safe to be run while the other devices
are being changed.   Note the *should* - you could add a check
to fail if any other device arrives on that channel.


I think ultimately there should be either an explicit check, or,
as you suggest in the paragraph below, a separate SaveVMHandler
that runs without BQL held.


To me those are bugs happening during postcopy, so those abort()s in
memory.c are indeed for catching these issues too.


Since the current state of just running these SaveVMHandlers
without BQL in this case and hoping that nothing breaks is
clearly sub-optimal.


I have previously even submitted a patch to explicitly document
"load_state" SaveVMHandler as requiring BQL (which was also
included in the previous version of this patch set) and it
received a "Reviewed-by:" tag:
https://lore.kernel.org/qemu-devel/6976f129df610c8207da4e531c8c0475ec204fa4.1730203967.git.maciej.szmigi...@oracle.com/
https://lore.kernel.org/qemu-devel/e1949839932efaa531e2fe63ac13324e5787439c.1731773021.git.maciej.szmigi...@oracle.com/
https://lore.kernel.org/qemu-devel/87o732bti7@suse.de/


It happens!
You could make this safer by having a load_state and a load_state_postcopy
member, and only mark the load_state as requiring the lock.


To not digress too much from the subject of this patch set
(multifd VFIO device state transfer) for now I've just updated the
TODO comment around that qemu_loadvm_state_main(), so hopefully this
discussion won't get forgotten:
https://gitlab.com/maciejsszmigiero/qemu/-/commit/046e3deac5b1dbc406b3e9571f62468bd6743e79


The commit message may still need some touch ups, e.g.:

postcopy_ram_listen_thread() is a free running thread, so it needs to
take BQL around function calls to migration methods requiring BQL.


This sentence is still not correct, IMHO. As Dave explained, the ram load
thread is designed to run without BQL at least for t

Re: [PATCH v4 08/33] migration/multifd: Allow premature EOF on TLS incoming channels


On 3.02.2025 21:20, Peter Xu wrote:

On Mon, Feb 03, 2025 at 07:53:00PM +0100, Maciej S. Szmigiero wrote:

On 3.02.2025 19:20, Peter Xu wrote:

On Thu, Jan 30, 2025 at 11:08:29AM +0100, Maciej S. Szmigiero wrote:

From: "Maciej S. Szmigiero" 

Multifd send channels are terminated by calling
qio_channel_shutdown(QIO_CHANNEL_SHUTDOWN_BOTH) in
multifd_send_terminate_threads(), which in the TLS case essentially
calls shutdown(SHUT_RDWR) on the underlying raw socket.

Unfortunately, this does not terminate the TLS session properly and
the receive side sees this as a GNUTLS_E_PREMATURE_TERMINATION error.

The only reason why this wasn't causing migration failures is because
the current migration code apparently does not check for migration
error being set after the end of the multifd receive process.

However, this will change soon so the multifd receive code has to be
prepared to not return an error on such premature TLS session EOF.
Use the newly introduced QIOChannelTLS method for that.

It's worth noting that even if the sender were to be changed to terminate
the TLS connection properly the receive side still needs to remain
compatible with older QEMU bit stream which does not do this.


If this is an existing bug, we could add a Fixes.


It is an existing issue but only uncovered by this patch set.

As far as I can see it was always there, so it would need some
thought where to point that Fixes tag.


If there's no way to trigger a real functional bug anyway, it's also ok we
omit the Fixes.


Two pure questions..

- What is the correct way to terminate the TLS session without this flag?


I guess one would need to call gnutls_bye() like in this GnuTLS example:
https://gitlab.com/gnutls/gnutls/-/blob/2b8c3e4c71ad380bbbffb32e6003b34ecad596e3/doc/examples/ex-client-anon.c#L102


- Why this is only needed by multifd sessions?


What uncovered the issue was switching the load threads to using
migrate_set_error() instead of their own result variable
(load_threads_ret) which you had requested during the previous
patch set version review:
https://lore.kernel.org/qemu-devel/Z1DbH5fwBaxtgrvH@x1n/

Turns out that the multifd receive code always returned
error in the TLS case, just nothing was previously checking for
that error presence.


What I was curious about is whether this issue also exists for the main migration
channel when using TLS, especially when e.g. multifd is not enabled at all.  As
I don't see anywhere that qemu uses gnutls_bye() for any tls session.

I think it's good to find that we overlooked this before.. and IMHO it's
always good we could fix this.

Does it mean we need proper gnutls_bye() somewhere?

If we need an explicit gnutls_bye(), then I wonder if that should be done
on the main channel as well.


That's a good question and looking at the code qemu_loadvm_state_main() exits
on receiving "QEMU_VM_EOF" section (that's different from receiving socket EOF)
and then optionally "QEMU_VM_VMDESCRIPTION" section is read with explicit size
in qemu_loadvm_state() - so still not until channel EOF.

Then I can't see anything else reading the channel until it is closed in
migration_incoming_state_destroy().

So most likely the main migration channel will never read far enough to
reach that GNUTLS_E_PREMATURE_TERMINATION error.


If we don't need gnutls_bye(), then should we always ignore pre-mature
termination of tls no matter if it's multifd or non-multifd channel (or
even a tls session that is not migration-related)?


So basically have this patch extended to calling
qio_channel_tls_set_premature_eof_okay() also on the main migration channel?


Thanks,


Thanks,
Maciej

Re: [PATCH v4 16/33] migration/multifd: Device state transfer support - send side

On Thu, Jan 30, 2025 at 11:08:37AM +0100, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" 
> 
> A new function multifd_queue_device_state() is provided for device to queue
> its state for transmission via a multifd channel.
> 
> Signed-off-by: Maciej S. Szmigiero 

Reviewed-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH v4 10/33] error: define g_autoptr() cleanup function for the Error type


On 3.02.2025 22:13, Daniel P. Berrangé wrote:

On Thu, Jan 30, 2025 at 11:08:31AM +0100, Maciej S. Szmigiero wrote:

From: "Maciej S. Szmigiero" 

Automatic memory management helps avoid memory safety issues.

Signed-off-by: Maciej S. Szmigiero 
---
  include/qapi/error.h | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/include/qapi/error.h b/include/qapi/error.h
index 71f8fb2c50ee..649ec8f1b6a2 100644
--- a/include/qapi/error.h
+++ b/include/qapi/error.h
@@ -437,6 +437,8 @@ Error *error_copy(const Error *err);

    */

  void error_free(Error *err);
  
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(Error, error_free)

+


This has been rejected by Markus in the past when I proposed. See the
rationale at the time here:

   https://lists.nongnu.org/archive/html/qemu-devel/2024-07/msg05503.html


Thanks for the pointer, I wasn't expecting this change to be controversial.
 

If you want this, the commit message will need to explain the use
case and justify why the existing error usage patterns are insufficient.


In this case it's about giving received Error to migrate_set_error()
which does *not* take ownership of it.

And the reason why migrate_set_error() does not take ownership of
incoming Error is that it might have an Error already set in
MigrationState, in this case it simply ignores the passed Error
(almost like being a NOP in this case).

I don't know whether this is enough of a justification for introducing
g_autoptr(Error).
I'm happy to drop this commit and change it to manual memory management
instead if it is not.

@Markus, what's your opinion here?


With regards,
Daniel


Thanks,
Maciej

Re: [PATCH v4 08/33] migration/multifd: Allow premature EOF on TLS incoming channels

On Mon, Feb 03, 2025 at 07:53:00PM +0100, Maciej S. Szmigiero wrote:
> On 3.02.2025 19:20, Peter Xu wrote:
> > On Thu, Jan 30, 2025 at 11:08:29AM +0100, Maciej S. Szmigiero wrote:
> > > From: "Maciej S. Szmigiero" 
> > > 
> > > Multifd send channels are terminated by calling
> > > qio_channel_shutdown(QIO_CHANNEL_SHUTDOWN_BOTH) in
> > > multifd_send_terminate_threads(), which in the TLS case essentially
> > > calls shutdown(SHUT_RDWR) on the underlying raw socket.
> > > 
> > > Unfortunately, this does not terminate the TLS session properly and
> > > the receive side sees this as a GNUTLS_E_PREMATURE_TERMINATION error.
> > > 
> > > The only reason why this wasn't causing migration failures is because
> > > the current migration code apparently does not check for migration
> > > error being set after the end of the multifd receive process.
> > > 
> > > However, this will change soon so the multifd receive code has to be
> > > prepared to not return an error on such premature TLS session EOF.
> > > Use the newly introduced QIOChannelTLS method for that.
> > > 
> > > It's worth noting that even if the sender were to be changed to terminate
> > > the TLS connection properly the receive side still needs to remain
> > > compatible with older QEMU bit stream which does not do this.
> > 
> > If this is an existing bug, we could add a Fixes.
> 
> It is an existing issue but only uncovered by this patch set.
> 
> As far as I can see it was always there, so it would need some
> thought where to point that Fixes tag.

If there's no way to trigger a real functional bug anyway, it's also ok we
omit the Fixes.

> > Two pure questions..
> > 
> >- What is the correct way to terminate the TLS session without this flag?
> 
> I guess one would need to call gnutls_bye() like in this GnuTLS example:
> https://gitlab.com/gnutls/gnutls/-/blob/2b8c3e4c71ad380bbbffb32e6003b34ecad596e3/doc/examples/ex-client-anon.c#L102
> 
> >- Why this is only needed by multifd sessions?
> 
> What uncovered the issue was switching the load threads to using
> migrate_set_error() instead of their own result variable
> (load_threads_ret) which you had requested during the previous
> patch set version review:
> https://lore.kernel.org/qemu-devel/Z1DbH5fwBaxtgrvH@x1n/
> 
> Turns out that the multifd receive code always returned
> error in the TLS case, just nothing was previously checking for
> that error presence.

What I was curious about is whether this issue also exists for the main migration
channel when using TLS, especially when e.g. multifd is not enabled at all.  As
I don't see anywhere that qemu uses gnutls_bye() for any tls session.

I think it's good to find that we overlooked this before.. and IMHO it's
always good we could fix this.

Does it mean we need proper gnutls_bye() somewhere?

If we need an explicit gnutls_bye(), then I wonder if that should be done
on the main channel as well.

If we don't need gnutls_bye(), then should we always ignore pre-mature
termination of tls no matter if it's multifd or non-multifd channel (or
even a tls session that is not migration-related)?

Thanks,

> 
> Another option would be to simply return to using
> load_threads_ret like the previous versions did and not
> experiment with touching global migration state because
> as we can see other places can unintentionally break.
> 
> If we go this route then these TLS EOF patches could be
> dropped.
> 
> > Thanks,
> > 
> 
> Thanks,
> Maciej
> 

-- 
Peter Xu

Re: [PATCH v2 14/15] iotests: Add qsd-migrate case

2025-02-03 Thread Kevin Wolf

Am 03.02.2025 um 20:35 hat Eric Blake geschrieben:
> On Fri, Jan 31, 2025 at 10:50:50AM +0100, Kevin Wolf wrote:
> > Test that it's possible to migrate a VM that uses an image on shared
> > storage through qemu-storage-daemon.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >  tests/qemu-iotests/tests/qsd-migrate | 132 +++
> >  tests/qemu-iotests/tests/qsd-migrate.out |  51 +
> >  2 files changed, 183 insertions(+)
> >  create mode 100755 tests/qemu-iotests/tests/qsd-migrate
> >  create mode 100644 tests/qemu-iotests/tests/qsd-migrate.out
> > 
> > diff --git a/tests/qemu-iotests/tests/qsd-migrate 
> > b/tests/qemu-iotests/tests/qsd-migrate
> > new file mode 100755
> > index 00..687bda6f93
> > --- /dev/null
> > +++ b/tests/qemu-iotests/tests/qsd-migrate
> > @@ -0,0 +1,132 @@
> > +#!/usr/bin/env python3
> > +# group: rw quick
> 
> > +
> > +with iotests.FilePath('disk.img') as path, \
> > + iotests.FilePath('nbd-src.sock', base_dir=iotests.sock_dir) as 
> > nbd_src, \
> > + iotests.FilePath('nbd-dst.sock', base_dir=iotests.sock_dir) as 
> > nbd_dst, \
> > + iotests.FilePath('migrate.sock', base_dir=iotests.sock_dir) as 
> > mig_sock, \
> > + iotests.VM(path_suffix="-src") as vm_src, \
> > + iotests.VM(path_suffix="-dst") as vm_dst:
> > +
> 
> > +
> > +iotests.log('\nTest I/O on the source')
> > +vm_src.hmp_qemu_io('virtio0/virtio-backend', 'write -P 0x11 0 4k',
> > +   use_log=True, qdev=True)
> > +vm_src.hmp_qemu_io('virtio0/virtio-backend', 'read -P 0x11 0 4k',
> > +   use_log=True, qdev=True)
> > +
> > +iotests.log('\nStarting migration...')
> 
> 
> Is it worth adding a test that qemu_io fails to write on the
> destination while it is inactive (to ensure we are properly rejecting
> modification of an inactive image)?

The problem with that is that the failure mode for qemu_io (which acts
as if it were a device, not an external interface) is an assertion
failure.

The other test (in patch 15) tests writes on the NBD export, which fails
gracefully.

> > +
> > +mig_caps = [
> > +{'capability': 'events', 'state': True},
> > +{'capability': 'pause-before-switchover', 'state': True},
> > +]
> > +vm_src.qmp_log('migrate-set-capabilities', capabilities=mig_caps)
> > +vm_dst.qmp_log('migrate-set-capabilities', capabilities=mig_caps)
> > +vm_src.qmp_log('migrate', uri=f'unix:{mig_sock}',
> > +   filters=[iotests.filter_qmp_testfiles])
> > +
> > +vm_src.event_wait('MIGRATION',
> > +  match={'data': {'status': 'pre-switchover'}})
> > +
> > +iotests.log('\nPre-switchover: Reconfigure QSD instances')
> > +
> > +iotests.log(qsd_src.qmp('blockdev-set-active', {'active': False}))
> > +iotests.log(qsd_dst.qmp('blockdev-set-active', {'active': True}))
> 
> Also, should you attempt a read on both src and dst while both sides
> are inactive, to prove that reads can take a snapshot in the middle of
> the handover?

I think this could be done without any problems.

Kevin

Re: [PATCH v4 10/33] error: define g_autoptr() cleanup function for the Error type

2025-02-03 Thread Daniel P . Berrangé

On Thu, Jan 30, 2025 at 11:08:31AM +0100, Maciej S. Szmigiero wrote:
> From: "Maciej S. Szmigiero" 
> 
> Automatic memory management helps avoid memory safety issues.
> 
> Signed-off-by: Maciej S. Szmigiero 
> ---
>  include/qapi/error.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/include/qapi/error.h b/include/qapi/error.h
> index 71f8fb2c50ee..649ec8f1b6a2 100644
> --- a/include/qapi/error.h
> +++ b/include/qapi/error.h
> @@ -437,6 +437,8 @@ Error *error_copy(const Error *err);
>   */
>  void error_free(Error *err);
>  
> +G_DEFINE_AUTOPTR_CLEANUP_FUNC(Error, error_free)
> +

This has been rejected by Markus in the past when I proposed. See the
rationale at the time here:

  https://lists.nongnu.org/archive/html/qemu-devel/2024-07/msg05503.html

If you want this, the commit message will need to explain the use
case and justify why the existing error usage patterns are insufficient.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH V1 05/26] vfio/container: preserve descriptors

2025-02-03 Thread Steven Sistare


On 2/3/2025 12:48 PM, Cédric Le Goater wrote:

On 1/29/25 15:43, Steve Sistare wrote:

At vfio creation time, save the value of vfio container, group, and device
descriptors in CPR state.  On qemu restart, vfio_realize() finds and uses
the saved descriptors, and remembers the reused status for subsequent
patches.  The reused status is cleared when vmstate load finishes.

During reuse, device and iommu state is already configured, so operations
in vfio_realize that would modify the configuration, such as vfio ioctl's,
are skipped.  The result is that vfio_realize constructs qemu data
structures that reflect the current state of the device.

Signed-off-by: Steve Sistare 
---
  hw/vfio/container.c   | 105 ++
  hw/vfio/cpr-legacy.c  |  17 +++
  include/hw/vfio/vfio-common.h |   2 +
  3 files changed, 105 insertions(+), 19 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index a90ce6c..81d0ccc 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -31,6 +31,7 @@
  #include "system/reset.h"
  #include "trace.h"
  #include "qapi/error.h"
+#include "migration/cpr.h"
  #include "pci.h"
  VFIOGroupList vfio_group_list =
@@ -415,12 +416,28 @@ static bool vfio_set_iommu(int container_fd, int group_fd,
  }
  static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
-    Error **errp)
+    bool reused, Error **errp)


Please rename 'reused' to 'cpr_reused'. We should know what this parameter
is for and I don't see any other use than CPR.


Hi Cedric, glad to virtually meet you, and thanks for reviewing this.

There is no other notion of "reused" in qemu -- CPR is the first to introduce
it.  Thus "reused" is unambiguous, it always refers to CPR.  IMO shorter names
without underscores make the code more readable, as long as they are 
unambiguous.

Also, the "reused" identifier already appears in the initial series for
cpr-transfer, and to switch now to a different identifier leaves us with two
names for the same functionality.  Right now I can cscope "reused" and find
everything.

For those reasons, I prefer reused, but if you feel strongly, I will rename it.


  {
  int iommu_type;
  const char *vioc_name;
  VFIOContainer *container;
+    /*
+ * If container is reused, just set its type and skip the ioctls, as the
+ * container and group are already configured in the kernel.
+ * VFIO_TYPE1v2_IOMMU is the only type that supports reuse/cpr.
+ */
+    if (reused) {
+    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
+    iommu_type = VFIO_TYPE1v2_IOMMU;
+    goto skip_iommu;
+    } else {
+    error_setg(errp, "container was reused but VFIO_TYPE1v2_IOMMU "
+ "is not supported");
+    return NULL;
+    }
+    }
+


Can we use 'iommu_type' below instead and avoid VFIO_CHECK_EXTENSION
ioctl ? and then set the iommu unless CPR reused is set.


Sure, I'll make that change.


  iommu_type = vfio_get_iommu_type(fd, errp);
  if (iommu_type < 0) {
  return NULL;
@@ -430,10 +447,12 @@ static VFIOContainer *vfio_create_container(int fd, 
VFIOGroup *group,
  return NULL;
  }
+skip_iommu:


I think we can avoid this 'skip_iommu' label with some minor refactoring.


  vioc_name = vfio_get_iommu_class_name(iommu_type);
  container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
  container->fd = fd;
+    container->reused = reused;
  container->iommu_type = iommu_type;
  return container;
  }
@@ -543,10 +562,13 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
  VFIOContainer *container;
  VFIOContainerBase *bcontainer;
  int ret, fd;
+    bool reused;


cpr_reused.


  VFIOAddressSpace *space;
  VFIOIOMMUClass *vioc;
  space = vfio_get_address_space(as);
+    fd = cpr_find_fd("vfio_container_for_group", group->groupid);
+    reused = (fd > 0);



hmm, so we are deducing from the existence of a CprFd state element
that we are doing a live update of the VM.  This seems to me to be a
somewhat quick heuristic.

Isn't there a global helper ? Isn't the VM aware that it's being
restarted after a live update ? I am not familiar with the CPR
sequence.


There is a global mode that can be checked, but we would still need to
fetch the fd.  Checking the fd alone yields tighter code.  It also seems
perfectly logical to me when reading the code.  Can't find the cpr fd?
Then we are not doing cpr.  BTW, it is not heuristic.  The cpr fd exists
at creation time iff we are doing cpr.


  /*
   * VFIO is currently incompatible with discarding of RAM insofar as the
@@ -579,28 +601,52 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
   * details once we know which type of IOMMU we are using.
   */
+    /*
+ * If the container is reused,

Re: [PATCH V1 04/26] vfio/container: register container for cpr

2025-02-03 Thread Steven Sistare


On 2/3/2025 12:01 PM, Cédric Le Goater wrote:

On 1/29/25 15:43, Steve Sistare wrote:

Register a legacy container for cpr-transfer.  Add a blocker if the kernel
does not support VFIO_UPDATE_VADDR or VFIO_UNMAP_ALL.

This is mostly boilerplate.  The fields to be saved and restored are added
in subsequent patches.

Signed-off-by: Steve Sistare 
---
  hw/vfio/container.c   |  6 ++--
  hw/vfio/cpr-legacy.c  | 68 +++
  hw/vfio/meson.build   |  3 +-
  include/hw/vfio/vfio-common.h |  3 ++
  4 files changed, 76 insertions(+), 4 deletions(-)
  create mode 100644 hw/vfio/cpr-legacy.c

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 4ebb526..a90ce6c 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -618,7 +618,7 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
  }
  bcontainer = &container->bcontainer;
-    if (!vfio_cpr_register_container(bcontainer, errp)) {
+    if (!vfio_legacy_cpr_register_container(container, errp)) {
  goto free_container_exit;
  }
@@ -666,7 +666,7 @@ enable_discards_exit:
  vfio_ram_block_discard_disable(container, false);
  unregister_container_exit:
-    vfio_cpr_unregister_container(bcontainer);
+    vfio_legacy_cpr_unregister_container(container);
  free_container_exit:
  object_unref(container);
@@ -710,7 +710,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
  VFIOAddressSpace *space = bcontainer->space;
  trace_vfio_disconnect_container(container->fd);
-    vfio_cpr_unregister_container(bcontainer);
+    vfio_legacy_cpr_unregister_container(container);
  close(container->fd);
  object_unref(container);
diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c
new file mode 100644
index 000..d3bbc05
--- /dev/null
+++ b/hw/vfio/cpr-legacy.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021-2025 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include 
+#include "qemu/osdep.h"
+#include "hw/vfio/vfio-common.h"
+#include "migration/blocker.h"
+#include "migration/cpr.h"
+#include "migration/migration.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+
+static bool vfio_cpr_supported(VFIOContainer *container, Error **errp)
+{
+    if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) {
+    error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR");
+    return false;
+
+    } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) {
+    error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL");
+    return false;
+
+    } else {
+    return true;
+    }
+}
+
+static const VMStateDescription vfio_container_vmstate = {
+    .name = "vfio-container",
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .needed = cpr_needed_for_reuse,
+    .fields = (VMStateField[]) {
+    VMSTATE_END_OF_LIST()
+    }
+};
+
+bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp)
+{
+    VFIOContainerBase *bcontainer = &container->bcontainer;
+    Error **cpr_blocker = &container->cpr_blocker;
+
+    if (!vfio_cpr_register_container(bcontainer, errp)) {
+    return false;
+    }
+
+    if (!vfio_cpr_supported(container, cpr_blocker)) {
+    return migrate_add_blocker_modes(cpr_blocker, errp,
+ MIG_MODE_CPR_TRANSFER, -1) == 0;
+    }
+
+    vmstate_register(NULL, -1, &vfio_container_vmstate, container);
+
+    return true;
+}
+
+void vfio_legacy_cpr_unregister_container(VFIOContainer *container)
+{
+    VFIOContainerBase *bcontainer = &container->bcontainer;
+
+    vfio_cpr_unregister_container(bcontainer);
+    migrate_del_blocker(&container->cpr_blocker);
+    vmstate_unregister(NULL, &vfio_container_vmstate, container);
+}
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index bba776f..5487815 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -5,13 +5,14 @@ vfio_ss.add(files(
    'container-base.c',
    'container.c',
    'migration.c',
-  'cpr.c',
  ))
  vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c'))
  vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files(
    'iommufd.c',
  ))
  vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
+  'cpr.c',
+  'cpr-legacy.c',
    'display.c',
    'pci-quirks.c',
    'pci.c',
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 0c60be5..53e554f 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -84,6 +84,7 @@ typedef struct VFIOContainer {
  VFIOContainerBase bcontainer;
  int fd; /* /dev/vfio/vfio, empowered by the attached groups */
  unsigned iommu_type;
+    Error *cpr_blocker;
  QLIST_HEAD(, VFIOGroup) group_list;
  } VFIOContainer;
@@ -258,6 +259,8 @@ int vfio_kvm_device_del_fd(int fd, Error

[PATCH 0/2] nbd: Allow debugging tuning of handshake limit

Reviving a patch that has been sitting in my tree for a while.  It's
mostly useful for low-level integration testing (such as debugging
libnbd as an NBD client).

Eric Blake (2):
  qemu-nbd: Allow users to adjust handshake limit
  nbd/server: Allow users to adjust handshake limit in QMP

 docs/tools/qemu-nbd.rst|  5 +
 qapi/block-export.json | 10 +
 include/block/nbd.h|  6 ++---
 block/monitor/block-hmp-cmds.c |  4 ++--
 blockdev-nbd.c | 26 ++---
 qemu-nbd.c | 41 +-
 6 files changed, 64 insertions(+), 28 deletions(-)

-- 
2.48.1

[PATCH 2/2] nbd/server: Allow users to adjust handshake limit in QMP

Although defaulting the handshake limit to 10 seconds was a nice QoI
change to weed out intentionally slow clients, it can interfere with
integration testing done with manual NBD_OPT commands over 'nbdsh
--opt-mode'.  Expose a QMP knob 'handshake-max-secs' to allow the user
to alter the timeout away from the default.

The parameter name here intentionally matches the spelling of the
constant added in commit fb1c2aaa98, and not the command-line spelling
added in the previous patch for qemu-nbd; that's because in QMP,
longer names serve as good self-documentation, and unlike the command
line, machines don't have problems generating longer spellings.

Signed-off-by: Eric Blake 
---
 qapi/block-export.json | 10 ++
 include/block/nbd.h|  6 +++---
 block/monitor/block-hmp-cmds.c |  4 ++--
 blockdev-nbd.c | 26 ++
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/qapi/block-export.json b/qapi/block-export.json
index ce33fe378df..58ae6a5e1d7 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -17,6 +17,10 @@
 #
 # @addr: Address on which to listen.
 #
+# @handshake-max-secs: Time limit, in seconds, at which a client that
+# has not completed the negotiation handshake will be disconnected,
+# or 0 for no limit (since 10.0; default: 10).
+#
 # @tls-creds: ID of the TLS credentials object (since 2.6).
 #
 # @tls-authz: ID of the QAuthZ authorization object used to validate
@@ -34,6 +38,7 @@
 ##
 { 'struct': 'NbdServerOptions',
   'data': { 'addr': 'SocketAddress',
+'*handshake-max-secs': 'uint32',
 '*tls-creds': 'str',
 '*tls-authz': 'str',
 '*max-connections': 'uint32' } }
@@ -52,6 +57,10 @@
 #
 # @addr: Address on which to listen.
 #
+# @handshake-max-secs: Time limit, in seconds, at which a client that
+# has not completed the negotiation handshake will be disconnected,
+# or 0 for no limit (since 10.0; default: 10).
+#
 # @tls-creds: ID of the TLS credentials object (since 2.6).
 #
 # @tls-authz: ID of the QAuthZ authorization object used to validate
@@ -72,6 +81,7 @@
 ##
 { 'command': 'nbd-server-start',
   'data': { 'addr': 'SocketAddressLegacy',
+'*handshake-max-secs': 'uint32',
 '*tls-creds': 'str',
 '*tls-authz': 'str',
 '*max-connections': 'uint32' },
diff --git a/include/block/nbd.h b/include/block/nbd.h
index d4f8b21aecc..92987c76fd6 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -428,9 +428,9 @@ void nbd_client_put(NBDClient *client);
 void nbd_server_is_qemu_nbd(int max_connections);
 bool nbd_server_is_running(void);
 int nbd_server_max_connections(void);
-void nbd_server_start(SocketAddress *addr, const char *tls_creds,
-  const char *tls_authz, uint32_t max_connections,
-  Error **errp);
+void nbd_server_start(SocketAddress *addr, uint32_t handshake_max_secs,
+  const char *tls_creds, const char *tls_authz,
+  uint32_t max_connections, Error **errp);
 void nbd_server_start_options(NbdServerOptions *arg, Error **errp);

 /* nbd_read
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index 1d312513fc4..0cfcbfe7c21 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -402,8 +402,8 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict)
 goto exit;
 }

-nbd_server_start(addr, NULL, NULL, NBD_DEFAULT_MAX_CONNECTIONS,
- &local_err);
+nbd_server_start(addr, NBD_DEFAULT_HANDSHAKE_MAX_SECS, NULL, NULL,
+ NBD_DEFAULT_MAX_CONNECTIONS, &local_err);
 qapi_free_SocketAddress(addr);
 if (local_err != NULL) {
 goto exit;
diff --git a/blockdev-nbd.c b/blockdev-nbd.c
index 9e61fbaf2b2..e9f53e83d48 100644
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -28,6 +28,7 @@ typedef struct NBDConn {

 typedef struct NBDServerData {
 QIONetListener *listener;
+uint32_t handshake_max_secs;
 QCryptoTLSCreds *tlscreds;
 char *tlsauthz;
 uint32_t max_connections;
@@ -84,8 +85,7 @@ static void nbd_accept(QIONetListener *listener, 
QIOChannelSocket *cioc,
 nbd_update_server_watch(nbd_server);

 qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server");
-/* TODO - expose handshake timeout as QMP option */
-nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS,
+nbd_client_new(cioc, nbd_server->handshake_max_secs,
nbd_server->tlscreds, nbd_server->tlsauthz,
nbd_blockdev_client_closed, conn);
 }
@@ -162,9 +162,9 @@ static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, 
Error **errp)
 }


-void nbd_server_start(SocketAddress *addr, const char *tls_creds,
-  const char *tls_authz, uint32_t max_connections,
-  Error **errp)
+void nbd_server_start(SocketAddress *addr, uint32_t handshake_max

Re: [PATCH V1 06/26] vfio/container: preserve DMA mappings

2025-02-03 Thread Steven Sistare


On 2/3/2025 1:25 PM, Cédric Le Goater wrote:

On 1/29/25 15:43, Steve Sistare wrote:

Preserve DMA mappings during cpr-transfer.

In the container pre_save handler, suspend the use of virtual addresses
in DMA mappings with VFIO_DMA_UNMAP_FLAG_VADDR, because guest RAM will
be remapped at a different VA after exec.  DMA to already-mapped pages
continues.

Because the vaddr is temporarily invalid, mediated devices cannot be
supported, so add a blocker for them.  This restriction will not apply
to iommufd containers when CPR is added for them in a future patch.

In new QEMU, do not register the memory listener at device creation time.
Register it later, in the container post_load handler, after all vmstate
that may affect regions and mapping boundaries has been loaded.  The
post_load registration will cause the listener to invoke its callback on
each flat section, and the calls will match the mappings remembered by the
kernel.  Modify vfio_dma_map (which is called by the listener) to pass the
new VA to the kernel using VFIO_DMA_MAP_FLAG_VADDR.

Signed-off-by: Steve Sistare 
---
  hw/vfio/container.c   | 44 +++
  hw/vfio/cpr-legacy.c  | 32 +++
  include/hw/vfio/vfio-common.h |  3 +++
  3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 81d0ccc..2b5125e 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -32,6 +32,7 @@
  #include "trace.h"
  #include "qapi/error.h"
  #include "migration/cpr.h"
+#include "migration/blocker.h"
  #include "pci.h"
  VFIOGroupList vfio_group_list =
@@ -132,6 +133,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase 
*bcontainer,
  int ret;
  Error *local_err = NULL;
+    assert(!container->reused);
+
  if (iotlb && vfio_devices_all_dirty_tracking_started(bcontainer)) {
  if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
  bcontainer->dirty_pages_supported) {
@@ -183,12 +186,24 @@ static int vfio_legacy_dma_map(const VFIOContainerBase 
*bcontainer, hwaddr iova,
    bcontainer);
  struct vfio_iommu_type1_dma_map map = {
  .argsz = sizeof(map),
-    .flags = VFIO_DMA_MAP_FLAG_READ,
  .vaddr = (__u64)(uintptr_t)vaddr,
  .iova = iova,
  .size = size,
  };
+    /*
+ * Set the new vaddr for any mappings registered during cpr load.
+ * Reused is cleared thereafter.
+ */
+    if (container->reused) {
+    map.flags = VFIO_DMA_MAP_FLAG_VADDR;
+    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
+    goto fail;
+    }
+    return 0;
+    }


This is a bit ugly.

On reaching vfio_attach_device(), could we detect that CPR is
in progress and temporarily replace the 'VFIOIOMMUClass *' with a set of
CPR-specific handlers?


Good idea, I'll try it.  I wrote this code years ago before the dma
map and unmap functions were defined in an ops vector.


+
+    map.flags = VFIO_DMA_MAP_FLAG_READ;
  if (!readonly) {
  map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
  }
@@ -205,7 +220,11 @@ static int vfio_legacy_dma_map(const VFIOContainerBase 
*bcontainer, hwaddr iova,
  return 0;
  }
-    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
+fail:
+    error_report("vfio_dma_map %s (iova %lu, size %ld, va %p): %s",
+    (container->reused ? "VADDR" : ""), iova, size, vaddr,
+    strerror(errno));
+



FYI, I am currently trying to remove this error report.



  return -errno;
  }
@@ -689,8 +708,17 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
  group->container = container;
  QLIST_INSERT_HEAD(&container->group_list, group, container_next);
-    bcontainer->listener = vfio_memory_listener;
-    memory_listener_register(&bcontainer->listener, bcontainer->space->as);
+    /*
+ * If reused, register the listener later, after all state that may
+ * affect regions and mapping boundaries has been cpr load'ed.  Later,
+ * the listener will invoke its callback on each flat section and call
+ * vfio_dma_map to supply the new vaddr, and the calls will match the
+ * mappings remembered by the kernel.
+ */
+    if (!reused) {
+    bcontainer->listener = vfio_memory_listener;
+    memory_listener_register(&bcontainer->listener, bcontainer->space->as);
+    }


Oh! This is an important change. Please move it into its own patch.


OK.


  if (bcontainer->error) {
  error_propagate_prepend(errp, bcontainer->error,
@@ -1002,6 +1030,13 @@ static bool vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
  return false;
  }
+    if (vbasedev->mdev) {
+    error_setg(&vbasedev->cpr_mdev_blocker,
+   "CPR does not support vfio mdev %s", vbasedev->name);
+    migrate_add_blocker_modes(&vbasedev->cpr_mdev_blocker, &error_fatal

[PATCH 1/2] qemu-nbd: Allow users to adjust handshake limit