[PATCH rfcv2 11/20] intel_iommu: Check for compatibility with IOMMUFD backed device when x-flts=on

2025-02-19 Thread Zhenzhong Duan
When vIOMMU is configured with x-flts=on in scalable mode, the stage-1 page
table is passed to the host to construct a nested page table. We need to check
compatibility of some critical IOMMU capabilities between vIOMMU and
host IOMMU to ensure the guest stage-1 page table could be used by the host.

For instance, if vIOMMU supports stage-1 1GB huge page mapping but the host
does not, then attaching this IOMMUFD backed device should fail.

Declare an enum type host_iommu_device_iommu_hw_info_type aliased to
iommu_hw_info_type, which comes from the iommufd header file. This avoids a
build failure on Windows, which doesn't support iommufd.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 include/system/host_iommu_device.h | 13 
 hw/i386/intel_iommu.c  | 34 ++
 2 files changed, 47 insertions(+)

diff --git a/include/system/host_iommu_device.h 
b/include/system/host_iommu_device.h
index 250600fc1d..aa3885d7ee 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -133,5 +133,18 @@ struct HostIOMMUDeviceClass {
 #define HOST_IOMMU_DEVICE_CAP_FS1GP 3
 #define HOST_IOMMU_DEVICE_CAP_ERRATA4
 
+/**
+ * enum host_iommu_device_iommu_hw_info_type - IOMMU Hardware Info Types
+ * @HOST_IOMMU_DEVICE_IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not
+ * report hardware info
+ * @HOST_IOMMU_DEVICE_IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
+ *
+ * This is alias to enum iommu_hw_info_type but for general purpose.
+ */
+enum host_iommu_device_iommu_hw_info_type {
+HOST_IOMMU_DEVICE_IOMMU_HW_INFO_TYPE_NONE,
+HOST_IOMMU_DEVICE_IOMMU_HW_INFO_TYPE_INTEL_VTD,
+};
+
 #define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX   64
 #endif
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7709f55be5..9de60e607d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -39,6 +39,7 @@
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
 #include "trace.h"
+#include "system/iommufd.h"
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -4346,6 +4347,39 @@ static bool vtd_check_hiod(IntelIOMMUState *s, 
HostIOMMUDevice *hiod,
 return true;
 }
 
+/* Remaining checks are all stage-1 translation specific */
+if (!object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
+error_setg(errp, "Need IOMMUFD backend when x-flts=on");
+return false;
+}
+
+ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE, errp);
+if (ret < 0) {
+return false;
+}
+if (ret != HOST_IOMMU_DEVICE_IOMMU_HW_INFO_TYPE_INTEL_VTD) {
+error_setg(errp, "Incompatible host platform IOMMU type %d", ret);
+return false;
+}
+
+ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_NESTING, errp);
+if (ret < 0) {
+return false;
+}
+if (ret != 1) {
+error_setg(errp, "Host IOMMU doesn't support nested translation");
+return false;
+}
+
+ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_FS1GP, errp);
+if (ret < 0) {
+return false;
+}
+if (s->fs1gp && ret != 1) {
+error_setg(errp, "Stage-1 1GB huge page is unsupported by host IOMMU");
+return false;
+}
+
 error_setg(errp, "host device is uncompatible with stage-1 translation");
 return false;
 }
-- 
2.34.1




[PATCH rfcv2 15/20] intel_iommu: ERRATA_772415 workaround

2025-02-19 Thread Zhenzhong Duan
On a system affected by ERRATA_772415, IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
is reported by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even a readonly
range mapped in the stage-2 page table could still be written.

Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
Update, Errata Details, SPR17.

[0] 
https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update

We utilize the newly added IOMMUFD container/ioas/hwpt management framework in
VTD. Add a check to create a new VTDIOASContainer to hold RW-only mappings,
then this VTDIOASContainer can be used as the backend for a device with
ERRATA_772415. See below diagram for details:

  IntelIOMMUState
 |
 V
.--..--..---.
| VTDIOASContainer |--->| VTDIOASContainer |--->| VTDIOASContainer  |-->...
| (iommufd0,RW&RO) || (iommufd1,RW&RO) || (iommufd0,RW only)|
.--..--..---.
 |   |  |
 |   .-->...|
 V  V
  .---..---.  .---.
  |   VTDS2Hwpt(CC)   |--->| VTDS2Hwpt(non-CC) |-->...| VTDS2Hwpt(CC) 
|-->...
  .---..---.  .---.
  ||   ||
  ||   ||
.---.  .---.  ..  ..
| IOMMUFD   |  | IOMMUFD   |  | IOMMUFD|  | IOMMUFD|
| Device(CC)|  | Device(CC)|  | Device |  | Device(CC) |
| (iommufd0)|  | (iommufd0)|  | (non-CC)   |  | (errata)   |
|   |  |   |  | (iommufd0) |  | (iommufd0) |
.---.  .---.  ..  ..

Changed to pass VTDHostIOMMUDevice pointer to vtd_check_hdev() so errata
could be saved.

Suggested-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  1 +
 include/hw/i386/intel_iommu.h  |  1 +
 hw/i386/intel_iommu.c  | 26 +++---
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 23b7e236b0..8558781af8 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -654,5 +654,6 @@ typedef struct VTDHostIOMMUDevice {
 PCIBus *bus;
 uint8_t devfn;
 HostIOMMUDevice *hiod;
+uint32_t errata;
 } VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 594281c1d3..9b156dc32e 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -103,6 +103,7 @@ typedef struct VTDPASIDCacheEntry {
 typedef struct VTDIOASContainer {
 struct IOMMUFDBackend *iommufd;
 uint32_t ioas_id;
+uint32_t errata;
 MemoryListener listener;
 QLIST_HEAD(, VTDS2Hwpt) s2_hwpt_list;
 QLIST_ENTRY(VTDIOASContainer) next;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e36ac44110..dae1716629 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2443,7 +2443,8 @@ static void vtd_context_global_invalidate(IntelIOMMUState 
*s)
 }
 
 #ifdef CONFIG_IOMMUFD
-static bool iommufd_listener_skipped_section(MemoryRegionSection *section)
+static bool iommufd_listener_skipped_section(VTDIOASContainer *container,
+ MemoryRegionSection *section)
 {
 return !memory_region_is_ram(section->mr) ||
memory_region_is_protected(section->mr) ||
@@ -2453,7 +2454,8 @@ static bool 
iommufd_listener_skipped_section(MemoryRegionSection *section)
 * are never accessed by the CPU and beyond the address width of
 * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
 */
-   section->offset_within_address_space & (1ULL << 63);
+   section->offset_within_address_space & (1ULL << 63) ||
+   (container->errata && section->readonly);
 }
 
 static void iommufd_listener_region_add_s2domain(MemoryListener *listener,
@@ -2469,7 +2471,7 @@ static void 
iommufd_listener_region_add_s2domain(MemoryListener *listener,
 Error *err = NULL;
 int ret;
 
-if (iommufd_listener_skipped_section(section)) {
+if (iommufd_listener_skipped_section(container, section)) {
 return;
 }
 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
@@ -2520,7 +2522,7 @@ static void 
iommufd_listener_region_del_s2domain(MemoryListener *listener,
 Int128 llend, llsize;
 int ret;
 
-if (iommufd_listener_skipped_section(se

[PATCH rfcv2 14/20] intel_iommu: Bind/unbind guest page table to host

2025-02-19 Thread Zhenzhong Duan
This captures the guest PASID table entry modifications and
propagates the changes to host to attach a hwpt with type determined
per guest PGTT configuration.

When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
the hwpt on host side is a nested page table.

The guest page table is configured as stage-1 page table (gIOVA->GPA)
whose translation result would further go through host VT-d stage-2
page table(GPA->HPA) under nested translation mode. This is the key
to support gIOVA over stage-1 page table for Intel VT-d in
virtualization environment.

Stage-2 page table could be shared by different devices if there is
no conflict and devices link to same iommufd object, i.e. devices
under same host IOMMU can share same stage-2 page table. If there
is conflict, i.e. there is one device under non cache coherency
mode which is different from others, it requires a separate
stage-2 page table in non-CC mode.

See below example diagram:

  IntelIOMMUState
 |
 V
.--..--.
| VTDIOASContainer |--->| VTDIOASContainer |--->...
|(iommufd0)||(iommufd1)|
.--..--.
 |   |
 |   .-->...
 V
  .---..---.
  |   VTDS2Hwpt(CC)   |--->| VTDS2Hwpt(non-CC) |-->...
  .---..---.
  ||   |
  ||   |
.---.  .---.  ..
| IOMMUFD   |  | IOMMUFD   |  | IOMMUFD|
| Device(CC)|  | Device(CC)|  | Device |
| (iommufd0)|  | (iommufd0)|  | (non-CC)   |
|   |  |   |  | (iommufd0) |
.---.  .---.  ..

Co-Authored-by: Yi Liu 
Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  11 +
 include/hw/i386/intel_iommu.h  |  24 ++
 hw/i386/intel_iommu.c  | 581 +++--
 hw/i386/trace-events   |   8 +
 4 files changed, 604 insertions(+), 20 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 632fda2853..23b7e236b0 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -563,6 +563,13 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1  0xffe0ULL
 
+typedef enum VTDPASIDOp {
+VTD_PASID_BIND,
+VTD_PASID_UPDATE,
+VTD_PASID_UNBIND,
+VTD_OP_NUM
+} VTDPASIDOp;
+
 typedef enum VTDPCInvType {
 /* force reset all */
 VTD_PASID_CACHE_FORCE_RESET = 0,
@@ -578,6 +585,7 @@ typedef struct VTDPASIDCacheInfo {
 uint32_t pasid;
 PCIBus *bus;
 uint16_t devfn;
+bool error_happened;
 } VTDPASIDCacheInfo;
 
 /* PASID Table Related Definitions */
@@ -606,6 +614,9 @@ typedef struct VTDPASIDCacheInfo {
 
 #define VTD_SM_PASID_ENTRY_FLPM  3ULL
 #define VTD_SM_PASID_ENTRY_FLPTPTR   (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_SRE_BIT(val)  (!!((val) & 1ULL))
+#define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
+#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
 
 /* First Level Paging Structure */
 /* Masks for First Level Paging Entry */
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index fbc9da903a..594281c1d3 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -100,10 +100,32 @@ typedef struct VTDPASIDCacheEntry {
 bool cache_filled;
 } VTDPASIDCacheEntry;
 
+typedef struct VTDIOASContainer {
+struct IOMMUFDBackend *iommufd;
+uint32_t ioas_id;
+MemoryListener listener;
+QLIST_HEAD(, VTDS2Hwpt) s2_hwpt_list;
+QLIST_ENTRY(VTDIOASContainer) next;
+Error *error;
+} VTDIOASContainer;
+
+typedef struct VTDS2Hwpt {
+uint32_t users;
+uint32_t hwpt_id;
+VTDIOASContainer *container;
+QLIST_ENTRY(VTDS2Hwpt) next;
+} VTDS2Hwpt;
+
+typedef struct VTDHwpt {
+uint32_t hwpt_id;
+VTDS2Hwpt *s2_hwpt;
+} VTDHwpt;
+
 struct VTDAddressSpace {
 PCIBus *bus;
 uint8_t devfn;
 uint32_t pasid;
+VTDHwpt hwpt;
 AddressSpace as;
 IOMMUMemoryRegion iommu;
 MemoryRegion root;  /* The root container of the device */
@@ -303,6 +325,8 @@ struct IntelIOMMUState {
 
 GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
 
+QLIST_HEAD(, VTDIOASContainer) containers;
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index b8f3b85803..e36ac44110 100644
--- a/hw/i3

[PATCH rfcv2 00/20] intel_iommu: Enable stage-1 translation for passthrough device

2025-02-19 Thread Zhenzhong Duan
Hi,

Per Jason Wang's suggestion, iommufd nesting series[1] is split into
"Enable stage-1 translation for emulated device" series and
"Enable stage-1 translation for passthrough device" series.

This series is 2nd part focusing on passthrough device. We don't do
shadowing of guest page table for passthrough device but pass stage-1
page table to host side to construct a nested domain. There was some
effort to enable this feature in old days, see [2] for details.

The key design is to utilize the dual-stage IOMMU translation
(also known as IOMMU nested translation) capability in host IOMMU.
As the below diagram shows, guest I/O page table pointer in GPA
(guest physical address) is passed to host and be used to perform
the stage-1 address translation. Along with it, modifications to
present mappings in the guest I/O page table should be followed
with an IOTLB invalidation.

.-.  .---.
|   vIOMMU|  | Guest I/O page table  |
| |  '---'
./
| PASID Entry |--- PASID cache flush --+
'-'|
| |V
| |   I/O page table pointer in GPA
'-'
Guest
--| Shadow |---|
  vv   v
Host
.-.  ..
|   pIOMMU|  |  FS for GIOVA->GPA |
| |  ''
./  |
| PASID Entry | V (Nested xlate)
'\.--.
| |   | SS for GPA->HPA, unmanaged domain|
| |   '--'
'-'
Where:
 - FS = First stage page tables
 - SS = Second stage page tables


There are some interactions between VFIO and vIOMMU
* vIOMMU registers PCIIOMMUOps [set|unset]_iommu_device to PCI
  subsystem. VFIO calls them to register/unregister HostIOMMUDevice
  instance to vIOMMU at vfio device realize stage.
* vIOMMU calls HostIOMMUDeviceIOMMUFD interface [at|de]tach_hwpt
  to bind/unbind device to IOMMUFD backed domains, either nested
  domain or not.

See below diagram:

VFIO Device Intel IOMMU
.-. .---.
| | |   |
|   .-|PCIIOMMUOps  |.-.|
|   | IOMMUFD |(set_iommu_device)   || Host IOMMU  ||
|   | Device  |>|| Device list ||
|   .-|(unset_iommu_device) |.-.|
| | |   |   |
| | |   V   |
|   .-|  HostIOMMUDeviceIOMMUFD |  .-.  |
|   | IOMMUFD |(attach_hwpt)|  | Host IOMMU  |  |
|   | link|<|  |   Device|  |
|   .-|(detach_hwpt)|  .-.  |
| | |   |   |
| | |   ... |
.-. .---.

Based on Yi's suggestion, this design is optimal in sharing ioas/hwpt
whenever possible and create new one on demand, also supports multiple
iommufd objects and ERRATA_772415.

E.g., Stage-2 page table could be shared by different devices if there
is no conflict and devices link to same iommufd object, i.e. devices
under same host IOMMU can share same stage-2 page table. If there is
conflict, i.e. there is one device under non cache coherency mode
which is different from others, it requires a separate stage-2 page
table in non-CC mode.

SPR platform has ERRATA_772415 which requires no readonly mappings
in stage-2 page table. This series supports creating VTDIOASContainer
with no readonly mappings. If there is a rare case that some IOMMUs
on a multiple IOMMU host have ERRATA_772415 and others not, this
design can still survive.

See below example diagram for a full view:

  IntelIOMMUState
 |
 V
.--..--..---.
| VTDIOASContainer |--->| VTDIOASContainer |--->| VTDIOASContainer  |-->...
| (iommufd0,RW&RO) || (iommufd1,RW&RO) || (iommufd0,RW only)|
.--..--..---.
 |   |  |
 |   .-->...|
 V  V
  .---..---.  .---.

[PATCH rfcv2 16/20] intel_iommu: Replay pasid binds after context cache invalidation

2025-02-19 Thread Zhenzhong Duan
From: Yi Liu 

This replays guest pasid attachments after context cache invalidation.
This is a behavior to ensure safety. Actually, a programmer should issue a
pasid cache invalidation with proper granularity after issuing a context
cache invalidation.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  1 +
 hw/i386/intel_iommu.c  | 51 --
 hw/i386/trace-events   |  1 +
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 8558781af8..8f7be7f123 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -575,6 +575,7 @@ typedef enum VTDPCInvType {
 VTD_PASID_CACHE_FORCE_RESET = 0,
 /* pasid cache invalidation rely on guest PASID entry */
 VTD_PASID_CACHE_GLOBAL_INV,
+VTD_PASID_CACHE_DEVSI,
 VTD_PASID_CACHE_DOMSI,
 VTD_PASID_CACHE_PASIDSI,
 } VTDPCInvType;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index dae1716629..e7376ba6a7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -91,6 +91,10 @@ static void vtd_address_space_refresh_all(IntelIOMMUState 
*s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
 static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+static void vtd_pasid_cache_sync(IntelIOMMUState *s,
+ VTDPASIDCacheInfo *pc_info);
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  PCIBus *bus, uint16_t devfn);
 
 static void vtd_panic_require_caching_mode(void)
 {
@@ -2423,6 +2427,8 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
 
 static void vtd_context_global_invalidate(IntelIOMMUState *s)
 {
+VTDPASIDCacheInfo pc_info = { .error_happened = false, };
+
 trace_vtd_inv_desc_cc_global();
 /* Protects context cache */
 vtd_iommu_lock(s);
@@ -2440,6 +2446,9 @@ static void vtd_context_global_invalidate(IntelIOMMUState 
*s)
  * VT-d emulation codes.
  */
 vtd_iommu_replay_all(s);
+
+pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
+vtd_pasid_cache_sync(s, &pc_info);
 }
 
 #ifdef CONFIG_IOMMUFD
@@ -2995,6 +3004,21 @@ static void 
vtd_context_device_invalidate(IntelIOMMUState *s,
  * happened.
  */
 vtd_address_space_sync(vtd_as);
+/*
+ * Per spec, context flush should also followed with PASID
+ * cache and iotlb flush. Regards to a device selective
+ * context cache invalidation:
+ * if (emaulted_device)
+ *invalidate pasid cache and pasid-based iotlb
+ * else if (assigned_device)
+ *check if the device has been bound to any pasid
+ *invoke pasid_unbind regards to each bound pasid
+ * Here, we have vtd_pasid_cache_devsi() to invalidate pasid
+ * caches, while for piotlb in QEMU, we don't have it yet, so
+ * no handling. For assigned device, host iommu driver would
+ * flush piotlb when a pasid unbind is pass down to it.
+ */
+ vtd_pasid_cache_devsi(s, vtd_as->bus, devfn);
 }
 }
 }
@@ -3743,6 +3767,11 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer 
value,
 /* Fall through */
 case VTD_PASID_CACHE_GLOBAL_INV:
 break;
+case VTD_PASID_CACHE_DEVSI:
+if (pc_info->bus != vtd_as->bus || pc_info->devfn != vtd_as->devfn) {
+return false;
+}
+break;
 default:
 error_report("invalid pc_info->type");
 abort();
@@ -3934,6 +3963,11 @@ static void 
vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
 case VTD_PASID_CACHE_GLOBAL_INV:
 /* loop all assigned devices */
 break;
+case VTD_PASID_CACHE_DEVSI:
+walk_info.bus = pc_info->bus;
+walk_info.devfn = pc_info->devfn;
+vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
+return;
 case VTD_PASID_CACHE_FORCE_RESET:
 /* For force reset, no need to go further replay */
 return;
@@ -3968,8 +4002,7 @@ static void 
vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
  * It includes updating the pasid cache in vIOMMU and updating the
  * pasid bindings per guest's latest pasid entry presence.
  */
-static void vtd_pasid_cache_sync(IntelIOMMUState *s,
- VTDPASIDCacheInfo *pc_info)
+static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo 
*pc_info)
 {
 if (!s->flts || !s->root_scalable || !s->dmar_enabled) {
 return;
@@ -4027,6 +4060,20 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
 vtd_iommu_unlock(s);
 }
 
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  PCIBus *bus, uint16_t devfn)
+{
+VTDPASIDCacheInfo pc_info = { .error_happened = false, };
+
+tra

[PATCH rfcv2 07/20] iommufd: Implement query of HOST_IOMMU_DEVICE_CAP_[NESTING|FS1GP]

2025-02-19 Thread Zhenzhong Duan
Implement query of HOST_IOMMU_DEVICE_CAP_[NESTING|FS1GP] for IOMMUFD
backed host IOMMU device.

Query on these two capabilities is not supported for the legacy backend
because there is no plan to support nesting with a legacy backend backed
host device.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  1 +
 backends/iommufd.c |  4 
 hw/vfio/iommufd.c  | 11 +++
 3 files changed, 16 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index e8b211e8b0..2cda744786 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -191,6 +191,7 @@
 #define VTD_ECAP_PT (1ULL << 6)
 #define VTD_ECAP_SC (1ULL << 7)
 #define VTD_ECAP_MHMV   (15ULL << 20)
+#define VTD_ECAP_NEST   (1ULL << 26)
 #define VTD_ECAP_SRS(1ULL << 31)
 #define VTD_ECAP_PASID  (1ULL << 40)
 #define VTD_ECAP_SMTS   (1ULL << 43)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 574f330c27..0a1a40cbba 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -370,6 +370,10 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int 
cap, Error **errp)
 return caps->type;
 case HOST_IOMMU_DEVICE_CAP_AW_BITS:
 return vfio_device_get_aw_bits(hiod->agent);
+case HOST_IOMMU_DEVICE_CAP_NESTING:
+return caps->nesting;
+case HOST_IOMMU_DEVICE_CAP_FS1GP:
+return caps->fs1gp;
 default:
 error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
 return -EINVAL;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 175c4fe1f4..df6a12d200 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -26,6 +26,7 @@
 #include "qemu/chardev_open.h"
 #include "pci.h"
 #include "exec/ram_addr.h"
+#include "hw/i386/intel_iommu_internal.h"
 
 static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly)
@@ -843,6 +844,16 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice 
*hiod, void *opaque,
 caps->type = type;
 caps->hw_caps = hw_caps;
 
+switch (type) {
+case IOMMU_HW_INFO_TYPE_INTEL_VTD:
+caps->nesting = !!(data.vtd.ecap_reg & VTD_ECAP_NEST);
+caps->fs1gp = !!(data.vtd.cap_reg & VTD_CAP_FS1GP);
+break;
+case IOMMU_HW_INFO_TYPE_ARM_SMMUV3:
+case IOMMU_HW_INFO_TYPE_NONE:
+break;
+}
+
 return true;
 }
 
-- 
2.34.1




[PATCH rfcv2 08/20] iommufd: Implement query of HOST_IOMMU_DEVICE_CAP_ERRATA

2025-02-19 Thread Zhenzhong Duan
Implement query of HOST_IOMMU_DEVICE_CAP_ERRATA for IOMMUFD
backed host IOMMU device.

Query on this capability is not supported for the legacy backend
because there is no plan to support nesting with a legacy backend
backed host device.

Signed-off-by: Zhenzhong Duan 
---
 include/system/host_iommu_device.h | 2 ++
 backends/iommufd.c | 2 ++
 hw/vfio/iommufd.c  | 1 +
 3 files changed, 5 insertions(+)

diff --git a/include/system/host_iommu_device.h 
b/include/system/host_iommu_device.h
index 18f8b5e5cf..250600fc1d 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -32,6 +32,7 @@ typedef struct HostIOMMUDeviceCaps {
 uint64_t hw_caps;
 bool nesting;
 bool fs1gp;
+uint32_t errata;
 } HostIOMMUDeviceCaps;
 
 #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
@@ -130,6 +131,7 @@ struct HostIOMMUDeviceClass {
 #define HOST_IOMMU_DEVICE_CAP_AW_BITS   1
 #define HOST_IOMMU_DEVICE_CAP_NESTING   2
 #define HOST_IOMMU_DEVICE_CAP_FS1GP 3
+#define HOST_IOMMU_DEVICE_CAP_ERRATA4
 
 #define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX   64
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 0a1a40cbba..3c23caef96 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -374,6 +374,8 @@ static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int 
cap, Error **errp)
 return caps->nesting;
 case HOST_IOMMU_DEVICE_CAP_FS1GP:
 return caps->fs1gp;
+case HOST_IOMMU_DEVICE_CAP_ERRATA:
+return caps->errata;
 default:
 error_setg(errp, "%s: unsupported capability %x", hiod->name, cap);
 return -EINVAL;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index df6a12d200..58bff030e1 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -848,6 +848,7 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice 
*hiod, void *opaque,
 case IOMMU_HW_INFO_TYPE_INTEL_VTD:
 caps->nesting = !!(data.vtd.ecap_reg & VTD_ECAP_NEST);
 caps->fs1gp = !!(data.vtd.cap_reg & VTD_CAP_FS1GP);
+caps->errata = data.vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
 break;
 case IOMMU_HW_INFO_TYPE_ARM_SMMUV3:
 case IOMMU_HW_INFO_TYPE_NONE:
-- 
2.34.1




[PATCH rfcv2 18/20] intel_iommu: Refresh pasid bind when either SRTP or TE bit is changed

2025-02-19 Thread Zhenzhong Duan
From: Yi Liu 

When either 'Set Root Table Pointer' or 'Translation Enable' bit is changed,
the pasid bindings on host side become stale and need to be updated.

Introduce a helper function vtd_refresh_pasid_bind() for that purpose.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8f7fb473f5..225e332132 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -89,6 +89,7 @@ struct vtd_iotlb_key {
 
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
+static void vtd_refresh_pasid_bind(IntelIOMMUState *s);
 
 static void vtd_pasid_cache_reset(IntelIOMMUState *s);
 static void vtd_pasid_cache_sync(IntelIOMMUState *s,
@@ -3366,6 +3367,7 @@ static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
 vtd_reset_caches(s);
 vtd_address_space_refresh_all(s);
+vtd_refresh_pasid_bind(s);
 }
 
 /* Set Interrupt Remap Table Pointer */
@@ -3400,6 +3402,7 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool 
en)
 
 vtd_reset_caches(s);
 vtd_address_space_refresh_all(s);
+vtd_refresh_pasid_bind(s);
 }
 
 /* Handle Interrupt Remap Enable/Disable */
@@ -4109,6 +4112,28 @@ static void 
vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
 }
 }
 
+static void vtd_refresh_pasid_bind(IntelIOMMUState *s)
+{
+VTDPASIDCacheInfo pc_info = { .error_happened = false,
+  .type = VTD_PASID_CACHE_GLOBAL_INV };
+
+/*
+ * Only when dmar is enabled, should pasid bindings replayed,
+ * otherwise no need to replay.
+ */
+if (!s->dmar_enabled) {
+return;
+}
+
+if (!s->flts || !s->root_scalable) {
+return;
+}
+
+vtd_iommu_lock(s);
+vtd_replay_guest_pasid_bindings(s, &pc_info);
+vtd_iommu_unlock(s);
+}
+
 /*
  * This function syncs the pasid bindings between guest and host.
  * It includes updating the pasid cache in vIOMMU and updating the
-- 
2.34.1




[PATCH rfcv2 13/20] intel_iommu: Add PASID cache management infrastructure

2025-02-19 Thread Zhenzhong Duan
This adds a new entry VTDPASIDCacheEntry in VTDAddressSpace to cache the
pasid entry, track PASID usage, and support future PASID tagged DMA address
translation in vIOMMU.

VTDAddressSpace of PCI_NO_PASID is allocated when device is plugged and
never freed. For other pasid, VTDAddressSpace instance is created/destroyed
per the guest pasid entry set up/destroy for passthrough devices. While for
emulated devices, VTDAddressSpace instance is created in the PASID tagged DMA
translation and be destroyed per guest PASID cache invalidation. This focuses
on the PASID cache management for passthrough devices as there is no PASID
capable emulated devices yet.

When guest modifies a PASID entry, QEMU will capture the guest pasid selective
pasid cache invalidation, allocate or remove a VTDAddressSpace instance per the
invalidation reasons:

*) a present pasid entry moved to non-present
*) a present pasid entry to be a present entry
*) a non-present pasid entry moved to present

vIOMMU emulator could figure out the reason by fetching latest guest pasid entry
and compare it with the PASID cache.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  29 ++
 include/hw/i386/intel_iommu.h  |   6 +
 hw/i386/intel_iommu.c  | 484 -
 hw/i386/trace-events   |   4 +
 4 files changed, 513 insertions(+), 10 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 18bc22fc72..632fda2853 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -315,6 +315,7 @@ typedef enum VTDFaultReason {
   * request while disabled */
 VTD_FR_IR_SID_ERR = 0x26,   /* Invalid Source-ID */
 
+VTD_FR_RTADDR_INV_TTM = 0x31,  /* Invalid TTM in RTADDR */
 /* PASID directory entry access failure */
 VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
 /* The Present(P) field of pasid directory entry is 0 */
@@ -492,6 +493,15 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff0f1c0ULL
 #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
 
+#define VTD_INV_DESC_PASIDC_G  (3ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID(val) (((val) >> 32) & 0xfULL)
+#define VTD_INV_DESC_PASIDC_DID(val)   (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PASIDC_RSVD_VAL0  0xfff0f1c0ULL
+
+#define VTD_INV_DESC_PASIDC_DSI(0ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
+#define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
@@ -548,10 +558,28 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_CTX_ENTRY_LEGACY_SIZE 16
 #define VTD_CTX_ENTRY_SCALABLE_SIZE   32
 
+#define VTD_SM_CONTEXT_ENTRY_PDTS(val)  (((val) >> 9) & 0x7)
 #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xf
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
 #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1  0xffe0ULL
 
+typedef enum VTDPCInvType {
+/* force reset all */
+VTD_PASID_CACHE_FORCE_RESET = 0,
+/* pasid cache invalidation rely on guest PASID entry */
+VTD_PASID_CACHE_GLOBAL_INV,
+VTD_PASID_CACHE_DOMSI,
+VTD_PASID_CACHE_PASIDSI,
+} VTDPCInvType;
+
+typedef struct VTDPASIDCacheInfo {
+VTDPCInvType type;
+uint16_t domain_id;
+uint32_t pasid;
+PCIBus *bus;
+uint16_t devfn;
+} VTDPASIDCacheInfo;
+
 /* PASID Table Related Definitions */
 #define VTD_PASID_DIR_BASE_ADDR_MASK  (~0xfffULL)
 #define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL)
@@ -563,6 +591,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_PASID_TABLE_BITS_MASK (0x3fULL)
 #define VTD_PASID_TABLE_INDEX(pasid)  ((pasid) & VTD_PASID_TABLE_BITS_MASK)
 #define VTD_PASID_ENTRY_FPD   (1ULL << 1) /* Fault Processing Disable 
*/
+#define VTD_PASID_TBL_ENTRY_NUM   (1ULL << 6)
 
 /* PASID Granular Translation Type Mask */
 #define VTD_PASID_ENTRY_P  1ULL
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 50f9b27a45..fbc9da903a 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -95,6 +95,11 @@ struct VTDPASIDEntry {
 uint64_t val[8];
 };
 
+typedef struct VTDPASIDCacheEntry {
+struct VTDPASIDEntry pasid_entry;
+bool cache_filled;
+} VTDPASIDCacheEntry;
+
 struct VTDAddressSpace {
 PCIBus *bus;
 uint8_t devfn;
@@ -107,6 +112,7 @@ struct VTDAddressSpace {
 MemoryRegion iommu_ir_fault; /* Interrupt region for catching fault */
 IntelIOMMUState *iommu_state;
 VTDContextCacheEntry context_cache_entry;
+VTDPASIDCacheEntry pasid_cache_entry;
 QLIST_ENTRY(VTDAddressSpace) next;
 /* Superset of notifier flags that this address space has */
 IOMMUNotifierFlag notifier_flags;
diff --git a/hw/i386/intel_iommu.c b

[PATCH rfcv2 06/20] host_iommu_device: Define two new capabilities HOST_IOMMU_DEVICE_CAP_[NESTING|FS1GP]

2025-02-19 Thread Zhenzhong Duan
Signed-off-by: Zhenzhong Duan 
---
 include/system/host_iommu_device.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/system/host_iommu_device.h 
b/include/system/host_iommu_device.h
index df782598f2..18f8b5e5cf 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -22,10 +22,16 @@
  *
  * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents
  *   the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl)
+ *
+ * @nesting: nesting page table support.
+ *
+ * @fs1gp: first stage(a.k.a, Stage-1) 1GB huge page support.
  */
 typedef struct HostIOMMUDeviceCaps {
 uint32_t type;
 uint64_t hw_caps;
+bool nesting;
+bool fs1gp;
 } HostIOMMUDeviceCaps;
 
 #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
@@ -122,6 +128,8 @@ struct HostIOMMUDeviceClass {
  */
 #define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE0
 #define HOST_IOMMU_DEVICE_CAP_AW_BITS   1
+#define HOST_IOMMU_DEVICE_CAP_NESTING   2
+#define HOST_IOMMU_DEVICE_CAP_FS1GP 3
 
 #define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX   64
 #endif
-- 
2.34.1




[PATCH rfcv2 10/20] intel_iommu: Optimize context entry cache utilization

2025-02-19 Thread Zhenzhong Duan
There are many call sites referencing context entry by calling
vtd_dev_to_context_entry() which will traverse the DMAR table.

In most cases we can use the cached context entry in vtd_as->context_cache_entry,
unless it is stale. Currently only global and domain context invalidation
invalidates it.

So introduce a helper function vtd_as_to_context_entry() to fetch from cache
before trying with vtd_dev_to_context_entry().

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 36 +++-
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index df5fb30bc8..7709f55be5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1597,6 +1597,22 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, 
uint8_t bus_num,
 return 0;
 }
 
+static int vtd_as_to_context_entry(VTDAddressSpace *vtd_as, VTDContextEntry 
*ce)
+{
+IntelIOMMUState *s = vtd_as->iommu_state;
+uint8_t bus_num = pci_bus_num(vtd_as->bus);
+uint8_t devfn = vtd_as->devfn;
+VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry;
+
+/* Try to fetch context-entry from cache first */
+if (cc_entry->context_cache_gen == s->context_cache_gen) {
+*ce = cc_entry->context_entry;
+return 0;
+} else {
+return vtd_dev_to_context_entry(s, bus_num, devfn, ce);
+}
+}
+
 static int vtd_sync_shadow_page_hook(const IOMMUTLBEvent *event,
  void *private)
 {
@@ -1649,9 +1665,7 @@ static int vtd_address_space_sync(VTDAddressSpace *vtd_as)
 return 0;
 }
 
-ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
-   pci_bus_num(vtd_as->bus),
-   vtd_as->devfn, &ce);
+ret = vtd_as_to_context_entry(vtd_as, &ce);
 if (ret) {
 if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
 /*
@@ -1710,8 +1724,7 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as)
 assert(as);
 
 s = as->iommu_state;
-if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn,
- &ce)) {
+if (vtd_as_to_context_entry(as, &ce)) {
 /*
  * Possibly failed to parse the context entry for some reason
  * (e.g., during init, or any guest configuration errors on
@@ -2443,8 +2456,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 vtd_iommu_unlock(s);
 
 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
-if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
-  vtd_as->devfn, &ce) &&
+if (!vtd_as_to_context_entry(vtd_as, &ce) &&
 domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
 vtd_address_space_sync(vtd_as);
 }
@@ -2466,8 +2478,7 @@ static void 
vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
 hwaddr size = (1 << am) * VTD_PAGE_SIZE;
 
 QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
-ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
-   vtd_as->devfn, &ce);
+ret = vtd_as_to_context_entry(vtd_as, &ce);
 if (!ret && domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
 uint32_t rid2pasid = PCI_NO_PASID;
 
@@ -2974,8 +2985,7 @@ static void vtd_piotlb_pasid_invalidate(IntelIOMMUState 
*s,
 vtd_iommu_unlock(s);
 
 QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
-if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
-  vtd_as->devfn, &ce) &&
+if (!vtd_as_to_context_entry(vtd_as, &ce) &&
 domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
 uint32_t rid2pasid = VTD_CE_GET_RID2PASID(&ce);
 
@@ -4154,7 +4164,7 @@ static void vtd_report_ir_illegal_access(VTDAddressSpace 
*vtd_as,
 assert(vtd_as->pasid != PCI_NO_PASID);
 
 /* Try out best to fetch FPD, we can't do anything more */
-if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
+if (vtd_as_to_context_entry(vtd_as, &ce) == 0) {
 is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
 if (!is_fpd_set && s->root_scalable) {
 vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid);
@@ -4491,7 +4501,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, 
IOMMUNotifier *n)
 /* replay is protected by BQL, page walk will re-setup it safely */
 iova_tree_remove(vtd_as->iova_tree, map);
 
-if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
+if (vtd_as_to_context_entry(vtd_as, &ce) == 0) {
 trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
   "legacy mode",
   bus_n, PCI_SLOT(vtd_as->devfn),
-- 
2.34.1




[PATCH rfcv2 05/20] vfio/iommufd: Implement [at|de]tach_hwpt handlers

2025-02-19 Thread Zhenzhong Duan
Implement [at|de]tach_hwpt handlers in VFIO subsystem. vIOMMU
utilizes them to attach to or detach from hwpt on host side.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 53639bf88b..175c4fe1f4 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -802,6 +802,24 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap;
 };
 
+static bool
+host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+   uint32_t hwpt_id, Error **errp)
+{
+VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent;
+
+return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp);
+}
+
+static bool
+host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+   Error **errp)
+{
+VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent;
+
+return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp);
+}
+
 static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
   Error **errp)
 {
@@ -863,11 +881,15 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice 
*hiod)
 static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data)
 {
 HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc);
 
 hiodc->realize = hiod_iommufd_vfio_realize;
 hiodc->realize_late = hiod_iommufd_vfio_realize_late;
 hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges;
 hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask;
+
+idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt;
+idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH rfcv2 19/20] intel_iommu: Bypass replay in stage-1 page table mode

2025-02-19 Thread Zhenzhong Duan
VFIO utilizes replay to set up initial shadow IOMMU mappings.
But when a stage-1 page table is configured, it is passed to the
host to construct a nested page table, so no replay is needed.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 225e332132..e4b83cbe50 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -5743,6 +5743,14 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 VTDContextEntry ce;
 DMAMap map = { .iova = 0, .size = HWADDR_MAX };
 
+/*
+ * Replay on stage-1 page table is meaningless as stage-1 page table
+ * is passthroughed to host to construct nested page table
+ */
+if (s->flts && s->root_scalable) {
+return;
+}
+
 /* replay is protected by BQL, page walk will re-setup it safely */
 iova_tree_remove(vtd_as->iova_tree, map);
 
-- 
2.34.1




[PATCH rfcv2 17/20] intel_iommu: Propagate PASID-based iotlb invalidation to host

2025-02-19 Thread Zhenzhong Duan
From: Yi Liu 

This traps the guest PASID-based iotlb invalidation request and propagates it
to the host.

Intel VT-d 3.0 supports nested translation in PASID granular. Guest SVA support
could be implemented by configuring nested translation on specific PASID. This
is also known as dual stage DMA translation.

Under such configuration, guest owns the GVA->GPA translation which is
configured as stage-1 page table in host side for a specific pasid, and host
owns GPA->HPA translation. As guest owns stage-1 translation table, piotlb
invalidation should be propagated to host since host IOMMU will cache first
level page table related mappings during DMA address translation.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |   6 ++
 hw/i386/intel_iommu.c  | 116 -
 2 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 8f7be7f123..630394a8c3 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -589,6 +589,12 @@ typedef struct VTDPASIDCacheInfo {
 bool error_happened;
 } VTDPASIDCacheInfo;
 
+typedef struct VTDPIOTLBInvInfo {
+uint16_t domain_id;
+uint32_t pasid;
+struct iommu_hwpt_vtd_s1_invalidate *inv_data;
+} VTDPIOTLBInvInfo;
+
 /* PASID Table Related Definitions */
 #define VTD_PASID_DIR_BASE_ADDR_MASK  (~0xfffULL)
 #define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e7376ba6a7..8f7fb473f5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2938,12 +2938,108 @@ static int vtd_bind_guest_pasid(VTDAddressSpace 
*vtd_as,
 
 return ret;
 }
+
+/*
+ * Caller of this function should hold iommu_lock.
+ */
+static void vtd_invalidate_piotlb(VTDAddressSpace *vtd_as,
+  struct iommu_hwpt_vtd_s1_invalidate *cache)
+{
+VTDHostIOMMUDevice *vtd_hiod;
+HostIOMMUDeviceIOMMUFD *idev;
+VTDHwpt *hwpt = &vtd_as->hwpt;
+int devfn = vtd_as->devfn;
+struct vtd_as_key key = {
+.bus = vtd_as->bus,
+.devfn = devfn,
+};
+IntelIOMMUState *s = vtd_as->iommu_state;
+uint32_t entry_num = 1; /* Only implement one request for simplicity */
+
+if (!hwpt) {
+return;
+}
+
+vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hiod || !vtd_hiod->hiod) {
+return;
+}
+idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+
+if (iommufd_backend_invalidate_cache(idev->iommufd, hwpt->hwpt_id,
+ IOMMU_HWPT_INVALIDATE_DATA_VTD_S1,
+ sizeof(*cache), &entry_num, cache)) {
+error_report("Cache flush failed, entry_num %d", entry_num);
+}
+}
+
+/*
+ * This function is a loop function for the s->vtd_address_spaces
+ * list with VTDPIOTLBInvInfo as execution filter. It propagates
+ * the piotlb invalidation to host. Caller of this function
+ * should hold iommu_lock.
+ */
+static void vtd_flush_pasid_iotlb(gpointer key, gpointer value,
+  gpointer user_data)
+{
+VTDPIOTLBInvInfo *piotlb_info = user_data;
+VTDAddressSpace *vtd_as = value;
+VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
+uint32_t pasid;
+uint16_t did;
+
+/* Replay only fill pasid entry cache for passthrough device */
+if (!pc_entry->cache_filled ||
+!vtd_pe_pgtt_is_flt(&pc_entry->pasid_entry)) {
+return;
+}
+
+if (vtd_as_to_iommu_pasid(vtd_as, &pasid)) {
+return;
+}
+
+did = vtd_pe_get_domain_id(&pc_entry->pasid_entry);
+
+if (piotlb_info->domain_id == did && piotlb_info->pasid == pasid) {
+vtd_invalidate_piotlb(vtd_as, piotlb_info->inv_data);
+}
+}
+
+static void vtd_flush_pasid_iotlb_all(IntelIOMMUState *s,
+  uint16_t domain_id, uint32_t pasid,
+  hwaddr addr, uint64_t npages, bool ih)
+{
+struct iommu_hwpt_vtd_s1_invalidate cache_info = { 0 };
+VTDPIOTLBInvInfo piotlb_info;
+
+cache_info.addr = addr;
+cache_info.npages = npages;
+cache_info.flags = ih ? IOMMU_VTD_INV_FLAGS_LEAF : 0;
+
+piotlb_info.domain_id = domain_id;
+piotlb_info.pasid = pasid;
+piotlb_info.inv_data = &cache_info;
+
+/*
+ * Here loops all the vtd_as instances in s->vtd_address_spaces
+ * to find out the affected devices since piotlb invalidation
+ * should check pasid cache per architecture point of view.
+ */
+g_hash_table_foreach(s->vtd_address_spaces,
+ vtd_flush_pasid_iotlb, &piotlb_info);
+}
 #else
 static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
 VTDPASIDEntry *pe, VTDPASIDOp op)
 {
 return 0;
 }
+
+static void vtd_flush_pasid_iotlb_all(IntelIOMMUState *s,
+   

[PATCH rfcv2 02/20] vfio/iommufd: Add properties and handlers to TYPE_HOST_IOMMU_DEVICE_IOMMUFD

2025-02-19 Thread Zhenzhong Duan
Newly added properties include the IOMMUFD handle, devid and hwpt_id.
The IOMMUFD handle and devid are used to allocate/free ioas and hwpt.
hwpt_id is used to re-attach an IOMMUFD backed device to its default
VFIO sub-system created hwpt, i.e., when vIOMMU is disabled by the
guest. These properties are initialized in the .realize_late() handler.

Newly added handlers include [at|de]tach_hwpt. They are used to
attach/detach hwpt. VFIO and VDPA have different ways to attach
and detach, so the implementation will be in sub-classes instead of
HostIOMMUDeviceIOMMUFD.

Add two wrappers host_iommu_device_iommufd_[at|de]tach_hwpt to
wrap the two handlers.

This is a prerequisite patch for following ones.

Signed-off-by: Zhenzhong Duan 
---
 include/system/iommufd.h | 50 
 backends/iommufd.c   | 22 ++
 2 files changed, 72 insertions(+)

diff --git a/include/system/iommufd.h b/include/system/iommufd.h
index 5d02e9d148..a871601df5 100644
--- a/include/system/iommufd.h
+++ b/include/system/iommufd.h
@@ -66,4 +66,54 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, 
uint32_t hwpt_id,
  uint32_t *entry_num, void *data_ptr);
 
 #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
+OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass,
+HOST_IOMMU_DEVICE_IOMMUFD)
+
+/* Abstract of host IOMMU device with iommufd backend */
+struct HostIOMMUDeviceIOMMUFD {
+HostIOMMUDevice parent_obj;
+
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+uint32_t hwpt_id;
+};
+
+struct HostIOMMUDeviceIOMMUFDClass {
+HostIOMMUDeviceClass parent_class;
+
+/**
+ * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table.
+ * VFIO and VDPA device can have different implementation.
+ *
+ * Mandatory callback.
+ *
+ * @idev: host IOMMU device backed by IOMMUFD backend.
+ *
+ * @hwpt_id: ID of IOMMUFD hardware page table.
+ *
+ * @errp: pass an Error out when attachment fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id,
+Error **errp);
+/**
+ * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table.
+ * VFIO and VDPA device can have different implementation.
+ *
+ * Mandatory callback.
+ *
+ * @idev: host IOMMU device backed by IOMMUFD backend.
+ *
+ * @errp: pass an Error out when attachment fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp);
+};
+
+bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+   uint32_t hwpt_id, Error **errp);
+bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+   Error **errp);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index fc32aad5cb..574f330c27 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -341,6 +341,26 @@ int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, 
uint32_t hwpt_id,
 return ret;
 }
 
+bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+   uint32_t hwpt_id, Error **errp)
+{
+HostIOMMUDeviceIOMMUFDClass *idevc =
+HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev);
+
+g_assert(idevc->attach_hwpt);
+return idevc->attach_hwpt(idev, hwpt_id, errp);
+}
+
+bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+   Error **errp)
+{
+HostIOMMUDeviceIOMMUFDClass *idevc =
+HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev);
+
+g_assert(idevc->detach_hwpt);
+return idevc->detach_hwpt(idev, errp);
+}
+
 static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
 {
 HostIOMMUDeviceCaps *caps = &hiod->caps;
@@ -379,6 +399,8 @@ static const TypeInfo types[] = {
 }, {
 .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
 .parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HostIOMMUDeviceIOMMUFD),
+.class_size = sizeof(HostIOMMUDeviceIOMMUFDClass),
 .class_init = hiod_iommufd_class_init,
 .abstract = true,
 }
-- 
2.34.1




[PATCH rfcv2 04/20] vfio/iommufd: Implement HostIOMMUDeviceClass::realize_late() handler

2025-02-19 Thread Zhenzhong Duan
There are three iommufd related elements: the iommufd handle, devid and
hwpt_id. hwpt_id is ready only after VFIO device attachment. Device
id and iommufd handle are ready before attachment, but since they are
all iommufd related, initialize them together with hwpt_id.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index df61edffc0..53639bf88b 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -828,6 +828,19 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice 
*hiod, void *opaque,
 return true;
 }
 
+static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque,
+   Error **errp)
+{
+VFIODevice *vdev = opaque;
+HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
+
+idev->iommufd = vdev->iommufd;
+idev->devid = vdev->devid;
+idev->hwpt_id = vdev->hwpt->hwpt_id;
+
+return true;
+}
+
 static GList *
 hiod_iommufd_vfio_get_iova_ranges(HostIOMMUDevice *hiod)
 {
@@ -852,6 +865,7 @@ static void hiod_iommufd_vfio_class_init(ObjectClass *oc, 
void *data)
 HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
 
 hiodc->realize = hiod_iommufd_vfio_realize;
+hiodc->realize_late = hiod_iommufd_vfio_realize_late;
 hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges;
 hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask;
 };
-- 
2.34.1




[PATCH rfcv2 20/20] intel_iommu: Enable host device when x-flts=on in scalable mode

2025-02-19 Thread Zhenzhong Duan
Now that all the infrastructure for supporting passthrough devices
running with stage-1 translation is in place, enable it.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e4b83cbe50..908c28f9be 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -5583,8 +5583,7 @@ static bool vtd_check_hiod(IntelIOMMUState *s, 
VTDHostIOMMUDevice *vtd_hiod,
 }
 vtd_hiod->errata = ret;
 
-error_setg(errp, "host device is uncompatible with stage-1 translation");
-return false;
+return true;
 }
 
 static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
-- 
2.34.1




Re: [PATCH v7 RESEND 0/5] i386: Support SMP Cache Topology

2025-02-19 Thread Zhao Liu
Hi Paolo,

A gentle poke. I plan to add cache models for Intel CPUs and extend
this smp_cache interface after this series. :-)

(The 1st patch of general machine has been picked by Phili.)

Thanks,
Zhao

> > Alireza Sanaee (1):
> >   i386/cpu: add has_caches flag to check smp_cache configuration
> > 
> > Zhao Liu (4):
> >   hw/core/machine: Reject thread level cache
> >   i386/cpu: Support module level cache topology
> >   i386/cpu: Update cache topology with machine's configuration
> >   i386/pc: Support cache topology in -machine for PC machine
> > 
> >  hw/core/machine-smp.c |  9 ++
> >  hw/i386/pc.c  |  4 +++
> >  include/hw/boards.h   |  3 ++
> >  qemu-options.hx   | 30 +-
> >  target/i386/cpu.c | 71 ++-
> >  5 files changed, 115 insertions(+), 2 deletions(-)
> > 
> > -- 
> > 2.34.1
> > 
> 



Re: [PATCH] hw/timer/hpet: Detect invalid access to TN registers

2025-02-19 Thread Paolo Bonzini

On 2/18/25 10:07, Philippe Mathieu-Daudé wrote:

On 18/2/25 09:53, Paolo Bonzini wrote:

On 2/18/25 08:37, Zhao Liu wrote:

"addr & 0x18" ignores invalid address, so that the trace in default
branch (trace_hpet_ram_{read|write}_invalid()) doesn't work.

Mask addr by "0x1f & ~4", in which 0x1f means to get the complete TN
registers access and ~4 means to keep any invalid address offset.


I think this is less readable.

The reason to use !4 in the Rust code is because the initial AND is done
in a separate function, timer_and_addr().


Having a quick look at the model without looking at the specs:

include/hw/timer/hpet.h:20:#define HPET_LEN    0x400

hw/timer/hpet.c:439:static uint64_t hpet_ram_read(...,
hw/timer/hpet.c-441-{
hw/timer/hpet.c-448-    /*address range of all TN regs*/
hw/timer/hpet.c-449-    if (addr >= 0x100 && addr <= 0x3ff) {
hw/timer/hpet.c-450-    uint8_t timer_id = (addr - 0x100) / 0x20;
     ...
hw/timer/hpet.c-469-    } else {
hw/timer/hpet.c-470-    switch (addr & ~4) {
  ...
hw/timer/hpet.c-488-    }
hw/timer/hpet.c-489-    }
hw/timer/hpet.c-490-    return 0;
hw/timer/hpet.c-491-}

hw/timer/hpet.c:699:    memory_region_init_io(&s->iomem, obj,
   &hpet_ram_ops, s,
   "hpet", HPET_LEN);

I suppose we want to register multiple timers of I/O size 0x20 at 0x100,
and the I/O size of 0x20 at 0x000 is a generic control region.

Maybe split hpet_ram_ops in 2 (hpet_cfg_ops and hpet_tmr_ops), mapping
the first one once at 0x000 and the other 24 times at 0x100-0x3ff?


You would have to come up with a way to get the index though.  It seems 
to be adding churn for no particular reason.


I'd rather look into how to make decoding code *easy* without making 
everything MemoryRegions.  As I explained yesterday, while I'm not yet 
sure that Rust is going to stay in QEMU, I'd like to have as many 
examples as possible to help tilting the balance one way or the other. 
And indeed in the Rust version of HPET, timer_and_addr() could be 
extended to something like this:


// Start with the same "enum for registers" pattern that PL011 uses:
#[derive(qemu_api_macros::TryInto)]
#[repr(u64)]
enum TimerRegister {
CFG = 0,
CMP = 8,
ROUTE = 16,
}

#[derive(qemu_api_macros::TryInto)]
#[repr(u64)]
enum GlobalRegister {
CAP = 0,
CFG = 0x10,
INT_STATUS = 0x20,
COUNTER = 0xF0,
}

// Go one step further and define types for all possible outcomes:
#[derive(Copy)]
enum HPETRegister {
Timer(&BqlRefCell, TimerRegister),
Global(GlobalRegister),
Unknown(hwaddr),
}

struct HPETAddrDecode {
u32 shift,
u32 len,
HPETRegister reg,
}

fn decode(&self, addr: hwaddr, size: u32) -> HPETAddrDecode {
let shift = ((addr & 4) * 8) as u32;
let len = std::cmp::min(size * 8, 64 - shift);

addr &= !4;
let reg = if (0x100..=0x3ff).contains(&addr) {
let timer_id: usize = ((addr - 0x100) / 0x20) as usize;
TimerRegister::try_from(addr)
.map(|reg| HPETRegister::Timer(&self.timers[timer_id], reg))
} else {
GlobalRegister::try_from(addr)
.map(HPETRegister::Global)
}

// reg is now a Result
// convert the Err case into HPETRegister as well
let reg = reg.unwrap_or_else(HPETRegister::Unknown);
HPETAddrDecode { shift, len, reg }
}

(untested).  The read and write functions then can do something like

let val = match decoded.reg {
Timer(timer, reg) => timer.borrow_mut().read(decoded),
Global(GlobalRegister::CAP) => self.capability.get(),
Global(GlobalRegister::CFG) => self.config.get(),
...
}
val >> decoded.shift

and for write:

match decoded.reg {
Timer(timer, reg) => timer.borrow_mut().write(decoded, value),
Global(GlobalRegister::CAP) => {}, // read-only
Global(GlobalRegister::CFG) => self.set_cfg_reg(decoded, value),
...
}


The above could be a scheme that new devices could copy.  Overall I 
think it would be shorter code than what is there in rust/hw/timer/hpet 
(which is IMO already better than C, mind!).


The honest question for people with less experience is whether this is 
readable at all; whether seeing it helps you learn the language or 
discourages you.


Paolo


No clue what is between 0x020-0x0ff.

My 2 cents looking at QDev modelling to avoid these address
manipulations.

Regards,

Phil.







Re: [PATCH 28/42] qapi/parser: prohibit untagged sections between tagged sections

2025-02-19 Thread Markus Armbruster
John Snow  writes:

> On Wed, Feb 12, 2025 at 4:06 AM Markus Armbruster  wrote:
>
>> John Snow  writes:
>>
>> > This is being done primarily to ensure consistency between the source
>> > documents and the final, rendered HTML output. Because
>> > member/feature/returns sections will always appear in a visually grouped
>> > element in the HTML output, prohibiting free paragraphs between those
>> > sections ensures ordering consistency between source and the final
>> > render.
>> >
>> > Additionally, prohibiting such "middle" text paragraphs allows us to
>> > classify all plain text sections as either "intro" or "detail"
>> > sections, because these sections must either appear before structured
>> > elements ("intro") or afterwards ("detail").
>> >
>> > This keeps the inlining algorithm simpler with fewer "splice" points
>> > when inlining multiple documentation blocks.
>>
>> Mention the two "middle" paragraphs you have to eliminate in this patch?
>>
>
> OK; I will mention that this patch adjusts the source documentation but I
> won't go into detail on which. You can read the patch to find out easily
> enough.
>
>
>>
>> >
>> > Signed-off-by: John Snow 
>> > ---
>> >  qapi/net.json   |  4 ++--
>> >  qapi/qom.json   |  4 ++--
>> >  scripts/qapi/parser.py  | 16 
>> >  tests/qapi-schema/doc-good.json |  4 ++--
>> >  tests/qapi-schema/doc-good.out  |  4 ++--
>> >  tests/qapi-schema/doc-good.txt  |  8 
>> >  6 files changed, 28 insertions(+), 12 deletions(-)
>> >
>> > diff --git a/qapi/net.json b/qapi/net.json
>> > index 2739a2f4233..49bc7de64e9 100644
>> > --- a/qapi/net.json
>> > +++ b/qapi/net.json
>> > @@ -655,13 +655,13 @@
>> >  # this to zero disables this function.  This member is mutually
>> >  # exclusive with @reconnect.  (default: 0) (Since: 9.2)
>> >  #
>> > -# Only SocketAddress types 'unix', 'inet' and 'fd' are supported.
>> > -#
>> >  # Features:
>> >  #
>> >  # @deprecated: Member @reconnect is deprecated.  Use @reconnect-ms
>> >  # instead.
>> >  #
>> > +# Only SocketAddress types 'unix', 'inet' and 'fd' are supported.
>> > +#
>> >  # Since: 7.2
>> >  ##
>> >  { 'struct': 'NetdevStreamOptions',
>>
>> The text moved applies to member @addr.  You're moving it even farther
>> away from @addr.  Move it into @addr instead?  Could be done as a
>> separate cleanup patch to keep this one as simple as possible; matter of
>> taste.
>>
>
> Mmm, I was doing a mechanical hacksaw job here, admittedly. I can do a more
> tactful adjustment. I think it should be in this patch in order to preserve
> the context of precisely *why* it was juggled around, because I admit in
> this one case it is a slight downgrade.
>
> Moving it into @addr.
>
>
>>
>> The same text is in NetdevDgramOptions below, where it applies to both
>> @remote and @local.  It just happens to follow @remote and @local
>> immediately, because there are no other members and no features.  Hmm.
>>
>> Ideally, we'd have a way to put such notes next to the stuff they apply
>> to without having to rely on happy accidents like "no features".
>> Alternatively, have a way to link stuff and note.  Footnotes?  Food for
>> thought, not demand.
>>
>
> Yes, we discussed this at KVM Forum and I was dreaming of some complicated
> solution like "section-details: " or something that allows us to add
> amendments to documentation regions that aren't associated with any one
> particular member or feature, but can be inserted visually at that point.
>
> I know it's a capability you'd like to preserve, but I think we only use it
> once, so I'd be happy to push this off until a bit later and just suffer
> the indignity of slightly suboptimal documentation in one spot until then
> in exchange for the massive upgrade everywhere else.

If minor degradations are the price of major improvement, we pay.

> What would help a good deal is if you could brainstorm some source syntax
> that you think would be pleasant for the purpose, and then for my end I can
> worry about how to munge the docutils tree and HTML renderer to make it
> happen in some pleasing way.

Can we make do with just ReST?  Footnotes?  Best if we can control their
placement somehow.

> For now... "Figure out how to add notes or footnotes to the members section
> as a whole" added to the "for later" part of my tasklist?

Yes, please, with the understanding that this is *not* a blocker for
getting the new doc generator accepted.

>> > diff --git a/qapi/qom.json b/qapi/qom.json
>> > index 28ce24cd8d0..11277d1f84c 100644
>> > --- a/qapi/qom.json
>> > +++ b/qapi/qom.json
>> > @@ -195,12 +195,12 @@
>> >  #
>> >  # @typename: the type name of an object
>> >  #
>> > +# Returns: a list of ObjectPropertyInfo describing object properties
>> > +#
>> >  # .. note:: Objects can create properties at runtime, for example to
>> >  #describe links between different devices and/or objects.  These
>> >  #properties are not included in the output of

[PATCH rfcv2 01/20] backends/iommufd: Add helpers for invalidating user-managed HWPT

2025-02-19 Thread Zhenzhong Duan
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/system/iommufd.h |  3 +++
 backends/iommufd.c   | 30 ++
 backends/trace-events|  1 +
 3 files changed, 34 insertions(+)

diff --git a/include/system/iommufd.h b/include/system/iommufd.h
index cbab75bfbf..5d02e9d148 100644
--- a/include/system/iommufd.h
+++ b/include/system/iommufd.h
@@ -61,6 +61,9 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, 
uint32_t hwpt_id,
   uint64_t iova, ram_addr_t size,
   uint64_t page_size, uint64_t *data,
   Error **errp);
+int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id,
+ uint32_t data_type, uint32_t entry_len,
+ uint32_t *entry_num, void *data_ptr);
 
 #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index d57da44755..fc32aad5cb 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -311,6 +311,36 @@ bool iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
 return true;
 }
 
+int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id,
+ uint32_t data_type, uint32_t entry_len,
+ uint32_t *entry_num, void *data_ptr)
+{
+int ret, fd = be->fd;
+struct iommu_hwpt_invalidate cache = {
+.size = sizeof(cache),
+.hwpt_id = hwpt_id,
+.data_type = data_type,
+.entry_len = entry_len,
+.entry_num = *entry_num,
+.data_uptr = (uintptr_t)data_ptr,
+};
+
+ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache);
+
+trace_iommufd_backend_invalidate_cache(fd, hwpt_id, data_type, entry_len,
+   *entry_num, cache.entry_num,
+   (uintptr_t)data_ptr, ret);
+if (ret) {
+*entry_num = cache.entry_num;
+error_report("IOMMU_HWPT_INVALIDATE failed: %s", strerror(errno));
+ret = -errno;
+} else {
+g_assert(*entry_num == cache.entry_num);
+}
+
+return ret;
+}
+
 static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
 {
 HostIOMMUDeviceCaps *caps = &hiod->caps;
diff --git a/backends/trace-events b/backends/trace-events
index 40811a3162..5a23db6c8a 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -18,3 +18,4 @@ iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, 
uint32_t pt_id, uint32_
 iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d 
(%d)"
 iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) 
" iommufd=%d hwpt=%u enable=%d (%d)"
 iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, 
uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u 
iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)"
+iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t 
data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t 
data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u 
entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)"
-- 
2.34.1




[PATCH rfcv2 12/20] intel_iommu: Introduce a new structure VTDHostIOMMUDevice

2025-02-19 Thread Zhenzhong Duan
Introduce a new structure VTDHostIOMMUDevice which replaces
HostIOMMUDevice as the element stored in the hash table.

It includes a reference to HostIOMMUDevice and IntelIOMMUState,
and also includes BDF information which will be used in future
patches.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  7 +++
 include/hw/i386/intel_iommu.h  |  2 +-
 hw/i386/intel_iommu.c  | 14 --
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 2cda744786..18bc22fc72 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -28,6 +28,7 @@
 #ifndef HW_I386_INTEL_IOMMU_INTERNAL_H
 #define HW_I386_INTEL_IOMMU_INTERNAL_H
 #include "hw/i386/intel_iommu.h"
+#include "system/host_iommu_device.h"
 
 /*
  * Intel IOMMU register specification
@@ -608,4 +609,10 @@ typedef struct VTDRootEntry VTDRootEntry;
 /* Bits to decide the offset for each level */
 #define VTD_LEVEL_BITS   9
 
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+HostIOMMUDevice *hiod;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index e95477e855..50f9b27a45 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -295,7 +295,7 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
-GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
 
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9de60e607d..fafa199f52 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -281,7 +281,10 @@ static gboolean vtd_hiod_equal(gconstpointer v1, 
gconstpointer v2)
 
 static void vtd_hiod_destroy(gpointer v)
 {
-object_unref(v);
+VTDHostIOMMUDevice *vtd_hiod = v;
+
+object_unref(vtd_hiod->hiod);
+g_free(vtd_hiod);
 }
 
 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
@@ -4388,6 +4391,7 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
  HostIOMMUDevice *hiod, Error **errp)
 {
 IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hiod;
 struct vtd_as_key key = {
 .bus = bus,
 .devfn = devfn,
@@ -4404,6 +4408,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 return false;
 }
 
+vtd_hiod = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hiod->bus = bus;
+vtd_hiod->devfn = (uint8_t)devfn;
+vtd_hiod->iommu_state = s;
+vtd_hiod->hiod = hiod;
+
 if (!vtd_check_hiod(s, hiod, errp)) {
 vtd_iommu_unlock(s);
 return false;
@@ -4414,7 +4424,7 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 new_key->devfn = devfn;
 
 object_ref(hiod);
-g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod);
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hiod);
 
 vtd_iommu_unlock(s);
 
-- 
2.34.1




[PATCH rfcv2 03/20] HostIOMMUDevice: Introduce realize_late callback

2025-02-19 Thread Zhenzhong Duan
Currently we have realize() callback which is called before attachment.
But there are still some elements, e.g., hwpt_id, that are not ready before
attachment. So we need a realize_late() callback to further initialize
them.

Currently, this callback is only useful for iommufd backend. For legacy
backend nothing needs to be initialized after attachment.

Signed-off-by: Zhenzhong Duan 
---
 include/system/host_iommu_device.h | 17 +
 hw/vfio/common.c   | 17 ++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/include/system/host_iommu_device.h 
b/include/system/host_iommu_device.h
index 809cced4ba..df782598f2 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -66,6 +66,23 @@ struct HostIOMMUDeviceClass {
  * Returns: true on success, false on failure.
  */
 bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+/**
+ * @realize_late: initialize host IOMMU device instance after attachment,
+ *some elements e.g., ioas are ready only after attachment.
+ *This callback initialize them.
+ *
+ * Optional callback.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @opaque: pointer to agent device of this host IOMMU device,
+ *  e.g., VFIO base device or VDPA device.
+ *
+ * @errp: pass an Error out when realize fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+bool (*realize_late)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
 /**
  * @get_cap: check if a host IOMMU device capability is supported.
  *
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index abbdc56b6d..e198b1e5a2 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1550,6 +1550,7 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
 HostIOMMUDevice *hiod = NULL;
+HostIOMMUDeviceClass *hiod_ops = NULL;
 
 if (vbasedev->iommufd) {
 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
@@ -1560,16 +1561,26 @@ bool vfio_attach_device(char *name, VFIODevice 
*vbasedev,
 
 if (!vbasedev->mdev) {
 hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
+hiod_ops = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
 vbasedev->hiod = hiod;
 }
 
 if (!ops->attach_device(name, vbasedev, as, errp)) {
-object_unref(hiod);
-vbasedev->hiod = NULL;
-return false;
+goto err_attach;
+}
+
+if (hiod_ops && hiod_ops->realize_late &&
+!hiod_ops->realize_late(hiod, vbasedev, errp)) {
+ops->detach_device(vbasedev);
+goto err_attach;
 }
 
 return true;
+
+err_attach:
+object_unref(hiod);
+vbasedev->hiod = NULL;
+return false;
 }
 
 void vfio_detach_device(VFIODevice *vbasedev)
-- 
2.34.1




[PATCH rfcv2 09/20] intel_iommu: Rename vtd_ce_get_rid2pasid_entry to vtd_ce_get_pasid_entry

2025-02-19 Thread Zhenzhong Duan
In early days vtd_ce_get_rid2pasid_entry() was used to get the pasid entry of
rid2pasid, and was later extended to any pasid. So the new name
vtd_ce_get_pasid_entry is a better match for its function.

No functional change intended.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7fde0603bf..df5fb30bc8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -944,7 +944,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
 return 0;
 }
 
-static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
+static int vtd_ce_get_pasid_entry(IntelIOMMUState *s,
   VTDContextEntry *ce,
   VTDPASIDEntry *pe,
   uint32_t pasid)
@@ -1025,7 +1025,7 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
 VTDPASIDEntry pe;
 
 if (s->root_scalable) {
-vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
+vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
 if (s->flts) {
 return VTD_PE_GET_FL_LEVEL(&pe);
 } else {
@@ -1048,7 +1048,7 @@ static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
 VTDPASIDEntry pe;
 
 if (s->root_scalable) {
-vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
+vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
 return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
 }
 
@@ -1116,7 +1116,7 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState 
*s,
 VTDPASIDEntry pe;
 
 if (s->root_scalable) {
-vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
+vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
 if (s->flts) {
 return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
 } else {
@@ -1522,7 +1522,7 @@ static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
  * has valid rid2pasid setting, which includes valid
  * rid2pasid field and corresponding pasid entry setting
  */
-return vtd_ce_get_rid2pasid_entry(s, ce, &pe, PCI_NO_PASID);
+return vtd_ce_get_pasid_entry(s, ce, &pe, PCI_NO_PASID);
 }
 
 /* Map a device to its corresponding domain (context-entry) */
@@ -1611,7 +1611,7 @@ static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
 VTDPASIDEntry pe;
 
 if (s->root_scalable) {
-vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
+vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
 return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
 }
 
@@ -1687,7 +1687,7 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, 
VTDContextEntry *ce,
 int ret;
 
 if (s->root_scalable) {
-ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
+ret = vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
 if (ret) {
 /*
  * This error is guest triggerable. We should assumt PT
-- 
2.34.1




Re: [PATCH 29/42] qapi: Add "Details:" disambiguation marker

2025-02-19 Thread Markus Armbruster
John Snow  writes:

> On Mon, Feb 17, 2025 at 6:55 AM Markus Armbruster  wrote:
>
>> John Snow  writes:
>>
>> > This clarifies sections that are mistaken by the parser as "intro"
>> > sections to be "details" sections instead.
>> >
>> > Signed-off-by: John Snow 
>> > ---
>> >  qapi/machine.json  | 2 ++
>> >  qapi/migration.json| 4 
>> >  qapi/qom.json  | 4 
>> >  qapi/yank.json | 2 ++
>> >  scripts/qapi/parser.py | 8 
>> >  5 files changed, 20 insertions(+)
>>
>> Missing updates for the new syntax
>>
>> * Documentation: docs/devel/qapi-code-gen.rst
>>
>
>> * Positive test case(s): tests/qapi-schema/doc-good.json
>>
>> * Maybe a negative test case for _tag_check() failure
>>
>>
> Understood; I wasn't entirely sure if this concept would fly, so I saved
> the polish and you got an RFC quality patch. Forgive me, please! If you

As I wrote in review of PATCH 28, this is good strategy.

> think this approach is fine, I will certainly do all the things you
> outlined above.
>
>
>> [...]
>>
>> > diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
>> > index c5d2b950a82..5890a13b5ba 100644
>> > --- a/scripts/qapi/parser.py
>> > +++ b/scripts/qapi/parser.py
>> > @@ -544,6 +544,14 @@ def _tag_check(what: str) -> None:
>> >  raise QAPIParseError(
>> >  self, 'feature descriptions expected')
>> >  have_tagged = True
>> > +elif line == 'Details:':
>> > +_tag_check("Details")
>>
>> This one.
>>
>
> ACK
>
>
>>
>> > +self.accept(False)
>> > +line = self.get_doc_line()
>> > +while line == '':
>> > +self.accept(False)
>> > +line = self.get_doc_line()
>> > +have_tagged = True
>> >  elif match := self._match_at_name_colon(line):
>> >  # description
>> >  if have_tagged:
>>
>>




[PATCH 1/1] qapi/char.json: minor doc rewording for `hub` device

2025-02-19 Thread Roman Penyaev
Refine documentation for the hub device, specify the maximum.

Signed-off-by: Roman Penyaev 
Cc: Marc-André Lureau 
Cc: Markus Armbruster 
Cc: qemu-devel@nongnu.org
---
 qapi/char.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/char.json b/qapi/char.json
index f02b66c06b3e..dde2f9538f81 100644
--- a/qapi/char.json
+++ b/qapi/char.json
@@ -337,7 +337,7 @@
 #
 # Configuration info for hub chardevs.
 #
-# @chardevs: List of chardev IDs, which should be added to this hub
+# @chardevs: IDs to be added to this hub (maximum 4 devices).
 #
 # Since: 10.0
 ##
-- 
2.43.0




Re: [PATCH v9 2/4] chardev/char-hub: implement backend chardev aggregator

2025-02-19 Thread Roman Penyaev
On Tue, Feb 18, 2025 at 8:57 AM Markus Armbruster  wrote:
>
> Just realized this has been committed already.  I'm not complaining, I
> was late.  Address my doc nits in a followup patch?

Sent. Please take a look.

--
Roman



Re: [PATCH 1/2] migration: Prioritize RDMA in ram_save_target_page()

2025-02-19 Thread Zhijian Li (Fujitsu)


On 19/02/2025 06:03, Peter Xu wrote:
> On Tue, Feb 18, 2025 at 05:30:40PM -0300, Fabiano Rosas wrote:
>> Li Zhijian via  writes:
>>
>>> Address an error in RDMA-based migration by ensuring RDMA is prioritized
>>> when saving pages in `ram_save_target_page()`.
>>>
>>> Previously, the RDMA protocol's page-saving step was placed after other
>>> protocols due to a refactoring in commit bc38dc2f5f3. This led to migration
>>> failures characterized by unknown control messages and state loading errors
>>> destination:
>>> (qemu) qemu-system-x86_64: Unknown control message QEMU FILE
>>> qemu-system-x86_64: error while loading state section id 1(ram)
>>> qemu-system-x86_64: load of migration failed: Operation not permitted
>>> source:
>>> (qemu) qemu-system-x86_64: RDMA is in an error state waiting migration to 
>>> abort!
>>> qemu-system-x86_64: failed to save SaveStateEntry with id(name): 1(ram): -1
>>> qemu-system-x86_64: rdma migration: recv polling control error!
>>> qemu-system-x86_64: warning: Early error. Sending error.
>>> qemu-system-x86_64: warning: rdma migration: send polling control error
>>>
>>> RDMA migration implemented its own protocol/method to send pages to
>>> destination side, hand over to RDMA first to prevent pages being saved by
>>> other protocol.
>>>
>>> Fixes: bc38dc2f5f3 ("migration: refactor ram_save_target_page functions")
>>> Signed-off-by: Li Zhijian 
>>> ---
>>>   migration/ram.c | 9 +
>>>   1 file changed, 5 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/migration/ram.c b/migration/ram.c
>>> index 6f460fd22d2..635a2fe443a 100644
>>> --- a/migration/ram.c
>>> +++ b/migration/ram.c
>>> @@ -1964,6 +1964,11 @@ static int ram_save_target_page(RAMState *rs, 
>>> PageSearchStatus *pss)
>>>   ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
>>>   int res;
>>>   
>>> +/* Hand over to RDMA first */
>>> +if (control_save_page(pss, offset, &res)) {
>>> +return res;
>>> +}
>>> +
>>
>> Can we hoist that migrate_rdma() from inside the function? Since the
>> other paths already check first before calling their functions.
> 

Yeah, it sounds good to me.


> If we're talking about hoist and stuff.. and if we want to go slightly
> further, I wonder if we could also drop RAM_SAVE_CONTROL_NOT_SUPP.
> 
>  if (!migrate_rdma() || migration_in_postcopy()) {
>  return RAM_SAVE_CONTROL_NOT_SUPP;
>  }
> 
> We should make sure rdma_control_save_page() won't get invoked at all in
> either case above..  

> For postcopy, maybe we could fail in the QMP migrate /
> migrate_incoming cmd, at migration_channels_and_transport_compatible()

I tried to kill RAM_SAVE_CONTROL_NOT_SUPP, but it seems it doesn't need to 
touch any postcopy logic
"in the QMP migrate / migrate_incoming cmd, at 
migration_channels_and_transport_compatible()"

Is there something I might have overlooked?

A whole draft diff would be like below:
It includes 3 parts:

migration/rdma: Remove unnecessary RAM_SAVE_CONTROL_NOT_SUPP check in 
rdma_control_save_page()
migration: kill RAM_SAVE_CONTROL_NOT_SUPP
migration: open control_save_page() to ram_save_target_page()

diff --git a/migration/ram.c b/migration/ram.c
index 589b6505eb2..fc6a964fd64 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1143,32 +1143,6 @@ static int save_zero_page(RAMState *rs, PageSearchStatus 
*pss,
  return len;
  }
  
-/*
- * @pages: the number of pages written by the control path,
- *< 0 - error
- *> 0 - number of pages written
- *
- * Return true if the pages has been saved, otherwise false is returned.
- */
-static bool control_save_page(PageSearchStatus *pss,
-  ram_addr_t offset, int *pages)
-{
-int ret;
-
-ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
- TARGET_PAGE_SIZE);
-if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
-return false;
-}
-
-if (ret == RAM_SAVE_CONTROL_DELAYED) {
-*pages = 1;
-return true;
-}
-*pages = ret;
-return true;
-}
-
  /*
   * directly send the page to the stream
   *
@@ -1964,6 +1938,16 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
  ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
  int res;
  
+if (migrate_rdma() && !migration_in_postcopy()) {
+res = rdma_control_save_page(pss->pss_channel, pss->block->offset,
+ offset, TARGET_PAGE_SIZE);
+
+if (res == RAM_SAVE_CONTROL_DELAYED) {
+res = 1;
+}
+return res;
+}
+
  if (!migrate_multifd()
  || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
  if (save_zero_page(rs, pss, offset)) {
@@ -1976,10 +1960,6 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
  return ram_save_multifd_page(block, offset);
  }
  }
  
-if (control_save_page(pss, offset, &res)) 

[PATCH] linux-user: fix resource leaks in gen-vdso

2025-02-19 Thread Daniel P . Berrangé
There are a number of resource leaks in gen-vdso. In theory they are
harmless because this is a short lived process, but when building QEMU
with --extra-cflags="-fsanitize=address" problems ensure. The gen-vdso
program is run as part of the build, and that aborts due to the
sanitizer identifying memory leaks, leaving QEMU unbuildable.

FAILED: libqemu-x86_64-linux-user.a.p/vdso.c.inc
/var/home/berrange/src/virt/qemu/build/linux-user/gen-vdso -o 
libqemu-x86_64-linux-user.a.p/vdso.c.inc ../linux-user/x86_64/vdso.so

=
==1696332==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 2968 byte(s) in 1 object(s) allocated from:
#0 0x56495873f1f3  
(/var/home/berrange/src/virt/qemu/build/linux-user/gen-vdso+0xa11f3) (BuildId: 
b69e241ad44719b6f3934f3c71dfc6727e8bdb12)
#1 0x564958780b90  
(/var/home/berrange/src/virt/qemu/build/linux-user/gen-vdso+0xe2b90) (BuildId: 
b69e241ad44719b6f3934f3c71dfc6727e8bdb12)

This complaint is about the 'buf' variable, however, the FILE objects
are also leaked in some error scenarios, so this fix refactors the
cleanup paths to fix all leaks. For completeness it also reports an
error if fclose() fails on 'inf'.

Signed-off-by: Daniel P. Berrangé 
---
 linux-user/gen-vdso.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/linux-user/gen-vdso.c b/linux-user/gen-vdso.c
index 721f38d5a3..88d94b19eb 100644
--- a/linux-user/gen-vdso.c
+++ b/linux-user/gen-vdso.c
@@ -56,13 +56,14 @@ static unsigned rt_sigreturn_addr;
 
 int main(int argc, char **argv)
 {
-FILE *inf, *outf;
+FILE *inf = NULL, *outf = NULL;
 long total_len;
 const char *prefix = "vdso";
 const char *inf_name;
 const char *outf_name = NULL;
-unsigned char *buf;
+unsigned char *buf = NULL;
 bool need_bswap;
+int ret = EXIT_FAILURE;
 
 while (1) {
 int opt = getopt(argc, argv, "o:p:r:s:");
@@ -129,7 +130,6 @@ int main(int argc, char **argv)
 fprintf(stderr, "%s: incomplete read\n", inf_name);
 return EXIT_FAILURE;
 }
-fclose(inf);
 
 /*
  * Identify which elf flavor we're processing.
@@ -205,19 +205,24 @@ int main(int argc, char **argv)
 fprintf(outf, ".rt_sigreturn_ofs = 0x%x,\n", rt_sigreturn_addr);
 fprintf(outf, "};\n");
 
-/*
- * Everything should have gone well.
- */
-if (fclose(outf)) {
-goto perror_outf;
-}
-return EXIT_SUCCESS;
+ret = EXIT_SUCCESS;
+
+ cleanup:
+free(buf);
+
+if (outf &&
+fclose(outf) != 0)
+ret = EXIT_FAILURE;
+if (inf &&
+fclose(inf) != 0)
+ret = EXIT_FAILURE;
+return ret;
 
  perror_inf:
 perror(inf_name);
-return EXIT_FAILURE;
+goto cleanup;
 
  perror_outf:
 perror(outf_name);
-return EXIT_FAILURE;
+goto cleanup;
 }
-- 
2.47.1




Re: [PATCH v7 08/52] i386/tdx: Initialize TDX before creating TD vcpus

2025-02-19 Thread Francesco Lavra
On Fri, 2025-01-24 at 08:20 -0500, Xiaoyao Li wrote:
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 45867dbe0839..e35a9fbd687e 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -540,8 +540,15 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  
>  trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
>  
> +    /*
> + * tdx_pre_create_vcpu() may call cpu_x86_cpuid(). It in turn
> may call
> + * kvm_vm_ioctl(). Set cpu->kvm_state in advance to avoid NULL
> pointer
> + * dereference.
> + */
> +    cpu->kvm_state = s;

This assignment should be removed from kvm_create_vcpu(), as now it's
redundant there.

>  ret = kvm_arch_pre_create_vcpu(cpu, errp);
>  if (ret < 0) {
> +    cpu->kvm_state = NULL;

No need to reset cpu->kvm_state to NULL, there already are other error
conditions under which cpu->kvm_state remains initialized.

>  goto err;
>  }
>  
> @@ -550,6 +557,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  error_setg_errno(errp, -ret,
>   "kvm_init_vcpu: kvm_create_vcpu failed
> (%lu)",
>   kvm_arch_vcpu_id(cpu));
> +    cpu->kvm_state = NULL;

Same here.


Permissively Licensing the CPU Component

2025-02-19 Thread Faisal Al-Humaimidi
Hello QEMU developers,

I understand from this page, https://wiki.qemu.org/License, that TCG is
being licensed permissively (BSD license) so it can be integrated as a
library in other projects, which is great! However, I'd like to know if the
CPU part of QEMU, no peripherals included, is also permissively licensed
(maybe BSD or some other permissive license such as LGPL, ...etc.)?

The reason I am asking this question is because projects such as the
Unicorn Engine would greatly benefit from a permissively licensed CPU
component so that the project itself (Unicorn Engine) is also released
under a permissive license, which would be great for people wanting to do
research with the Unicorn Engine but are tied with proprietary code (e.g.,
in proprietary university research projects). I have started a discussion
for that matter with the Unicorn Engine developers in their GitHub page,
https://github.com/unicorn-engine/unicorn/issues/2114, and it would be
great to have a feedback regarding this matter from the official QEMU
developers, whether directly on the issue or a reply to this email and I'd
relay your response.


Kind regards,
Faisal Al-Humaimidi


Re: Permissively Licensing the CPU Component

2025-02-19 Thread Daniel P . Berrangé
On Wed, Feb 19, 2025 at 01:59:08AM -0800, Faisal Al-Humaimidi wrote:
> Hello QEMU developers,
> 
> I understand from this page, https://wiki.qemu.org/License, that TCG is
> being licensed permissively (BSD license) so it can be integrated as a
> library in other projects, which is great!

Not so fast. Individual source files may be under the BSD license, but
those source files are rarely buildable & usable in isolation. They will
consume APIs in other parts of QEMU which are under the GPL license, and
thus the combined work will be under the GPL per that wiki License page
above.

IOW, if you wanted to takes pieces which are BSD license and use them
exclusively under BSD in a combined work, you would need to re-implement
any other code it depends on which was not also BSD licensed. This is
unlikely to be a sensible investment of time IMHO.

> However, I'd like to know if the
> CPU part of QEMU, no peripherals included, is also permissively licensed
> (maybe BSD or some other permissive license such as LGPL, ...etc.)?

Again, QEMU as a whole is under the GPL-v2-only, because the process of
building QEMU into a functional binary pulls together code under many
licenses with GPL-v2-only being the one that sets the overall terms in
QEMU's case. 

> The reason I am asking this question is because projects such as the
> Unicorn Engine would greatly benefit from a permissively licensed CPU
> component so that the project itself (Unicorn Engine) is also released
> under a permissive license, which would be great for people wanting to do
> research with the Unicorn Engine but are tied with proprietary code (e.g.,
> in proprietary university research projects). I have started a discussion
> for that matter with the Unicorn Engine developers in their GitHub page,
> https://github.com/unicorn-engine/unicorn/issues/2114, and it would be
> great to have a feedback regarding this matter from the official QEMU
> developers, whether directly on the issue or a reply to this email and I'd
> relay your response.

Since AFAICT unicorn engine has copied in the entire of the QEMU source
tree, and many of the BSD bits of QEMU will consume other GPL code, I
don't see a possibility to change. Individual source files can be under
a variety of licenses, but the combined work will inevitably be under
the GPL-v2-only.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH] meson: Display summary of Darwin libraries detected

2025-02-19 Thread Paolo Bonzini
On Tue, Feb 18, 2025 at 3:55 PM Phil Dennis-Jordan  wrote:
> It is not cross-architecture. So, the PVG guest drivers with x86-64 macOS 
> don't
> give useful results with the aarch64 macOS host PVG framework. (I suspect a
> mismatch in texture memory layout and perhaps some other data format issues
> Apple's GPUs use a "swizzled" memory layout, whereas Intel/AMD/NVIDIA's do 
> not.)

Thanks, that helps.

> In summary, hw/display/apple-gfx-mmio.m has a hard dependency on aarch64 
> *host*
> systems, and both apple-gfx device variants only make sense when host arch 
> matches
> guest arch. (Unless you're a developer attempting to find a workaround to the
> incompatibility.)  I'm very much not a Meson expert, so this was my best 
> attempt at
> encoding these facts in the build system. (And nobody suggested anything 
> better during
> review.)

I'll find a way to do that, thanks. I also would prefer to add
--enable/--disable-pvg; I
can take care of that, but I just need to know...

>> Either way, the Kconfig file does not need "depends on AARCH64" and it
>> should have just
>>
>>  depends on MAC_PVG
>>
>> with an "imply" somewhere in hw/arm/Kconfig.

... which boards should enable MAC_PVG_MMIO? Is it only VIRT, or something
else?

Paolo




Re: [PATCH] meson: Display summary of Darwin libraries detected

2025-02-19 Thread Phil Dennis-Jordan
On Wed, 19 Feb 2025 at 11:28, Paolo Bonzini  wrote:

> On Tue, Feb 18, 2025 at 3:55 PM Phil Dennis-Jordan 
> wrote:
> > It is not cross-architecture. So, the PVG guest drivers with x86-64
> macOS don't
> > give useful results with the aarch64 macOS host PVG framework. (I
> suspect a
> > mismatch in texture memory layout and perhaps some other data format
> issues
> > Apple's GPUs use a "swizzled" memory layout, whereas Intel/AMD/NVIDIA's
> do not.)
>
> Thanks, that helps.
>
> > In summary, hw/display/apple-gfx-mmio.m has a hard dependency on aarch64
> *host*
> > systems, and both apple-gfx device variants only make sense when host
> arch matches
> > guest arch. (Unless you're a developer attempting to find a workaround
> to the
> > incompatibility.)  I'm very much not a Meson expert, so this was my best
> attempt at
> > encoding these facts in the build system. (And nobody suggested anything
> better during
> > review.)
>
> I'll find a way to do that, thanks. I also would prefer to add
> --enable/--disable-pvg; I
> can take care of that, but I just need to know...


>> Either way, the Kconfig file does not need "depends on AARCH64" and it
> >> should have just
> >>
> >>  depends on MAC_PVG
> >>
> >> with an "imply" somewhere in hw/arm/Kconfig.
>
> ... which boards should enable MAC_PVG_MMIO? Is it only VIRT, or something
> else?


I doubt anyone will be able to use it productively with virt, but I am
regularly surprised by people's creativity.

The intended target machine is vmapple, which is the only known way to run
aarch64 macOS guests. The PVG patches were part of that series, the PCI
variant is useful independently of it though; Philippe said at one point
early Jan/late Dec he was going to merge the rest of the patch series
containing that machine type, but there were some question marks about the
software GICv3 dependency and HVF I believe. I'll try to rebase that series
and re-post it in the next few days - I've been too busy with another
project to stay on top of chasing that down, but I'd really like to get it
done for 10.0.

Phil


Re: [PATCH] meson: Display summary of Darwin libraries detected

2025-02-19 Thread Paolo Bonzini
On Wed, Feb 19, 2025 at 11:36 AM Phil Dennis-Jordan  wrote:
>> ... which boards should enable MAC_PVG_MMIO? Is it only VIRT, or something
>> else?
>
> I doubt anyone will be able to use it productively with virt, but I am 
> regularly surprised by people's creativity.

Ah okay, so for now it's effectively dead code.

> The intended target machine is vmapple, which is the only known way
> to run aarch64 macOS guests. [...]
> I'll try to rebase that series and re-post it in the next few days

Thanks, I'll keep an eye on that.

Paolo




Re: [PATCH v7 16/52] i386/tdvf: Introduce function to parse TDVF metadata

2025-02-19 Thread Francesco Lavra
On Fri, 2025-01-24 at 08:20 -0500, Xiaoyao Li wrote:
> +int tdvf_parse_metadata(TdxFirmware *fw, void *flash_ptr, int size)
> +{
> +    g_autofree TdvfSectionEntry *sections = NULL;
> +    TdvfMetadata *metadata;
> +    ssize_t entries_size;
> +    int i;
> +
> +    metadata = tdvf_get_metadata(flash_ptr, size);
> +    if (!metadata) {
> +    return -EINVAL;
> +    }
> +
> +    /* load and parse metadata entries */
> +    fw->nr_entries = le32_to_cpu(metadata->NumberOfSectionEntries);
> +    if (fw->nr_entries < 2) {
> +    error_report("Invalid number of fw entries (%u) in TDVF
> Metadata",
> + fw->nr_entries);
> +    return -EINVAL;
> +    }
> +
> +    entries_size = fw->nr_entries * sizeof(TdvfSectionEntry);
> +    if (metadata->Length != sizeof(*metadata) + entries_size) {
> +    error_report("TDVF metadata len (0x%x) mismatch, expected
> (0x%x)",
> + metadata->Length,
> + (uint32_t)(sizeof(*metadata) + entries_size));
> +    return -EINVAL;
> +    }
> +
> +    fw->entries = g_new(TdxFirmwareEntry, fw->nr_entries);
> +    sections = g_new(TdvfSectionEntry, fw->nr_entries);
> +
> +    if (!memcpy(sections, (void *)metadata + sizeof(*metadata),
> entries_size)) {
> +    error_report("Failed to read TDVF section entries");

memcpy() cannot fail...



Re: [PATCH 1/4] hw/riscv/virt: KVM AIA refinement

2025-02-19 Thread Yong-Xuan Wang
Hi Daniel,


On Tue, Feb 18, 2025 at 3:24 AM Daniel Henrique Barboza
 wrote:
>
>
>
> On 2/17/25 5:19 AM, Yong-Xuan Wang wrote:
> > KVM AIA is only needed to be set when the virt machine use the AIA MSI.
> > So we can move the KVM AIA configuration into virt_create_aia() to reduce
> > the condition checking.
> >
> > Signed-off-by: Yong-Xuan Wang 
> > ---
>
> Unfortunately this doesn't work.
>
> The reason is that kvm_riscv_aia_create(), as it is now, is called only once
> during virt_machine_init() and it's already handling initialization for each 
> socket:
>
>
>  for (socket = 0; socket < socket_count; socket++) {
>  socket_imsic_base = imsic_base + socket * (1U << group_shift);
>  hart_count = riscv_socket_hart_count(machine, socket);
>  base_hart = riscv_socket_first_hartid(machine, socket);
>
>  if (max_hart_per_socket < hart_count) {
>  max_hart_per_socket = hart_count;
>  }
>
>  for (i = 0; i < hart_count; i++) {
>  imsic_addr = socket_imsic_base + i * IMSIC_HART_SIZE(guest_bits);
>  ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_ADDR,
>  KVM_DEV_RISCV_AIA_ADDR_IMSIC(i + 
> base_hart),
>  &imsic_addr, true, NULL);
>  if (ret < 0) {
>  error_report("KVM AIA: failed to set the IMSIC address for 
> hart %d", i);
>  exit(1);
>  }
>  }
>  }
>
> After this change, kvm_riscv_aia_create() is being called once for each 
> socket since it's
> now being called inside virt_create_aia(). And this will cause errors when 
> running qemu-kvm
> with more than one socket:
>
> ./qemu-system-riscv64 \
> -machine virt,accel=kvm,aia=aplic-imsic -m 2G \
> -object memory-backend-ram,size=1G,id=m0 \
> -object memory-backend-ram,size=1G,id=m1 \
> -smp 2,sockets=2,cores=1,threads=1 \
> -numa node,memdev=m0,cpus=0,nodeid=0 \
> -numa node,memdev=m1,cpus=1,nodeid=1 \
>  (...)
> qemu-system-riscv64: KVM AIA: failed to set the IMSIC address for hart 0
>

Oh I forgot to test the NUMA config. Sorry.

>
> To make this patch work we would need changes in kvm_riscv_aia_create() to 
> handle just the
> current socket. The loop I mentioned above is one place, and there's another 
> place where
> we set group_bits and group_shift if socket_count > 1.
>

Also we need to find a place to initialize the in-kernel AIA after
setting up all the IMSICs among sockets. This would make things more
complicated. I will remove this patch in the next version. Thank you!

Regards,
Yong-Xuan


> To be honest I'm not sure if all these extra required changes are worth the 
> simplification
> this patch is proposing.
>
>
> Thanks,
>
> Daniel
>
>
>
>
>
> >   hw/riscv/virt.c | 79 +++--
> >   1 file changed, 37 insertions(+), 42 deletions(-)
> >
> > diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> > index dae46f4733cd..a52117ef71ee 100644
> > --- a/hw/riscv/virt.c
> > +++ b/hw/riscv/virt.c
> > @@ -58,14 +58,6 @@
> >   #include "qapi/qapi-visit-common.h"
> >   #include "hw/virtio/virtio-iommu.h"
> >
> > -/* KVM AIA only supports APLIC MSI. APLIC Wired is always emulated by 
> > QEMU. */
> > -static bool virt_use_kvm_aia_aplic_imsic(RISCVVirtAIAType aia_type)
> > -{
> > -bool msimode = aia_type == VIRT_AIA_TYPE_APLIC_IMSIC;
> > -
> > -return riscv_is_kvm_aia_aplic_imsic(msimode);
> > -}
> > -
> >   static bool virt_use_emulated_aplic(RISCVVirtAIAType aia_type)
> >   {
> >   bool msimode = aia_type == VIRT_AIA_TYPE_APLIC_IMSIC;
> > @@ -1298,10 +1290,12 @@ static DeviceState *virt_create_plic(const 
> > MemMapEntry *memmap, int socket,
> >   return ret;
> >   }
> >
> > -static DeviceState *virt_create_aia(RISCVVirtAIAType aia_type, int 
> > aia_guests,
> > +static DeviceState *virt_create_aia(RISCVVirtState *s,
> >   const MemMapEntry *memmap, int socket,
> >   int base_hartid, int hart_count)
> >   {
> > +RISCVVirtAIAType aia_type = s->aia_type;
> > +int aia_guests = s->aia_guests;
> >   int i;
> >   hwaddr addr = 0;
> >   uint32_t guest_bits;
> > @@ -1309,6 +1303,28 @@ static DeviceState *virt_create_aia(RISCVVirtAIAType 
> > aia_type, int aia_guests,
> >   DeviceState *aplic_m = NULL;
> >   bool msimode = aia_type == VIRT_AIA_TYPE_APLIC_IMSIC;
> >
> > +if (!kvm_enabled()) {
> > +/* Per-socket M-level APLIC */
> > +aplic_m = riscv_aplic_create(memmap[VIRT_APLIC_M].base +
> > + socket * memmap[VIRT_APLIC_M].size,
> > + memmap[VIRT_APLIC_M].size,
> > + (msimode) ? 0 : base_hartid,
> > + (msimode) ? 0 : hart_count,
> > + VIRT_IR

Re: [PATCH v7 19/52] i386/tdx: Track mem_ptr for each firmware entry of TDVF

2025-02-19 Thread Francesco Lavra
On Fri, 2025-01-24 at 08:20 -0500, Xiaoyao Li wrote:
> +static void tdx_finalize_vm(Notifier *notifier, void *unused)
> +{
> +    TdxFirmware *tdvf = &tdx_guest->tdvf;
> +    TdxFirmwareEntry *entry;
> +
> +    for_each_tdx_fw_entry(tdvf, entry) {
> +    switch (entry->type) {
> +    case TDVF_SECTION_TYPE_BFV:
> +    case TDVF_SECTION_TYPE_CFV:
> +    entry->mem_ptr = tdvf->mem_ptr + entry->data_offset;
> +    break;
> +    case TDVF_SECTION_TYPE_TD_HOB:
> +    case TDVF_SECTION_TYPE_TEMP_MEM:
> +    entry->mem_ptr = qemu_ram_mmap(-1, entry->size,
> +  
> qemu_real_host_page_size(), 0, 0);
> +    break;

Should check for MAP_FAILED return value.

> +    default:
> +    error_report("Unsupported TDVF section %d", entry-
> >type);
> +    exit(1);

Section entry types have already been checked against valid types in
tdvf_parse_and_check_section_entry(), no need to check them again here.


Re: [PATCH 7/8] target/riscv/kvm: rename riscv-aia to riscv-imsic

2025-02-19 Thread Yong-Xuan Wang
Hi Andrew,

On Mon, Feb 17, 2025 at 10:07 PM Andrew Jones  wrote:
>
> On Mon, Feb 17, 2025 at 04:17:27PM +0800, Yong-Xuan Wang wrote:
> > The riscv-aia property only controls the in-kernel IMSIC mode, the
> > emulation of AIA MSI mode is controlled by the kernel-irqchip property.
> > Rename the riscv-aia property to riscv-imsic to prevent the confusion.
> >
> > Signed-off-by: Yong-Xuan Wang 
> > ---
> >  target/riscv/kvm/kvm-cpu.c | 52 --
> >  1 file changed, 27 insertions(+), 25 deletions(-)
> >
> > diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c
> > index c047d5f36951..ab53b76ab81f 100644
> > --- a/target/riscv/kvm/kvm-cpu.c
> > +++ b/target/riscv/kvm/kvm-cpu.c
> > @@ -1798,9 +1798,9 @@ void kvm_riscv_set_irq(RISCVCPU *cpu, int irq, int 
> > level)
> >  }
> >  }
> >
> > -static int aia_mode;
> > +static int imsic_mode;
> >
> > -static const char *kvm_aia_mode_str(uint64_t mode)
> > +static const char *kvm_imsic_mode_str(uint64_t mode)
> >  {
> >  switch (mode) {
> >  case KVM_DEV_RISCV_AIA_MODE_EMUL:
> > @@ -1813,19 +1813,19 @@ static const char *kvm_aia_mode_str(uint64_t mode)
> >  };
> >  }
> >
> > -static char *riscv_get_kvm_aia(Object *obj, Error **errp)
> > +static char *riscv_get_kvm_imsic(Object *obj, Error **errp)
> >  {
> > -return g_strdup(kvm_aia_mode_str(aia_mode));
> > +return g_strdup(kvm_imsic_mode_str(imsic_mode));
> >  }
> >
> > -static void riscv_set_kvm_aia(Object *obj, const char *val, Error **errp)
> > +static void riscv_set_kvm_imsic(Object *obj, const char *val, Error **errp)
> >  {
> >  if (!strcmp(val, "emul")) {
> > -aia_mode = KVM_DEV_RISCV_AIA_MODE_EMUL;
> > +imsic_mode = KVM_DEV_RISCV_AIA_MODE_EMUL;
> >  } else if (!strcmp(val, "hwaccel")) {
> > -aia_mode = KVM_DEV_RISCV_AIA_MODE_HWACCEL;
> > +imsic_mode = KVM_DEV_RISCV_AIA_MODE_HWACCEL;
> >  } else if (!strcmp(val, "auto")) {
> > -aia_mode = KVM_DEV_RISCV_AIA_MODE_AUTO;
> > +imsic_mode = KVM_DEV_RISCV_AIA_MODE_AUTO;
> >  } else {
> >  error_setg(errp, "Invalid KVM AIA mode");
> >  error_append_hint(errp, "Valid values are emul, hwaccel, and 
> > auto.\n");
> > @@ -1834,13 +1834,15 @@ static void riscv_set_kvm_aia(Object *obj, const 
> > char *val, Error **errp)
> >
> >  void kvm_arch_accel_class_init(ObjectClass *oc)
> >  {
> > -object_class_property_add_str(oc, "riscv-aia", riscv_get_kvm_aia,
> > -  riscv_set_kvm_aia);
> > -object_class_property_set_description(oc, "riscv-aia",
> > -"Set KVM AIA mode. Valid values are 'emul', 'hwaccel' and 'auto'. "
> > -"Changing KVM AIA modes relies on host support. Defaults to 'auto' 
> > "
> > -"if the host supports it");
> > -object_property_set_default_str(object_class_property_find(oc, 
> > "riscv-aia"),
> > +object_class_property_add_str(oc, "riscv-imsic", riscv_get_kvm_imsic,
> > +  riscv_set_kvm_imsic);
> > +object_class_property_set_description(oc, "riscv-imsic",
> > +"Set KVM IMSIC mode. Valid values are 'emul', 'hwaccel' and 
> > 'auto'. "
> > +"Changing KVM IMSIC modes relies on host support. Defaults to 
> > 'auto' "
> > +"if the host supports it. This property only takes effect when the 
> > "
> > +"kernel-irqchip=on|split when using AIA MSI.");
> > +object_property_set_default_str(object_class_property_find(oc,
> > +   
> > "riscv-imsic"),
> >  "auto");
>
> We can't change property names without deprecating the old name (which
> isn't likely worth it).
>

ok. I will remove this patch in the next version. Thank you!

Regards,
Yong-Xuan

> Thanks,
> drew
>
>
> >  }
> >
> > @@ -1851,7 +1853,7 @@ void kvm_riscv_aia_create(MachineState *machine, 
> > uint64_t group_shift,
> >  {
> >  int ret, i;
> >  int aia_fd = -1;
> > -uint64_t default_aia_mode;
> > +uint64_t default_imsic_mode;
> >  uint64_t socket_count = riscv_socket_count(machine);
> >  uint64_t max_hart_per_socket = 0;
> >  uint64_t socket, base_hart, hart_count, socket_imsic_base, imsic_addr;
> > @@ -1867,24 +1869,24 @@ void kvm_riscv_aia_create(MachineState *machine, 
> > uint64_t group_shift,
> >
> >  ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> >  KVM_DEV_RISCV_AIA_CONFIG_MODE,
> > -&default_aia_mode, false, NULL);
> > +&default_imsic_mode, false, NULL);
> >  if (ret < 0) {
> > -error_report("KVM AIA: failed to get current KVM AIA mode");
> > +error_report("KVM AIA: failed to get current KVM IMSIC mode");
> >  exit(1);
> >  }
> >
> > -if (default_aia_mode != aia_mode) {
> > +if (default_imsic_mode != imsic_mode) {
> >  ret = kvm_device_access(a

Query on the dirty bitmap

2025-02-19 Thread prashant patil
Hello All,
Hope this email finds you well.

I have been trying with qemu for a while now, and have come across a
problem specific to dirty bitmaps. I have enabled bitmap on the qcow2 disk
image using 'qemu-img bitmap' command, exposed the bitmap over a unix
socket using 'qemu-nbd' command. Now when I try to read the bitmap using
'qemu-img map' command with 'x-dirty-bitmap=qemu:dirty-bitmap:{bitmap}'
option, I get one single extent which shows that the entire disk is dirty.
Note that the disk size is 5 GB, and has only a few MB of data in it, and
I had added only a small amount of data after the bitmap was enabled. Bitmap output has
been pasted below.

[{ "start": 0, "length": 5368709120, "depth": 0, "present": true, "zero":
false, "data": true, "compressed": false, "offset": 0}]

Can someone please help me understand why the bitmap content shows the
entire disk as dirty?

Regards
Prashant


[Bug 2072564] Re: qemu-aarch64-static segfaults running ldconfig.real (amd64 host)

2025-02-19 Thread Dimitry Andric
Upstream has committed https://gitlab.com/qemu-
project/qemu/-/commit/4b7b20a3 which fixes the segfaults. A prerequisite
for the qemu 8.2.2 package in Ubuntu 24.04 is https://gitlab.com/qemu-
project/qemu/-/commit/c81d1faf, so here is a patch that includes both.


** Patch added: "Fix qemu-aarch64-static segfaults"
   
https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2072564/+attachment/5858748/+files/fix-lp2072564-1.diff

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/2072564

Title:
  qemu-aarch64-static segfaults running ldconfig.real (amd64 host)

Status in QEMU:
  New
Status in qemu package in Ubuntu:
  Triaged

Bug description:
  This affects the qemu-user-static 1:8.2.2+ds-0ubuntu1 package on
  Ubuntu 24.04, running on a amd64 host.

  When running docker containers with Ubuntu 22.04 in them, emulating
  arm64 with qemu-aarch64-static, invocations of ldconfig (actually
  ldconfig.real) segfault. For example:

  $ docker run -ti --platform linux/arm64/v8 ubuntu:22.04 
  root@8861ff640a1c:/# /sbin/ldconfig.real
  Segmentation fault

  If you copy the ldconfig.real binary to the host, and run it directly
  via qemu-aarch64-static:

  $ gdb --args qemu-aarch64-static ./ldconfig.real 
  GNU gdb (Ubuntu 15.0.50.20240403-0ubuntu1) 15.0.50.20240403-git
  Copyright (C) 2024 Free Software Foundation, Inc.
  License GPLv3+: GNU GPL version 3 or later 
  This is free software: you are free to change and redistribute it.
  There is NO WARRANTY, to the extent permitted by law.
  Type "show copying" and "show warranty" for details.
  This GDB was configured as "x86_64-linux-gnu".
  Type "show configuration" for configuration details.
  For bug reporting instructions, please see:
  .
  Find the GDB manual and other documentation resources online at:
  .

  For help, type "help".
  Type "apropos word" to search for commands related to "word"...
  Reading symbols from qemu-aarch64-static...
  Reading symbols from 
/home/dim/.cache/debuginfod_client/86579812b213be0964189499f62f176bea817bf2/debuginfo...
  (gdb) r
  Starting program: /usr/bin/qemu-aarch64-static ./ldconfig.real
  [Thread debugging using libthread_db enabled]
  Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
  [New Thread 0x776006c0 (LWP 28378)]

  Thread 1 "qemu-aarch64-st" received signal SIGSEGV, Segmentation fault.
  0x7fffe801645b in ?? ()
  (gdb) disassemble 
  No function contains program counter for selected frame.

  It looks like this is a known qemu regression after v8.1.1:
  https://gitlab.com/qemu-project/qemu/-/issues/1913

  Downgrading the package to qemu-user-
  static_8.0.4+dfsg-1ubuntu3_amd64.deb fixes the segfault.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/2072564/+subscriptions




Re: [PATCH] meson: Display summary of Darwin libraries detected

2025-02-19 Thread Philippe Mathieu-Daudé

On 19/2/25 11:38, Paolo Bonzini wrote:

On Wed, Feb 19, 2025 at 11:36 AM Phil Dennis-Jordan  wrote:

... which boards should enable MAC_PVG_MMIO? Is it only VIRT, or something
else?


I doubt anyone will be able to use it productively with virt, but I am 
regularly surprised by people's creativity.


Ah okay, so for now it's effectively dead code.


Correct (still tested before merging). I have Phil's vmapple series
queued but it fails the '--disable-hvf' build config so I had to drop
it before sending the PR. I kept the PVG device to save Phil from
respining again and again the same patches, hoping for a quick fix
to merge the rest, waiting for an update on whether Phil's suggestion
to fix the GIC dependency was OK or not, but no update so far, see:

https://lore.kernel.org/qemu-devel/CAAibmn0puNwDvHcU8xYi1EJ=dnaehasotkck0kmf4diqfr7...@mail.gmail.com/

and my proposal:

https://lore.kernel.org/qemu-devel/20241227202435.48055-1-phi...@linaro.org/

And now I see Phil's reply which I missed ...:
https://lore.kernel.org/qemu-devel/CAAibmn1g6+btdRX99ZUvbaBm7hP_AnAGNfDHz4Wgi3fPn=w...@mail.gmail.com/




The intended target machine is vmapple, which is the only known way
to run aarch64 macOS guests. [...]
I'll try to rebase that series and re-post it in the next few days


Thanks, I'll keep an eye on that.

Paolo






Re: [PATCH 2/2] hw/intc: Have ARM_GIC select ARM_GICV3 when KVM is not available

2025-02-19 Thread Philippe Mathieu-Daudé

On 7/1/25 17:05, Phil Dennis-Jordan wrote:



On Sat, 28 Dec 2024 at 11:48, Phil Dennis-Jordan > wrote:




On Fri, 27 Dec 2024 at 21:24, Philippe Mathieu-Daudé
mailto:phi...@linaro.org>> wrote:

When the KVM accelerator is selected, the Kconfig ARM_GIC key
selects the KVM GIC implementation (ARM_GIC_KVM).
For other accelerators (TCG, HVF, ...), select the generic
implementation.

Signed-off-by: Philippe Mathieu-Daudé mailto:phi...@linaro.org>>
---
  hw/intc/Kconfig | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig
index 7547528f2c2..762139d8df3 100644
--- a/hw/intc/Kconfig
+++ b/hw/intc/Kconfig
@@ -23,7 +23,7 @@ config APIC

  config ARM_GIC
      bool
-    select ARM_GICV3 if TCG
+    select ARM_GICV3 if !KVM


Wouldn't this disable the ARM_GICV3 by default when building with --
enable-tcg --enable-kvm? And then there would be no GIC available
when running the built QEMU in TCG mode. (Bear with me, I'm a
relative Meson newbie.)



OK, I've managed to answer my own question now by setting up a RPi5 with 
aarch64 RPOS and building QEMU on that with --enable-kvm --enable-tcg. 
With patch 1/2 applied on current upstream master, I get:


$ build/qemu-system-aarch64 -accel tcg -M virt,gic-version=3 -smp 4 -m 4G
/[runs successfully]/

with 2/2 applied as well:

$ build/qemu-system-aarch64 -accel tcg -M virt,gic-version=3 -smp 4 -m 4G
qemu-system-aarch64: tcg does not support GICv3 emulation
$

In other words, with the proposed change, ARM_GICV3 is indeed deselected 
if KVM is selected even if TCG is also selected.



Possibly very basic question: is there any support for kernel- 
irqchip=off on aarch64/KVM?
If yes, don't we need ARM_GICV3 in that case anyway, so we should drop 
any accel dependency on it in the first place?


Cc'ing Alex & Gustavo who are more familiar with GIC devices.

If we definitely don't need the software GIC in KVM-only builds, I guess 
we're down to listing each accel that needs it individually. Is that:


select ARM_GICV3 if TCG || HVF || NVMM || WHPX || XEN_EMU || XEN

or can we drop any of those? Have I missed anything? What about QTest?






Re: [PULL 23/32] tests/functional: extend test_aarch64_virt with vulkan test

2025-02-19 Thread Thomas Huth

On 19/02/2025 14.25, Philippe Mathieu-Daudé wrote:

(+Markus for CLI)

On 10/1/25 14:17, Alex Bennée wrote:

Now that we have virtio-gpu Vulkan support, let's add a test for it.
Currently this is using images build by buildroot:

   https://lists.buildroot.org/pipermail/buildroot/2024-December/768196.html

Reviewed-by: Thomas Huth 
Signed-off-by: Alex Bennée 
Message-Id: <20250108121054.1126164-24-alex.ben...@linaro.org>

diff --git a/tests/functional/test_aarch64_virt.py b/tests/functional/ 
test_aarch64_virt.py

index 201c5ed023..6b2336a28d 100755
--- a/tests/functional/test_aarch64_virt.py
+++ b/tests/functional/test_aarch64_virt.py
@@ -13,10 +13,12 @@
  import logging
  from subprocess import check_call, DEVNULL
+from qemu.machine.machine import VMLaunchFailure
+
  from qemu_test import QemuSystemTest, Asset
-from qemu_test import exec_command_and_wait_for_pattern
+from qemu_test import exec_command, exec_command_and_wait_for_pattern
  from qemu_test import wait_for_console_pattern
-from qemu_test import get_qemu_img
+from qemu_test import skipIfMissingCommands, get_qemu_img
  class Aarch64VirtMachine(QemuSystemTest):
@@ -132,5 +134,73 @@ def test_aarch64_virt_gicv2(self):
  self.common_aarch64_virt("virt,gic-version=2")
+    ASSET_VIRT_GPU_KERNEL = Asset(
+    'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
+    'download?path=%2F&files='
+    'Image',
+    '89e5099d26166204cc5ca4bb6d1a11b92c217e1f82ec67e3ba363d09157462f6')
+
+    ASSET_VIRT_GPU_ROOTFS = Asset(
+    'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
+    'download?path=%2F&files='
+    'rootfs.ext4.zstd',
+    '792da7573f5dc2913ddb7c638151d4a6b2d028a4cb2afb38add513c1924bdad4')
+
+    @skipIfMissingCommands('zstd')
+    def test_aarch64_virt_with_gpu(self):
+    # This tests boots with a buildroot test image that contains
+    # vkmark and other GPU exercising tools. We run a headless
+    # weston that nevertheless still exercises the virtio-gpu
+    # backend.
+
+    self.set_machine('virt')
+    self.require_accelerator("tcg")
+
+    kernel_path = self.ASSET_VIRT_GPU_KERNEL.fetch()
+    image_path = self.uncompress(self.ASSET_VIRT_GPU_ROOTFS, 
format="zstd")

+
+    self.vm.set_console()
+    kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
+   'console=ttyAMA0 root=/dev/vda')
+
+    self.vm.add_args("-accel", "tcg")
+    self.vm.add_args("-cpu", "neoverse-v1,pauth-impdef=on")
+    self.vm.add_args("-machine", "virt,gic-version=max",
+ '-kernel', kernel_path,
+ '-append', kernel_command_line)
+    self.vm.add_args("-smp", "2", "-m", "2048")
+    self.vm.add_args("-device",
+ "virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on")
+    self.vm.add_args("-display", "egl-headless")
+    self.vm.add_args("-display", "dbus,gl=on")


[*]


+    self.vm.add_args("-device", "virtio-blk-device,drive=hd0")
+    self.vm.add_args("-blockdev",
+ "driver=raw,file.driver=file,"
+ "node-name=hd0,read-only=on,"
+ f"file.filename={image_path}")
+    self.vm.add_args("-snapshot")
+
+    try:
+    self.vm.launch()
+    except VMLaunchFailure as excp:
+    if "old virglrenderer, blob resources unsupported" in 
excp.output:

+    self.skipTest("No blob support for virtio-gpu")
+    elif "old virglrenderer, venus unsupported" in excp.output:
+    self.skipTest("No venus support for virtio-gpu")


This seems dependent on the order of the CLI arguments, as I got:

qemu-system-aarch64: -device virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on: 
'virtio-gpu-gl-pci' is not a valid device model name


I understand it is too complex to check this device availability with
meson, in order to avoid running the test.

Can we use device introspection instead, like we do in QTest with
qtest_qom_has_concrete_type() for accelerators? Maybe in the lines of:

   @skipIfMissingQOMType('virtio-gpu-gl-pci')


We already have "self.require_device('...')" that can be used to check for 
the availability of devices and skip the test if it is not built in ... 
would that be suitable here?


 Thomas




Re: [PATCH 2/2] [NOT-FOR-MERGE] Add qtest for migration over RDMA

2025-02-19 Thread Peter Xu
On Wed, Feb 19, 2025 at 10:20:21AM -0300, Fabiano Rosas wrote:
> Peter Xu  writes:
> 
> > On Wed, Feb 19, 2025 at 05:33:26AM +, Zhijian Li (Fujitsu) wrote:
> >> 
> >> 
> >> On 19/02/2025 06:40, Peter Xu wrote:
> >> > On Tue, Feb 18, 2025 at 06:03:48PM -0300, Fabiano Rosas wrote:
> >> >> Li Zhijian via  writes:
> >> >>
> >> >>> This qtest requires an RXE link in the host.
> >> >>>
> >> >>> Here is an example to show how to add this RXE link:
> >> >>> $ ./new-rdma-link.sh
> >> >>> 192.168.22.93
> >> >>>
> >> >>> Signed-off-by: Li Zhijian 
> >> >>> ---
> >> >>> The RDMA migration was broken again...due to lack of sufficient 
> >> >>> test/qtest.
> >> >>>
> >> >>> It's ugly to add and execute a script to establish an RDMA link in
> >> >>> the C program. If anyone has a better suggestion, please let me know.
> >> >>>
> >> >>> $ cat ./new-rdma-link.sh
> >> >>> get_ipv4_addr() {
> >> >>>  ip -4 -o addr show dev "$1" |
> >> >>>  sed -n 
> >> >>> 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
> >> >>> }
> >> >>>
> >> >>> has_soft_rdma() {
> >> >>>  rdma link | grep -q " netdev $1[[:blank:]]*\$"
> >> >>> }
> >> >>>
> >> >>> start_soft_rdma() {
> >> >>>  local type
> >> >>>
> >> >>>  modprobe rdma_rxe || return $?
> >> >>>  type=rxe
> >> >>>  (
> >> >>>  cd /sys/class/net &&
> >> >>>  for i in *; do
> >> >>>  [ -e "$i" ] || continue
> >> >>>  [ "$i" = "lo" ] && continue
> >> >>>  [ "$(<"$i/addr_len")" = 6 ] || 
> >> >>> continue
> >> >>>  [ "$(<"$i/carrier")" = 1 ] || continue
> >> >>>  has_soft_rdma "$i" && break
> >> >>>  rdma link add "${i}_$type" type $type 
> >> >>> netdev "$i" && break
> >> >>>  done
> >> >>>  has_soft_rdma "$i" && echo $i
> >> >>>  )
> >> >>>
> >> >>> }
> >> >>>
> >> >>> rxe_link=$(start_soft_rdma)
> >> >>> [[ "$rxe_link" ]] && get_ipv4_addr $rxe_link
> >> >>>
> >> >>> Signed-off-by: Li Zhijian 
> >> >>> ---
> >> >>>   tests/qtest/migration/new-rdma-link.sh |  34 
> >> >>>   tests/qtest/migration/precopy-tests.c  | 103 
> >> >>> +
> >> >>>   2 files changed, 137 insertions(+)
> >> >>>   create mode 100644 tests/qtest/migration/new-rdma-link.sh
> >> >>>
> >> >>> diff --git a/tests/qtest/migration/new-rdma-link.sh 
> >> >>> b/tests/qtest/migration/new-rdma-link.sh
> >> >>> new file mode 100644
> >> >>> index 000..ca20594eaae
> >> >>> --- /dev/null
> >> >>> +++ b/tests/qtest/migration/new-rdma-link.sh
> >> >>> @@ -0,0 +1,34 @@
> >> >>> +#!/bin/bash
> >> >>> +
> >> >>> +# Copied from blktests
> >> >>> +get_ipv4_addr() {
> >> >>> +  ip -4 -o addr show dev "$1" |
> >> >>> +  sed -n 
> >> >>> 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
> >> >>> +}
> >> >>> +
> >> >>> +has_soft_rdma() {
> >> >>> +  rdma link | grep -q " netdev $1[[:blank:]]*\$"
> >> >>> +}
> >> >>> +
> >> >>> +start_soft_rdma() {
> >> >>> +  local type
> >> >>> +
> >> >>> +  modprobe rdma_rxe || return $?
> >> >>> +  type=rxe
> >> >>> +  (
> >> >>> +  cd /sys/class/net &&
> >> >>> +  for i in *; do
> >> >>> +  [ -e "$i" ] || continue
> >> >>> +  [ "$i" = "lo" ] && continue
> >> >>> +  [ "$(<"$i/addr_len")" = 6 ] || continue
> >> >>> +  [ "$(<"$i/carrier")" = 1 ] || continue
> >> >>> +  has_soft_rdma "$i" && break
> >> >>> +  rdma link add "${i}_$type" type $type 
> >> >>> netdev "$i" && break
> >> >>> +  done
> >> >>> +  has_soft_rdma "$i" && echo $i
> >> >>> +  )
> >> >>> +
> >> >>> +}
> >> >>> +
> >> >>> +rxe_link=$(start_soft_rdma)
> >> >>> +[[ "$rxe_link" ]] && get_ipv4_addr $rxe_link
> >> >>> diff --git a/tests/qtest/migration/precopy-tests.c 
> >> >>> b/tests/qtest/migration/precopy-tests.c
> >> >>> index 162fa695318..d2a1c9c9438 100644
> >> >>> --- a/tests/qtest/migration/precopy-tests.c
> >> >>> +++ b/tests/qtest/migration/precopy-tests.c
> >> >>> @@ -98,6 +98,105 @@ static void test_precopy_unix_dirty_ring(void)
> >> >>>   test_precopy_common(&args);
> >> >>>   }
> >> >>>   
> >> >>> +static int new_rdma_link(char *buffer) {
> >> >>> +// Copied from blktests
> >> >>> +const char *script =
> >> >>> +"#!/bin/bash\n"
> >> >>> +"\n"
> >> >>> +"get_ipv4_addr() {\n"
> >> >>> +"ip -4 -o addr show dev \"$1\" |\n"
> >> >>> +"sed -n 
> >> >>> 's/.*[[:blank:]]inet[[:blank:]]*\\([^[:blank:]/]*\\).*/\\1/p'\n"
> >> >>> +"}\n"
> >> >>> +"\n"
> >> >>> +"has_soft_rdma() {\n"
> >> 

[RFC PATCH] tests/functional: Generic method to check required devices availability

2025-02-19 Thread Philippe Mathieu-Daudé
Not all binaries contain the same set of devices. Since some
tests depend on specific devices, we need to check their
availability in the binary.

QemuSystemTest::require_device() allows for system tests to
explicitly check for a particular device. Add a similar
check_required_devices() method which check all devices
requested on the command line. If a device is missing, the
test is skipped.

Example running test_aarch64_virt.py on macOS:

  ok 1 test_aarch64_virt.Aarch64VirtMachine.test_aarch64_virt_with_gpu # SKIP 
no support for device virtio-gpu-gl-pci

Signed-off-by: Philippe Mathieu-Daudé 
---
Just an idea to see if we can avoid manual require_device() calls.
However not having a device in the binary might also be a bug, so RFC...
---
 python/qemu/machine/machine.py | 10 ++
 tests/functional/qemu_test/testcase.py |  8 
 tests/functional/test_aarch64_virt.py  |  2 ++
 3 files changed, 20 insertions(+)

diff --git a/python/qemu/machine/machine.py b/python/qemu/machine/machine.py
index ebb58d5b68c..ff1ff066823 100644
--- a/python/qemu/machine/machine.py
+++ b/python/qemu/machine/machine.py
@@ -468,6 +468,16 @@ def launch(self) -> None:
 # that exception. However, we still want to clean up.
 raise
 
+def get_command_arguments(self, command) -> List[str]:
+"""
+Return a list of arguments used with one kind of command
+"""
+args = []
+for index, element in enumerate(self._args):
+if element == command:
+args += [self._args[index + 1]]
+return args
+
 def _launch(self) -> None:
 """
 Launch the VM and establish a QMP connection
diff --git a/tests/functional/qemu_test/testcase.py 
b/tests/functional/qemu_test/testcase.py
index 869f3949fe9..7e3288f452c 100644
--- a/tests/functional/qemu_test/testcase.py
+++ b/tests/functional/qemu_test/testcase.py
@@ -335,6 +335,14 @@ def require_device(self, devicename):
 if help.find(devicename) < 0:
 self.skipTest('no support for device ' + devicename)
 
+def check_required_devices(self):
+"""
+Check the devices requested on the command line are available
+in the binary. To be used before the VM launch() call.
+"""
+for device in self.vm.get_command_arguments('-device'):
+self.require_device(device.split(',')[0])
+
 def _new_vm(self, name, *args):
 vm = QEMUMachine(self.qemu_bin,
  name=name,
diff --git a/tests/functional/test_aarch64_virt.py 
b/tests/functional/test_aarch64_virt.py
index 95f5ce8b4c0..589680a44c5 100755
--- a/tests/functional/test_aarch64_virt.py
+++ b/tests/functional/test_aarch64_virt.py
@@ -180,6 +180,8 @@ def test_aarch64_virt_with_gpu(self):
  f"file.filename={image_path}")
 self.vm.add_args("-snapshot")
 
+self.check_required_devices()
+
 try:
 self.vm.launch()
 except VMLaunchFailure as excp:
-- 
2.47.1




Re: [PATCH 29/42] qapi: Add "Details:" disambiguation marker

2025-02-19 Thread Markus Armbruster
John Snow  writes:

> On Mon, Feb 17, 2025 at 7:13 AM Markus Armbruster  wrote:
>
>> John Snow  writes:
>>
>> > This clarifies sections that are mistaken by the parser as "intro"
>> > sections to be "details" sections instead.
>> >
>> > Signed-off-by: John Snow 
>>
>> This is rather terse.
>>
>
> Mea culpa. I can write more at length if we agree on the general approach.
> For now, you got an RFC as this was the subject of a considerable amount of
> controversy between us in the past ... so I am doing baby steps.
>
> "Commit message needs to be hit with the unterseification beam" added to
> tasklist. :)
>
>
>>
>> Why does the boundary between "intro" (previously "body") and "details"
>> matter?  As far as I understand, it matters for inlining.
>>
>
>> What is inlining?
>>
>
>> The old doc generator emits "The members of T" into the argument
>> description in the following cases:
>>
>> * When a command's arguments are given as a type T, the doc comment has
>>   no argument descriptions, and the generated argument description
>>   becomes "The members of T".
>>
>> * When an object type has a base type T, "The members of T" is appended
>>   to the doc comment's (possibly empty) argument descriptions.
>>
>> * For union types, "The members of T when TAG is VALUE" is appended to
>>   the doc comment's argument descriptions for every tag VALUE and
>>   associated type T.
>>
>> We want a description of the members of T right there instead.  To get
>> it right there, we need to inline from T's documentation.
>>
>> What exactly do we need to inline?  Turns out we don't want "intro", we
>> do want the argument descriptions and other stuff we can ignore here.
>>
>> "intro" ends before the argument descriptions, features, or a tagged
>> section, whatever comes first.  Most of the time, this works fine.  But
>> there are a few troublesome cases.  Here's one:
>>
>> ##
>> # @MemoryBackendShmProperties:
>> #
>> # Properties for memory-backend-shm objects.
>> #
>> # This memory backend supports only shared memory, which is the
>> # default.
>> #
>> # Since: 9.1
>> ##
>> { 'struct': 'MemoryBackendShmProperties',
>>   'base': 'MemoryBackendProperties',
>>   'data': { },
>>   'if': 'CONFIG_POSIX' }
>>
>> Everything up to "Since:" is "intro".  Consequently, the old doc
>> generator emits "The members of MemoryBackendProperties" right there:
>>
>> "MemoryBackendShmProperties" (Object)
>> -
>>
>> Properties for memory-backend-shm objects.
>>
>> This memory backend supports only shared memory, which is the default.
>>
>>
>> Members
>> ~~~
>>
>> The members of "MemoryBackendProperties"
>>
>> Since
>> ~
>>
>> 9.1
>>
>>
>> If
>> ~~
>>
>> "CONFIG_POSIX"
>>
>> That's also where the new one inlines.  Okay so far.
>>
>> This gets in turn inlined into ObjectOptions for branch
>> memory-backend-shm.  Since we don't inline "intro", we don't inline
>> "This memory backend supports only shared memory, which is the default."
>> That's a problem.
>>
>
> Yes, this is all correct so far.
>
>
>>
>> This patch moves the boundary between "intro" and the remainder up that
>> paragraph, so we don't lose that line.  It accomplishes that by giving
>> us syntax to manually mark the end of "intro"
>>
>> However, your solution is manual: it gives us the means[*] to mark the
>> boundary with "Details:" to avoid loss of text.  What if we don't
>> notice?  Should we tweak the syntax to force us to be explicit?  How
>> many doc comments would that affect?
>>
>
> I'm leaving that question to you. The calculus I made was that there were
> fewer SLOC changes to explicitly denote the "Details:" sections only in the
> handful of cases where it was (potentially) relevant than to mandate its
> use unconditionally.

How did you determine where it is (potentially) relevant?  Oh, wait ...

>  If you have an idea that is enforceable at runtime and
> has fewer SLOC changes, suggest away!
>
> Unseen in this patch is a warning I added to the /inliner/ that identified
> potentially "ambiguous" delineation spots and issued a warning (error); the
> exact code that did this is possibly a little hokey but it was what I used
> to identify the spots addressed by this patch.

... that's how.

> Point being: it's possible to enforce, but I enforced it in qapidoc.py in
> the inliner instead of directly in the parser. We could discuss moving the
> check to the parser if you'd like. The check itself is somewhat "dumb":
>
> - If a doc block has only one *paragraph* (knowingly/intentionally not
> using the term section here) of text, it's assumed to be the intro.

You mean if the "body" has just one paragraph, right?  The "body" is the
first section, always untagged, possibly empty.  It contains the text
between the line naming the definition and the first tagged section.

The tagged sections are member / argument des

Re: [PATCH 2/2] [NOT-FOR-MERGE] Add qtest for migration over RDMA

2025-02-19 Thread Peter Xu
On Wed, Feb 19, 2025 at 05:33:26AM +, Zhijian Li (Fujitsu) wrote:
> 
> 
> On 19/02/2025 06:40, Peter Xu wrote:
> > On Tue, Feb 18, 2025 at 06:03:48PM -0300, Fabiano Rosas wrote:
> >> Li Zhijian via  writes:
> >>
> >>> This qtest requires there is an RXE link in the host.
> >>>
> >>> Here is an example to show how to add this RXE link:
> >>> $ ./new-rdma-link.sh
> >>> 192.168.22.93
> >>>
> >>> Signed-off-by: Li Zhijian 
> >>> ---
> >>> The RDMA migration was broken again...due to lack of sufficient 
> >>> test/qtest.
> >>>
> >>> It's ugly to add and execute a script to establish an RDMA link in
> >>> the C program. If anyone has a better suggestion, please let me know.
> >>>
> >>> $ cat ./new-rdma-link.sh
> >>> get_ipv4_addr() {
> >>>  ip -4 -o addr show dev "$1" |
> >>>  sed -n 
> >>> 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
> >>> }
> >>>
> >>> has_soft_rdma() {
> >>>  rdma link | grep -q " netdev $1[[:blank:]]*\$"
> >>> }
> >>>
> >>> start_soft_rdma() {
> >>>  local type
> >>>
> >>>  modprobe rdma_rxe || return $?
> >>>  type=rxe
> >>>  (
> >>>  cd /sys/class/net &&
> >>>  for i in *; do
> >>>  [ -e "$i" ] || continue
> >>>  [ "$i" = "lo" ] && continue
> >>>  [ "$(<"$i/addr_len")" = 6 ] || continue
> >>>  [ "$(<"$i/carrier")" = 1 ] || continue
> >>>  has_soft_rdma "$i" && break
> >>>  rdma link add "${i}_$type" type $type 
> >>> netdev "$i" && break
> >>>  done
> >>>  has_soft_rdma "$i" && echo $i
> >>>  )
> >>>
> >>> }
> >>>
> >>> rxe_link=$(start_soft_rdma)
> >>> [[ "$rxe_link" ]] && get_ipv4_addr $rxe_link
> >>>
> >>> Signed-off-by: Li Zhijian 
> >>> ---
> >>>   tests/qtest/migration/new-rdma-link.sh |  34 
> >>>   tests/qtest/migration/precopy-tests.c  | 103 +
> >>>   2 files changed, 137 insertions(+)
> >>>   create mode 100644 tests/qtest/migration/new-rdma-link.sh
> >>>
> >>> diff --git a/tests/qtest/migration/new-rdma-link.sh 
> >>> b/tests/qtest/migration/new-rdma-link.sh
> >>> new file mode 100644
> >>> index 000..ca20594eaae
> >>> --- /dev/null
> >>> +++ b/tests/qtest/migration/new-rdma-link.sh
> >>> @@ -0,0 +1,34 @@
> >>> +#!/bin/bash
> >>> +
> >>> +# Copied from blktests
> >>> +get_ipv4_addr() {
> >>> + ip -4 -o addr show dev "$1" |
> >>> + sed -n 
> >>> 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
> >>> +}
> >>> +
> >>> +has_soft_rdma() {
> >>> + rdma link | grep -q " netdev $1[[:blank:]]*\$"
> >>> +}
> >>> +
> >>> +start_soft_rdma() {
> >>> + local type
> >>> +
> >>> + modprobe rdma_rxe || return $?
> >>> + type=rxe
> >>> + (
> >>> + cd /sys/class/net &&
> >>> + for i in *; do
> >>> + [ -e "$i" ] || continue
> >>> + [ "$i" = "lo" ] && continue
> >>> + [ "$(<"$i/addr_len")" = 6 ] || continue
> >>> + [ "$(<"$i/carrier")" = 1 ] || continue
> >>> + has_soft_rdma "$i" && break
> >>> + rdma link add "${i}_$type" type $type netdev 
> >>> "$i" && break
> >>> + done
> >>> + has_soft_rdma "$i" && echo $i
> >>> + )
> >>> +
> >>> +}
> >>> +
> >>> +rxe_link=$(start_soft_rdma)
> >>> +[[ "$rxe_link" ]] && get_ipv4_addr $rxe_link
> >>> diff --git a/tests/qtest/migration/precopy-tests.c 
> >>> b/tests/qtest/migration/precopy-tests.c
> >>> index 162fa695318..d2a1c9c9438 100644
> >>> --- a/tests/qtest/migration/precopy-tests.c
> >>> +++ b/tests/qtest/migration/precopy-tests.c
> >>> @@ -98,6 +98,105 @@ static void test_precopy_unix_dirty_ring(void)
> >>>   test_precopy_common(&args);
> >>>   }
> >>>   
> >>> +static int new_rdma_link(char *buffer) {
> >>> +// Copied from blktests
> >>> +const char *script =
> >>> +"#!/bin/bash\n"
> >>> +"\n"
> >>> +"get_ipv4_addr() {\n"
> >>> +"ip -4 -o addr show dev \"$1\" |\n"
> >>> +"sed -n 
> >>> 's/.*[[:blank:]]inet[[:blank:]]*\\([^[:blank:]/]*\\).*/\\1/p'\n"
> >>> +"}\n"
> >>> +"\n"
> >>> +"has_soft_rdma() {\n"
> >>> +"rdma link | grep -q \" netdev $1[[:blank:]]*\\$\"\n"
> >>> +"}\n"
> >>> +"\n"
> >>> +"start_soft_rdma() {\n"
> >>> +"local type\n"
> >>> +"\n"
> >>> +"modprobe rdma_rxe || return $?\n"
> >>> +"type=rxe\n"
> >>> +"(\n"
> >>> +"cd /sys/class/net &&\n"
> >>> +"for i in *; do\n"
> >>> +"[ -e \"$i\" ] || continue\n"
> >>> +"[ \"$i\" = \"lo\" ] && continue\n"
> >>> +"[ \"$(<$i/addr_len)\" = 6 ]

[Bug 2072564] Re: qemu-aarch64-static segfaults running ldconfig.real (amd64 host)

2025-02-19 Thread Christian Ehrhardt
Thank you!
Adding to plucky soon and then planning SRUs as the queue gets freed of the 
former one in flight.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/2072564

Title:
  qemu-aarch64-static segfaults running ldconfig.real (amd64 host)

Status in QEMU:
  New
Status in qemu package in Ubuntu:
  Triaged

Bug description:
  This affects the qemu-user-static 1:8.2.2+ds-0ubuntu1 package on
  Ubuntu 24.04, running on a amd64 host.

  When running docker containers with Ubuntu 22.04 in them, emulating
  arm64 with qemu-aarch64-static, invocations of ldconfig (actually
  ldconfig.real) segfault. For example:

  $ docker run -ti --platform linux/arm64/v8 ubuntu:22.04 
  root@8861ff640a1c:/# /sbin/ldconfig.real
  Segmentation fault

  If you copy the ldconfig.real binary to the host, and run it directly
  via qemu-aarch64-static:

  $ gdb --args qemu-aarch64-static ./ldconfig.real 
  GNU gdb (Ubuntu 15.0.50.20240403-0ubuntu1) 15.0.50.20240403-git
  Copyright (C) 2024 Free Software Foundation, Inc.
  License GPLv3+: GNU GPL version 3 or later 
  This is free software: you are free to change and redistribute it.
  There is NO WARRANTY, to the extent permitted by law.
  Type "show copying" and "show warranty" for details.
  This GDB was configured as "x86_64-linux-gnu".
  Type "show configuration" for configuration details.
  For bug reporting instructions, please see:
  .
  Find the GDB manual and other documentation resources online at:
  .

  For help, type "help".
  Type "apropos word" to search for commands related to "word"...
  Reading symbols from qemu-aarch64-static...
  Reading symbols from 
/home/dim/.cache/debuginfod_client/86579812b213be0964189499f62f176bea817bf2/debuginfo...
  (gdb) r
  Starting program: /usr/bin/qemu-aarch64-static ./ldconfig.real
  [Thread debugging using libthread_db enabled]
  Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
  [New Thread 0x776006c0 (LWP 28378)]

  Thread 1 "qemu-aarch64-st" received signal SIGSEGV, Segmentation fault.
  0x7fffe801645b in ?? ()
  (gdb) disassemble 
  No function contains program counter for selected frame.

  It looks like this is a known qemu regression after v8.1.1:
  https://gitlab.com/qemu-project/qemu/-/issues/1913

  Downgrading the package to qemu-user-
  static_8.0.4+dfsg-1ubuntu3_amd64.deb fixes the segfault.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/2072564/+subscriptions




Re: [RFC PATCH] elfload: Fix alignment when unmapping excess reservation

2025-02-19 Thread Fabiano Rosas
Michael Tokarev  writes:

> 13.02.2025 17:35, Fabiano Rosas wrote:
>> When complying with the alignment requested in the ELF and unmapping
>> the excess reservation, having align_end not aligned to the guest page
>> causes the unmap to be rejected by the alignment check at
>> target_munmap and later brk adjustments hit an EEXIST.
>> 
>> Fix by aligning the start of region to be unmapped.
>> 
>> Fixes: c81d1fafa6 ("linux-user: Honor elf alignment when placing images")
>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1913
>> Signed-off-by: Fabiano Rosas 
>
> Is this a qemu-stable material?  That issue was quite hot..

Yes, I think it's good for stable.

>
> Thanks,
>
> /mjt



[PATCH v8 17/28] vfio-user: implement VFIO_USER_DEVICE_GET_REGION_INFO

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Add support for per-region info fds. Unlike kernel vfio, vfio-user can
have a separate fd to support mmap() of individual regions; add
->use_regfds as needed to support this difference.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 68 +++
 hw/vfio-user/common.h |  2 ++
 hw/vfio-user/pci.c|  2 ++
 hw/vfio-user/protocol.h   | 14 
 hw/vfio-user/trace-events |  1 +
 hw/vfio/ap.c  |  2 ++
 hw/vfio/ccw.c |  2 ++
 hw/vfio/container.c   |  7 
 hw/vfio/helpers.c | 28 +--
 hw/vfio/pci.c |  2 ++
 hw/vfio/platform.c|  2 ++
 include/hw/vfio/vfio-common.h |  5 ++-
 12 files changed, 131 insertions(+), 4 deletions(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 55d1da4e51..7b96d715cb 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -1106,3 +1106,71 @@ int vfio_user_get_info(VFIOUserProxy *proxy, struct 
vfio_device_info *info)
 
 return 0;
 }
+
+static int vfio_user_get_region_info(VFIOUserProxy *proxy,
+ struct vfio_region_info *info,
+ VFIOUserFDs *fds)
+{
+g_autofree VFIOUserRegionInfo *msgp = NULL;
+uint32_t size;
+
+/* data returned can be larger than vfio_region_info */
+if (info->argsz < sizeof(*info)) {
+error_printf("vfio_user_get_region_info argsz too small\n");
+return -E2BIG;
+}
+if (fds != NULL && fds->send_fds != 0) {
+error_printf("vfio_user_get_region_info can't send FDs\n");
+return -EINVAL;
+}
+
+size = info->argsz + sizeof(VFIOUserHdr);
+msgp = g_malloc0(size);
+
+vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_GET_REGION_INFO,
+  sizeof(*msgp), 0);
+msgp->argsz = info->argsz;
+msgp->index = info->index;
+
+vfio_user_send_wait(proxy, &msgp->hdr, fds, size);
+if (msgp->hdr.flags & VFIO_USER_ERROR) {
+return -msgp->hdr.error_reply;
+}
+trace_vfio_user_get_region_info(msgp->index, msgp->flags, msgp->size);
+
+memcpy(info, &msgp->argsz, info->argsz);
+return 0;
+}
+
+
+/*
+ * Socket-based io_ops
+ */
+
+static int vfio_user_io_get_region_info(VFIODevice *vbasedev,
+struct vfio_region_info *info,
+int *fd)
+{
+int ret;
+VFIOUserFDs fds = { 0, 1, fd};
+
+ret = vfio_user_get_region_info(vbasedev->proxy, info, &fds);
+if (ret) {
+return ret;
+}
+
+if (info->index > vbasedev->num_regions) {
+return -EINVAL;
+}
+/* cap_offset in valid area */
+if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) &&
+(info->cap_offset < sizeof(*info) || info->cap_offset > info->argsz)) {
+return -EINVAL;
+}
+
+return 0;
+}
+
+VFIODeviceIO vfio_dev_io_sock = {
+.get_region_info = vfio_user_io_get_region_info,
+};
diff --git a/hw/vfio-user/common.h b/hw/vfio-user/common.h
index 11a80d4e7c..30a3125ea3 100644
--- a/hw/vfio-user/common.h
+++ b/hw/vfio-user/common.h
@@ -95,4 +95,6 @@ void vfio_user_set_handler(VFIODevice *vbasedev,
 bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp);
 int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info);
 
+extern VFIODeviceIO vfio_dev_io_sock;
+
 #endif /* VFIO_USER_COMMON_H */
diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index 69806eb863..fdff6c99e6 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -108,6 +108,8 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error 
**errp)
 vbasedev->ops = &vfio_user_pci_ops;
 vbasedev->type = VFIO_DEVICE_TYPE_PCI;
 vbasedev->dev = DEVICE(vdev);
+vbasedev->io = &vfio_dev_io_sock;
+vbasedev->use_regfds = true;
 
 /*
  * vfio-user devices are effectively mdevs (don't use a host iommu).
diff --git a/hw/vfio-user/protocol.h b/hw/vfio-user/protocol.h
index 5f9ef1768f..6f70a48905 100644
--- a/hw/vfio-user/protocol.h
+++ b/hw/vfio-user/protocol.h
@@ -125,4 +125,18 @@ typedef struct {
 uint32_t num_irqs;
 } VFIOUserDeviceInfo;
 
+/*
+ * VFIO_USER_DEVICE_GET_REGION_INFO
+ * imported from struct vfio_region_info
+ */
+typedef struct {
+VFIOUserHdr hdr;
+uint32_t argsz;
+uint32_t flags;
+uint32_t index;
+uint32_t cap_offset;
+uint64_t size;
+uint64_t offset;
+} VFIOUserRegionInfo;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio-user/trace-events b/hw/vfio-user/trace-events
index 6b06a3ed82..1860430301 100644
--- a/hw/vfio-user/trace-events
+++ b/hw/vfio-user/trace-events
@@ -7,3 +7,4 @@ vfio_user_recv_request(uint16_t cmd) " command 0x%x"
 vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x"
 vfio_user_version(uint16_t major, uint16_t

[PATCH v8 03/28] vfio/container: support VFIO_DMA_UNMAP_FLAG_ALL

2025-02-19 Thread John Levon
Some containers can directly implement unmapping all regions;
add a new flag to support this.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio/common.c  | 24 +++--
 hw/vfio/container-base.c  |  4 +--
 hw/vfio/container.c   | 38 +--
 hw/vfio/iommufd.c | 19 +-
 include/hw/vfio/vfio-common.h |  1 +
 include/hw/vfio/vfio-container-base.h |  4 +--
 6 files changed, 68 insertions(+), 22 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6f106167fd..b49aafc40c 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -324,7 +324,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 }
 } else {
 ret = vfio_container_dma_unmap(bcontainer, iova,
-   iotlb->addr_mask + 1, iotlb);
+   iotlb->addr_mask + 1, iotlb, 0);
 if (ret) {
 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  "0x%"HWADDR_PRIx") = %d (%s)",
@@ -348,7 +348,7 @@ static void 
vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
 int ret;
 
 /* Unmap with a single call. */
-ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL);
+ret = vfio_container_dma_unmap(bcontainer, iova, size, NULL, 0);
 if (ret) {
 error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
  strerror(-ret));
@@ -806,21 +806,15 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 
 if (try_unmap) {
+int flags = 0;
+
 if (int128_eq(llsize, int128_2_64())) {
-/* The unmap ioctl doesn't accept a full 64-bit span. */
-llsize = int128_rshift(llsize, 1);
-ret = vfio_container_dma_unmap(bcontainer, iova,
-   int128_get64(llsize), NULL);
-if (ret) {
-error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
- "0x%"HWADDR_PRIx") = %d (%s)",
- bcontainer, iova, int128_get64(llsize), ret,
- strerror(-ret));
-}
-iova += int128_get64(llsize);
+flags = VFIO_DMA_UNMAP_FLAG_ALL;
 }
-ret = vfio_container_dma_unmap(bcontainer, iova,
-   int128_get64(llsize), NULL);
+
+ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize),
+   NULL, flags);
+
 if (ret) {
 error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  "0x%"HWADDR_PRIx") = %d (%s)",
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 5e0c9700d9..db27e9c31d 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -27,12 +27,12 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
 
 int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb)
+ IOMMUTLBEntry *iotlb, int flags)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
 
 g_assert(vioc->dma_unmap);
-return vioc->dma_unmap(bcontainer, iova, size, iotlb);
+return vioc->dma_unmap(bcontainer, iova, size, iotlb, flags);
 }
 
 bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 0db0055f39..82987063e5 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -117,7 +117,7 @@ unmap_exit:
  */
 static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
- IOMMUTLBEntry *iotlb)
+ IOMMUTLBEntry *iotlb, int flags)
 {
 const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
   bcontainer);
@@ -140,6 +140,34 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase 
*bcontainer,
 need_dirty_sync = true;
 }
 
+/* use unmap all if supported */
+if (flags & VFIO_DMA_UNMAP_FLAG_ALL) {
+unmap.iova = 0;
+unmap.size = 0;
+if (container->unmap_all_supported) {
+ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+} else {
+/* unmap in halves */
+Int128 llsize = int128_rshift(int128_2_64(), 1);
+
+unmap.size = int128_get64(llsize);
+
+ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+
+if (ret == 0) {
+unmap.iova += int128_get64(llsize);
+
+ret = ioctl(containe

[PATCH v8 01/28] vfio/container: pass MemoryRegion to DMA operations

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Pass through the MemoryRegion to DMA operation handlers of vfio
containers. The vfio-user container will need this later.

Originally-by: John Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: John Levon 
---
 hw/vfio/common.c  | 17 ++---
 hw/vfio/container-base.c  |  4 ++--
 hw/vfio/container.c   |  3 ++-
 hw/vfio/iommufd.c |  3 ++-
 hw/virtio/vhost-vdpa.c|  2 +-
 include/exec/memory.h |  4 +++-
 include/hw/vfio/vfio-container-base.h |  4 ++--
 system/memory.c   |  7 ++-
 8 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index abbdc56b6d..8d3d425c63 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -248,12 +248,12 @@ static bool 
vfio_listener_skipped_section(MemoryRegionSection *section)
 /* Called with rcu_read_lock held.  */
 static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
ram_addr_t *ram_addr, bool *read_only,
-   Error **errp)
+   MemoryRegion **mrp, Error **errp)
 {
 bool ret, mr_has_discard_manager;
 
 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
-   &mr_has_discard_manager, errp);
+   &mr_has_discard_manager, mrp, errp);
 if (ret && mr_has_discard_manager) {
 /*
  * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
@@ -281,6 +281,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
 VFIOContainerBase *bcontainer = giommu->bcontainer;
 hwaddr iova = iotlb->iova + giommu->iommu_offset;
+MemoryRegion *mrp;
 void *vaddr;
 int ret;
 Error *local_err = NULL;
@@ -300,7 +301,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
 bool read_only;
 
-if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
+if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &mrp,
+&local_err)) {
 error_report_err(local_err);
 goto out;
 }
@@ -313,7 +315,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
  */
 ret = vfio_container_dma_map(bcontainer, iova,
  iotlb->addr_mask + 1, vaddr,
- read_only);
+ read_only, mrp);
 if (ret) {
 error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
  "0x%"HWADDR_PRIx", %p) = %d (%s)",
@@ -378,7 +380,7 @@ static int 
vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
 vaddr = memory_region_get_ram_ptr(section->mr) + start;
 
 ret = vfio_container_dma_map(bcontainer, iova, next - start,
- vaddr, section->readonly);
+ vaddr, section->readonly, section->mr);
 if (ret) {
 /* Rollback */
 vfio_ram_discard_notify_discard(rdl, section);
@@ -675,7 +677,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 }
 
 ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
- vaddr, section->readonly);
+ vaddr, section->readonly, section->mr);
 if (ret) {
 error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
@@ -1232,7 +1234,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 }
 
 rcu_read_lock();
-if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) {
+if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, NULL,
+&local_err)) {
 error_report_err(local_err);
 goto out_unlock;
 }
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 749a3fd29d..5e0c9700d9 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -17,12 +17,12 @@
 
 int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
-   void *vaddr, bool readonly)
+   void *vaddr, bool readonly, MemoryRegion *mrp)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
 
 g_assert(vioc->dma_map);
-return vioc->dma_map(bcontainer, iova, size, vaddr, readonly);
+return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mrp);
 }
 
 int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
diff --gi

[PATCH v8 06/28] vfio: refactor out vfio_interrupt_setup()

2025-02-19 Thread John Levon
Refactor the interrupt setup code out of vfio_realize(), as we will
later need this for vfio-user too.

Signed-off-by: John Levon 
---
 hw/vfio/pci.c | 54 +++
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 89d900e9cf..5fb6c4c4c6 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2957,6 +2957,37 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice 
*vdev)
 vdev->req_enabled = false;
 }
 
+static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
+{
+PCIDevice *pdev = &vdev->pdev;
+
+/* QEMU emulates all of MSI & MSIX */
+if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
+memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
+   MSIX_CAP_LENGTH);
+}
+
+if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
+memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
+   vdev->msi_cap_size);
+}
+
+if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
+vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+  vfio_intx_mmap_enable, vdev);
+pci_device_set_intx_routing_notifier(&vdev->pdev,
+ vfio_intx_routing_notifier);
+vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
+kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
+if (!vfio_intx_enable(vdev, errp)) {
+pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+return false;
+}
+}
+return true;
+}
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
 ERRP_GUARD();
@@ -3157,27 +3188,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 }
 }
 
-/* QEMU emulates all of MSI & MSIX */
-if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
-memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
-   MSIX_CAP_LENGTH);
-}
-
-if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
-memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
-   vdev->msi_cap_size);
-}
-
-if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
-vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
-  vfio_intx_mmap_enable, vdev);
-pci_device_set_intx_routing_notifier(&vdev->pdev,
- vfio_intx_routing_notifier);
-vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
-kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
-if (!vfio_intx_enable(vdev, errp)) {
-goto out_deregister;
-}
+if (!vfio_interrupt_setup(vdev, errp)) {
+goto out_teardown;
 }
 
 if (vdev->display != ON_OFF_AUTO_OFF) {
-- 
2.34.1




[PATCH v8 11/28] vfio-user: introduce vfio-user protocol specification

2025-02-19 Thread John Levon
From: Thanos Makatos 

This patch introduces the vfio-user protocol specification (formerly
known as VFIO-over-socket), which is designed to allow devices to be
emulated outside QEMU, in a separate process. vfio-user reuses the
existing VFIO defines, structs and concepts.

It has been earlier discussed as an RFC in:
"RFC: use VFIO over a UNIX domain socket to implement device offloading"

Signed-off-by: Thanos Makatos 
Signed-off-by: John Levon 
---
 MAINTAINERS|8 +-
 docs/devel/index-internals.rst |1 +
 docs/devel/vfio-user.rst   | 1522 
 3 files changed, 1530 insertions(+), 1 deletion(-)
 create mode 100644 docs/devel/vfio-user.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index 3848d37a38..3e7e6743cc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4148,12 +4148,18 @@ F: hw/remote/proxy-memory-listener.c
 F: include/hw/remote/proxy-memory-listener.h
 F: hw/remote/iohub.c
 F: include/hw/remote/iohub.h
-F: subprojects/libvfio-user
 F: hw/remote/vfio-user-obj.c
 F: include/hw/remote/vfio-user-obj.h
 F: hw/remote/iommu.c
 F: include/hw/remote/iommu.h
 
+VFIO-USER:
+M: John Levon 
+M: Thanos Makatos 
+S: Supported
+F: docs/devel/vfio-user.rst
+F: subprojects/libvfio-user
+
 EBPF:
 M: Jason Wang 
 R: Andrew Melnychenko 
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index bca597c658..0bc24f0e51 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -21,6 +21,7 @@ Details about QEMU's various subsystems including how to add 
features to them.
s390-dasd-ipl
tracing
vfio-iommufd
+   vfio-user
writing-monitor-commands
virtio-backends
crypto
diff --git a/docs/devel/vfio-user.rst b/docs/devel/vfio-user.rst
new file mode 100644
index 00..0d96477a68
--- /dev/null
+++ b/docs/devel/vfio-user.rst
@@ -0,0 +1,1522 @@
+.. include:: 
+
+vfio-user Protocol Specification
+
+
+--
+Version_ 0.9.1
+--
+
+.. contents:: Table of Contents
+
+Introduction
+
+vfio-user is a protocol that allows a device to be emulated in a separate
+process outside of a Virtual Machine Monitor (VMM). vfio-user devices consist
+of a generic VFIO device type, living inside the VMM, which we call the client,
+and the core device implementation, living outside the VMM, which we call the
+server.
+
+The vfio-user specification is partly based on the
+`Linux VFIO ioctl interface 
`_.
+
+VFIO is a mature and stable API, backed by an extensively used framework. The
+existing VFIO client implementation in QEMU (``qemu/hw/vfio/``) can be largely
+re-used, though there is nothing in this specification that requires that
+particular implementation. None of the VFIO kernel modules are required for
+supporting the protocol, on either the client or server side. Some source
+definitions in VFIO are re-used for vfio-user.
+
+The main idea is to allow a virtual device to function in a separate process in
+the same host over a UNIX domain socket. A UNIX domain socket (``AF_UNIX``) is
+chosen because file descriptors can be trivially sent over it, which in turn
+allows:
+
+* Sharing of client memory for DMA with the server.
+* Sharing of server memory with the client for fast MMIO.
+* Efficient sharing of eventfd's for triggering interrupts.
+
+Other socket types could be used which allow the server to run in a separate
+guest in the same host (``AF_VSOCK``) or remotely (``AF_INET``). Theoretically
+the underlying transport does not necessarily have to be a socket, however we 
do
+not examine such alternatives. In this protocol version we focus on using a 
UNIX
+domain socket and introduce basic support for the other two types of sockets
+without considering performance implications.
+
+While passing of file descriptors is desirable for performance reasons, support
+is not necessary for either the client or the server in order to implement the
+protocol. There is always an in-band, message-passing fall back mechanism.
+
+Overview
+
+
+VFIO is a framework that allows a physical device to be securely passed through
+to a user space process; the device-specific kernel driver does not drive the
+device at all.  Typically, the user space process is a VMM and the device is
+passed through to it in order to achieve high performance. VFIO provides an API
+and the required functionality in the kernel. QEMU has adopted VFIO to allow a
+guest to directly access physical devices, instead of emulating them in
+software.
+
+vfio-user reuses the core VFIO concepts defined in its API, but implements them
+as messages to be sent over a socket. It does not change the kernel-based VFIO
+in any way, in fact none of the VFIO kernel modules need to be loaded to use
+vfio-user. It is also possible for the client to concurrently use the current
+kernel-based VFIO for one device, 

[PATCH v8 18/28] vfio-user: implement VFIO_USER_REGION_READ/WRITE

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Also add support for posted writes on remote devices.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 120 ++
 hw/vfio-user/common.h |   1 +
 hw/vfio-user/pci.c|   5 ++
 hw/vfio-user/protocol.h   |  12 
 hw/vfio-user/trace-events |   1 +
 hw/vfio/helpers.c |   8 ++-
 hw/vfio/pci.c |   5 +-
 include/hw/vfio/vfio-common.h |   3 +-
 8 files changed, 150 insertions(+), 5 deletions(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 7b96d715cb..1a9033af5c 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -55,6 +55,8 @@ static void vfio_user_cb(void *opaque);
 
 static void vfio_user_request(void *opaque);
 static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg);
+static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+ VFIOUserFDs *fds);
 static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
 VFIOUserFDs *fds, int rsize);
 static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
@@ -626,6 +628,33 @@ static int vfio_user_send_queued(VFIOUserProxy *proxy, 
VFIOUserMsg *msg)
 return 0;
 }
 
+/*
+ * async send - msg can be queued, but will be freed when sent
+ */
+static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+ VFIOUserFDs *fds)
+{
+VFIOUserMsg *msg;
+int ret;
+
+if (!(hdr->flags & (VFIO_USER_NO_REPLY | VFIO_USER_REPLY))) {
+error_printf("vfio_user_send_async on sync message\n");
+return;
+}
+
+QEMU_LOCK_GUARD(&proxy->lock);
+
+msg = vfio_user_getmsg(proxy, hdr, fds);
+msg->id = hdr->id;
+msg->rsize = 0;
+msg->type = VFIO_MSG_ASYNC;
+
+ret = vfio_user_send_queued(proxy, msg);
+if (ret < 0) {
+vfio_user_recycle(proxy, msg);
+}
+}
+
 static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
 VFIOUserFDs *fds, int rsize)
 {
@@ -1139,9 +1168,84 @@ static int vfio_user_get_region_info(VFIOUserProxy 
*proxy,
 trace_vfio_user_get_region_info(msgp->index, msgp->flags, msgp->size);
 
 memcpy(info, &msgp->argsz, info->argsz);
+
+/* read-after-write hazard if guest can directly access region */
+if (info->flags & VFIO_REGION_INFO_FLAG_MMAP) {
+WITH_QEMU_LOCK_GUARD(&proxy->lock) {
+proxy->flags |= VFIO_PROXY_NO_POST;
+}
+}
+
 return 0;
 }
 
+static int vfio_user_region_read(VFIOUserProxy *proxy, uint8_t index,
+ off_t offset, uint32_t count, void *data)
+{
+g_autofree VFIOUserRegionRW *msgp = NULL;
+int size = sizeof(*msgp) + count;
+
+if (count > proxy->max_xfer_size) {
+return -EINVAL;
+}
+
+msgp = g_malloc0(size);
+vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0);
+msgp->offset = offset;
+msgp->region = index;
+msgp->count = count;
+trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count);
+
+vfio_user_send_wait(proxy, &msgp->hdr, NULL, size);
+if (msgp->hdr.flags & VFIO_USER_ERROR) {
+return -msgp->hdr.error_reply;
+} else if (msgp->count > count) {
+return -E2BIG;
+} else {
+memcpy(data, &msgp->data, msgp->count);
+}
+
+return msgp->count;
+}
+
+static int vfio_user_region_write(VFIOUserProxy *proxy, uint8_t index,
+  off_t offset, uint32_t count, void *data,
+  bool post)
+{
+VFIOUserRegionRW *msgp = NULL;
+int flags = post ? VFIO_USER_NO_REPLY : 0;
+int size = sizeof(*msgp) + count;
+int ret;
+
+if (count > proxy->max_xfer_size) {
+return -EINVAL;
+}
+
+msgp = g_malloc0(size);
+vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, flags);
+msgp->offset = offset;
+msgp->region = index;
+msgp->count = count;
+memcpy(&msgp->data, data, count);
+trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count);
+
+/* async send will free msg after it's sent */
+if (post && !(proxy->flags & VFIO_PROXY_NO_POST)) {
+vfio_user_send_async(proxy, &msgp->hdr, NULL);
+return count;
+}
+
+vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0);
+if (msgp->hdr.flags & VFIO_USER_ERROR) {
+ret = -msgp->hdr.error_reply;
+} else {
+ret = count;
+}
+
+g_free(msgp);
+return ret;
+}
+
 
 /*
  * Socket-based io_ops
@@ -1171,6 +1275,22 @@ static int vfio_user_io_get_region_info(VFIODevice 
*vbasedev,
 return 0;
 }
 
+static int vfio_user_io_region_read(VFIODevice *vbasedev, uint8_t index,
+off_t off, uint32_t size, void *data)
+{
+return 

[PATCH v8 10/28] vfio: add device IO ops vector

2025-02-19 Thread John Levon
From: Jagannathan Raman 

For vfio-user, device operations such as IRQ handling and region
read/writes are implemented in userspace over the control socket, not
ioctl() or read()/write() to the vfio kernel driver; add an ops vector
to generalize this, and implement vfio_dev_io_ioctl for interacting
with the kernel vfio driver.

The ops consistently use the "-errno" return style, as the vfio-user
implementations get their errors from response messages not from the
kernel; adjust the callers to handle this as necessary.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio/ap.c  |   2 +-
 hw/vfio/ccw.c |   2 +-
 hw/vfio/common.c  |  13 +--
 hw/vfio/helpers.c | 110 ++---
 hw/vfio/pci.c | 147 ++
 hw/vfio/platform.c|   2 +-
 include/hw/vfio/vfio-common.h |  27 ++-
 7 files changed, 227 insertions(+), 76 deletions(-)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 30b08ad375..1adce1ab40 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -228,7 +228,7 @@ static void vfio_ap_instance_init(Object *obj)
  * handle ram_block_discard_disable().
  */
 vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
- DEVICE(vapdev), true);
+ &vfio_dev_io_ioctl, DEVICE(vapdev), true);
 
 /* AP device is mdev type device */
 vbasedev->mdev = true;
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 22378d50bc..8c16648819 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -682,7 +682,7 @@ static void vfio_ccw_instance_init(Object *obj)
  * ram_block_discard_disable().
  */
 vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
- DEVICE(vcdev), true);
+ &vfio_dev_io_ioctl, DEVICE(vcdev), true);
 }
 
 #ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 1866b3d3c5..cc0c0f7fc7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -971,7 +971,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase 
*bcontainer)
 continue;
 }
 
-if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+if (vbasedev->io->device_feature(vbasedev, feature)) {
 warn_report("%s: Failed to stop DMA logging, err %d (%s)",
 vbasedev->name, -errno, strerror(errno));
 }
@@ -1074,10 +1074,9 @@ static bool 
vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
 continue;
 }
 
-ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+ret = vbasedev->io->device_feature(vbasedev, feature);
 if (ret) {
-ret = -errno;
-error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
+error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
  vbasedev->name);
 goto out;
 }
@@ -1145,6 +1144,7 @@ static int vfio_device_dma_logging_report(VFIODevice 
*vbasedev, hwaddr iova,
 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
 struct vfio_device_feature_dma_logging_report *report =
 (struct vfio_device_feature_dma_logging_report *)feature->data;
+int ret;
 
 report->iova = iova;
 report->length = size;
@@ -1155,8 +1155,9 @@ static int vfio_device_dma_logging_report(VFIODevice 
*vbasedev, hwaddr iova,
 feature->flags = VFIO_DEVICE_FEATURE_GET |
  VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
 
-if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
-return -errno;
+ret = vbasedev->io->device_feature(vbasedev, feature);
+if (ret) {
+return -ret;
 }
 
 return 0;
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 94bbc5747c..bef1540295 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -44,7 +44,7 @@ void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
 .count = 0,
 };
 
-ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+vbasedev->io->set_irqs(vbasedev, &irq_set);
 }
 
 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
@@ -57,7 +57,7 @@ void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int 
index)
 .count = 1,
 };
 
-ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+vbasedev->io->set_irqs(vbasedev, &irq_set);
 }
 
 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
@@ -70,7 +70,7 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int 
index)
 .count = 1,
 };
 
-ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+vbasedev->io->set_irqs(vbasedev, &irq_set);
 }
 
 static inline const char *action_to_str(int action)
@@ -117,6 +117,7 @@ bool vfio_set_irq_signaling(VFIODevice *vbasedev, int 
index, int subindex,
 int argsz;
 const char *name;
 int32_t

[PATCH v8 15/28] vfio-user: implement message send infrastructure

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Add plumbing for sending vfio-user messages on the control socket.
Add initial version negotation on connection.

Originally-by: John Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 495 ++
 hw/vfio-user/common.h |   9 +
 hw/vfio-user/pci.c|  18 +-
 hw/vfio-user/protocol.h   |  62 +
 hw/vfio-user/trace-events |   2 +
 5 files changed, 584 insertions(+), 2 deletions(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 2c4ee14ede..634f95f2e3 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -20,15 +20,21 @@
 #include "io/channel-socket.h"
 #include "io/channel-util.h"
 #include "qapi/error.h"
+#include "qobject/qdict.h"
+#include "qobject/qjson.h"
+#include "qobject/qstring.h"
+#include "qobject/qnum.h"
 #include "qemu/error-report.h"
 #include "qemu/lockable.h"
 #include "qemu/main-loop.h"
 #include "qemu/sockets.h"
 #include "system/iothread.h"
 
+static int wait_time = 5000;   /* wait up to 5 sec for busy servers */
 static IOThread *vfio_user_iothread;
 
 static void vfio_user_shutdown(VFIOUserProxy *proxy);
+static int vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg);
 static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
  VFIOUserFDs *fds);
 static VFIOUserFDs *vfio_user_getfds(int numfds);
@@ -36,9 +42,16 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, 
VFIOUserMsg *msg);
 
 static void vfio_user_recv(void *opaque);
 static int vfio_user_recv_one(VFIOUserProxy *proxy);
+static void vfio_user_send(void *opaque);
+static int vfio_user_send_one(VFIOUserProxy *proxy);
 static void vfio_user_cb(void *opaque);
 
 static void vfio_user_request(void *opaque);
+static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg);
+static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+VFIOUserFDs *fds, int rsize);
+static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
+  uint32_t size, uint32_t flags);
 
 static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err)
 {
@@ -57,6 +70,35 @@ static void vfio_user_shutdown(VFIOUserProxy *proxy)
proxy->ctx, NULL, NULL);
 }
 
+static int vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg)
+{
+VFIOUserFDs *fds =  msg->fds;
+struct iovec iov = {
+.iov_base = msg->hdr,
+.iov_len = msg->hdr->size,
+};
+size_t numfds = 0;
+int ret, *fdp = NULL;
+Error *local_err = NULL;
+
+if (fds != NULL && fds->send_fds != 0) {
+numfds = fds->send_fds;
+fdp = fds->fds;
+}
+
+ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, 0,
+  &local_err);
+
+if (ret == -1) {
+vfio_user_set_error(msg->hdr, EIO);
+vfio_user_shutdown(proxy);
+error_report_err(local_err);
+}
+trace_vfio_user_send_write(msg->hdr->id, ret);
+
+return ret;
+}
+
 static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
  VFIOUserFDs *fds)
 {
@@ -97,6 +139,7 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, 
VFIOUserMsg *msg)
 msg->hdr = NULL;
 msg->fds = NULL;
 msg->complete = false;
+msg->pending = false;
 QTAILQ_INSERT_HEAD(&proxy->free, msg, next);
 }
 
@@ -391,6 +434,54 @@ err:
 return -1;
 }
 
+/*
+ * Send messages from outgoing queue when the socket buffer has space.
+ * If we deplete 'outgoing', remove ourselves from the poll list.
+ */
+static void vfio_user_send(void *opaque)
+{
+VFIOUserProxy *proxy = opaque;
+
+QEMU_LOCK_GUARD(&proxy->lock);
+
+if (proxy->state == VFIO_PROXY_CONNECTED) {
+while (!QTAILQ_EMPTY(&proxy->outgoing)) {
+if (vfio_user_send_one(proxy) < 0) {
+return;
+}
+}
+qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx,
+   vfio_user_recv, NULL, NULL, proxy);
+}
+}
+
+/*
+ * Send a single message.
+ *
+ * Sent async messages are freed, others are moved to pending queue.
+ */
+static int vfio_user_send_one(VFIOUserProxy *proxy)
+{
+VFIOUserMsg *msg;
+int ret;
+
+msg = QTAILQ_FIRST(&proxy->outgoing);
+ret = vfio_user_send_qio(proxy, msg);
+if (ret < 0) {
+return ret;
+}
+
+QTAILQ_REMOVE(&proxy->outgoing, msg, next);
+if (msg->type == VFIO_MSG_ASYNC) {
+vfio_user_recycle(proxy, msg);
+} else {
+QTAILQ_INSERT_TAIL(&proxy->pending, msg, next);
+msg->pending = true;
+}
+
+return 0;
+}
+
 static void vfio_user_cb(void *opaque)
 {
 VFIOUserProxy *proxy = opaque;
@@ -452,6 +543,119 @@ static void vfio_user_request(void *opaque)
 }
 }
 
+/*
+ * Messages are que

[PATCH v8 16/28] vfio-user: implement VFIO_USER_DEVICE_GET_INFO

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Add support for getting basic device information.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 34 ++
 hw/vfio-user/common.h |  1 +
 hw/vfio-user/container.c  | 10 +-
 hw/vfio-user/protocol.h   | 12 
 hw/vfio-user/trace-events |  1 +
 5 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 634f95f2e3..55d1da4e51 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -30,6 +30,13 @@
 #include "qemu/sockets.h"
 #include "system/iothread.h"
 
+/*
+ * These are to defend against a malign server trying
+ * to force us to run out of memory.
+ */
+#define VFIO_USER_MAX_REGIONS   100
+#define VFIO_USER_MAX_IRQS  50
+
 static int wait_time = 5000;   /* wait up to 5 sec for busy servers */
 static IOThread *vfio_user_iothread;
 
@@ -1072,3 +1079,30 @@ bool vfio_user_validate_version(VFIOUserProxy *proxy, 
Error **errp)
 trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities);
 return true;
 }
+
+int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info)
+{
+VFIOUserDeviceInfo msg;
+uint32_t argsz = sizeof(msg) - sizeof(msg.hdr);
+
+memset(&msg, 0, sizeof(msg));
+vfio_user_request_msg(&msg.hdr, VFIO_USER_DEVICE_GET_INFO, sizeof(msg), 0);
+msg.argsz = argsz;
+
+vfio_user_send_wait(proxy, &msg.hdr, NULL, 0);
+if (msg.hdr.flags & VFIO_USER_ERROR) {
+return -msg.hdr.error_reply;
+}
+trace_vfio_user_get_info(msg.num_regions, msg.num_irqs);
+
+memcpy(info, &msg.argsz, argsz);
+
+/* defend against a malicious server */
+if (info->num_regions > VFIO_USER_MAX_REGIONS ||
+info->num_irqs > VFIO_USER_MAX_IRQS) {
+error_printf("%s: invalid reply\n", __func__);
+return -EINVAL;
+}
+
+return 0;
+}
diff --git a/hw/vfio-user/common.h b/hw/vfio-user/common.h
index 9f4243a67d..11a80d4e7c 100644
--- a/hw/vfio-user/common.h
+++ b/hw/vfio-user/common.h
@@ -93,5 +93,6 @@ void vfio_user_set_handler(VFIODevice *vbasedev,
void (*handler)(void *opaque, VFIOUserMsg *msg),
void *reqarg);
 bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp);
+int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info);
 
 #endif /* VFIO_USER_COMMON_H */
diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c
index 7b1c202517..c079d6f89b 100644
--- a/hw/vfio-user/container.c
+++ b/hw/vfio-user/container.c
@@ -16,6 +16,7 @@
 #include "exec/ram_addr.h"
 #include "hw/hw.h"
 #include "hw/vfio/pci.h"
+#include "hw/vfio-user/common.h"
 #include "hw/vfio-user/container.h"
 #include "qemu/error-report.h"
 #include "qemu/range.h"
@@ -152,7 +153,14 @@ static void 
vfio_disconnect_user_container(VFIOUserContainer *container)
 static bool vfio_user_get_device(VFIOUserContainer *container,
  VFIODevice *vbasedev, Error **errp)
 {
-struct vfio_device_info info = { 0 };
+struct vfio_device_info info = { .argsz = sizeof(info) };
+int ret;
+
+ret = vfio_user_get_info(vbasedev->proxy, &info);
+if (ret) {
+error_setg_errno(errp, -ret, "get info failure");
+return ret;
+}
 
 vbasedev->fd = -1;
 
diff --git a/hw/vfio-user/protocol.h b/hw/vfio-user/protocol.h
index 5de5b2030c..5f9ef1768f 100644
--- a/hw/vfio-user/protocol.h
+++ b/hw/vfio-user/protocol.h
@@ -113,4 +113,16 @@ typedef struct {
  */
 #define VFIO_USER_DEF_MAX_BITMAP (256 * 1024 * 1024)
 
+/*
+ * VFIO_USER_DEVICE_GET_INFO
+ * imported from struct vfio_device_info
+ */
+typedef struct {
+VFIOUserHdr hdr;
+uint32_t argsz;
+uint32_t flags;
+uint32_t num_regions;
+uint32_t num_irqs;
+} VFIOUserDeviceInfo;
+
 #endif /* VFIO_USER_PROTOCOL_H */
diff --git a/hw/vfio-user/trace-events b/hw/vfio-user/trace-events
index 7a3645024f..6b06a3ed82 100644
--- a/hw/vfio-user/trace-events
+++ b/hw/vfio-user/trace-events
@@ -6,3 +6,4 @@ vfio_user_recv_read(uint16_t id, int read) " id 0x%x read 0x%x"
 vfio_user_recv_request(uint16_t cmd) " command 0x%x"
 vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x"
 vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d 
minor %d caps: %s"
+vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d"
-- 
2.34.1




[PATCH v8 21/28] vfio-user: forward MSI-X PBA BAR accesses to server

2025-02-19 Thread John Levon
From: Jagannathan Raman 

For vfio-user, the server holds the pending IRQ state; set up an I/O
region for the MSI-X PBA so we can ask the server for this state on a
PBA read.

If VFIO_IRQ_INFO_MASKABLE is set for VFIO_PCI_MSIX_IRQ_INDEX, record
this in ->can_mask_msix, and use it to individually mask MSI-X
interrupts as needed.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/pci.c| 63 +
 hw/vfio/helpers.c | 26 +++
 hw/vfio/pci.c | 86 +--
 hw/vfio/pci.h |  2 +
 include/hw/vfio/vfio-common.h |  2 +
 5 files changed, 156 insertions(+), 23 deletions(-)

diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index b8c1cc34c2..cf1e642399 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -40,6 +40,62 @@ struct VFIOUserPCIDevice {
 bool no_post;   /* all regions write are sync */
 };
 
+/*
+ * The server maintains the device's pending interrupts,
+ * via its MSIX table and PBA, so we treat these accesses
+ * like PCI config space and forward them.
+ */
+static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr,
+   unsigned size)
+{
+VFIOPCIDevice *vdev = opaque;
+VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region;
+uint64_t data;
+
+/* server copy is what matters */
+data = vfio_region_read(region, addr + vdev->msix->pba_offset, size);
+return data;
+}
+
+static void vfio_user_pba_write(void *opaque, hwaddr addr,
+  uint64_t data, unsigned size)
+{
+/* dropped */
+}
+
+static const MemoryRegionOps vfio_user_pba_ops = {
+.read = vfio_user_pba_read,
+.write = vfio_user_pba_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_user_msix_setup(VFIOPCIDevice *vdev)
+{
+MemoryRegion *vfio_reg, *msix_reg, *pba_reg;
+
+pba_reg = g_new0(MemoryRegion, 1);
+vdev->msix->pba_region = pba_reg;
+
+vfio_reg = vdev->bars[vdev->msix->pba_bar].mr;
+msix_reg = &vdev->pdev.msix_pba_mmio;
+memory_region_init_io(pba_reg, OBJECT(vdev), &vfio_user_pba_ops, vdev,
+  "VFIO MSIX PBA", int128_get64(msix_reg->size));
+memory_region_add_subregion_overlap(vfio_reg, vdev->msix->pba_offset,
+pba_reg, 1);
+}
+
+static void vfio_user_msix_teardown(VFIOPCIDevice *vdev)
+{
+MemoryRegion *mr, *sub;
+
+mr = vdev->bars[vdev->msix->pba_bar].mr;
+sub = vdev->msix->pba_region;
+memory_region_del_subregion(mr, sub);
+
+g_free(vdev->msix->pba_region);
+vdev->msix->pba_region = NULL;
+}
+
 /*
  * Incoming request message callback.
  *
@@ -154,6 +210,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error 
**errp)
 if (!vfio_add_capabilities(vdev, errp)) {
 goto out_teardown;
 }
+if (vdev->msix != NULL) {
+vfio_user_msix_setup(vdev);
+}
 
 if (!vfio_interrupt_setup(vdev, errp)) {
 goto out_teardown;
@@ -206,6 +265,10 @@ static void vfio_user_instance_finalize(Object *obj)
 g_free(vdev->emulated_config_bits);
 g_free(vdev->rom);
 
+if (vdev->msix != NULL) {
+vfio_user_msix_teardown(vdev);
+}
+
 vfio_pci_put_device(vdev);
 
 if (vbasedev->proxy != NULL) {
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 6bc7600ab1..2ab30fa91b 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -73,6 +73,32 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int 
index)
 vbasedev->io->set_irqs(vbasedev, &irq_set);
 }
 
+void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq)
+{
+struct vfio_irq_set irq_set = {
+.argsz = sizeof(irq_set),
+.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+.index = index,
+.start = irq,
+.count = 1,
+};
+
+vbasedev->io->set_irqs(vbasedev, &irq_set);
+}
+
+void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq)
+{
+struct vfio_irq_set irq_set = {
+.argsz = sizeof(irq_set),
+.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+.index = index,
+.start = irq,
+.count = 1,
+};
+
+vbasedev->io->set_irqs(vbasedev, &irq_set);
+}
+
 static inline const char *action_to_str(int action)
 {
 switch (action) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f85215417d..b9c7e13053 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -520,11 +520,30 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector 
*vector, MSIMessage msg,
 kvm_irqchip_commit_routes(kvm_state);
 }
 
+static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector,
+   unsigned int nr)
+{
+Error *err = NULL;
+int32_t fd;
+
+if (vector->virq >= 0) {
+fd = event_notifier_get_fd(&vector->kvm_interrupt);
+} else {

[PATCH v8 12/28] vfio-user: add vfio-user class and container

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Introduce basic plumbing for vfio-user behind a new
--enable-vfio-user-client option.

We introduce VFIOUserContainer in hw/vfio-user/container.c, which is a
container type for the "IOMMU" type "vfio-iommu-user", and share some
common container code from hw/vfio/container.c.

Add hw/vfio-user/pci.c for instantiating VFIOUserPCIDevice objects,
sharing some common code from hw/vfio/pci.c.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 MAINTAINERS   |   2 +
 hw/meson.build|   1 +
 hw/vfio-user/container.c  | 222 ++
 hw/vfio-user/container.h  |  23 +++
 hw/vfio-user/meson.build  |   9 ++
 hw/vfio-user/pci.c| 154 ++
 hw/vfio/container.c   |   2 +-
 hw/vfio/pci.c |  12 +-
 hw/vfio/pci.h |   7 +
 include/hw/vfio/vfio-common.h |   3 +
 include/hw/vfio/vfio-container-base.h |   1 +
 meson_options.txt |   2 +
 scripts/meson-buildoptions.sh |   4 +
 13 files changed, 435 insertions(+), 7 deletions(-)
 create mode 100644 hw/vfio-user/container.c
 create mode 100644 hw/vfio-user/container.h
 create mode 100644 hw/vfio-user/meson.build
 create mode 100644 hw/vfio-user/pci.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 3e7e6743cc..c403742c27 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4158,6 +4158,8 @@ M: John Levon 
 M: Thanos Makatos 
 S: Supported
 F: docs/devel/vfio-user.rst
+F: hw/vfio-user/*
+F: include/hw/vfio-user/*
 F: subprojects/libvfio-user
 
 EBPF:
diff --git a/hw/meson.build b/hw/meson.build
index b827c82c5d..91e8d2bdc0 100644
--- a/hw/meson.build
+++ b/hw/meson.build
@@ -38,6 +38,7 @@ subdir('tpm')
 subdir('ufs')
 subdir('usb')
 subdir('vfio')
+subdir('vfio-user')
 subdir('virtio')
 subdir('watchdog')
 subdir('xen')
diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c
new file mode 100644
index 00..7b1c202517
--- /dev/null
+++ b/hw/vfio-user/container.c
@@ -0,0 +1,222 @@
+/*
+ * Container for vfio-user IOMMU type: rather than communicating with the 
kernel
+ * vfio driver, we communicate over a socket to a server using the vfio-user
+ * protocol.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include 
+#include 
+#include "qemu/osdep.h"
+
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "exec/ram_addr.h"
+#include "hw/hw.h"
+#include "hw/vfio/pci.h"
+#include "hw/vfio-user/container.h"
+#include "qemu/error-report.h"
+#include "qemu/range.h"
+#include "qapi/error.h"
+#include "trace.h"
+
+static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer,
+   hwaddr iova, ram_addr_t size,
+   IOMMUTLBEntry *iotlb, int flags)
+{
+return -ENOTSUP;
+}
+
+static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly,
+ MemoryRegion *mrp)
+{
+return -ENOTSUP;
+}
+
+static int
+vfio_user_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
+bool start, Error **errp)
+{
+error_setg_errno(errp, ENOTSUP, "Not supported");
+return -ENOTSUP;
+}
+
+static int vfio_user_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
+ VFIOBitmap *vbmap, hwaddr iova,
+ hwaddr size, Error **errp)
+{
+error_setg_errno(errp, ENOTSUP, "Not supported");
+return -ENOTSUP;
+}
+
+static bool vfio_user_setup(VFIOContainerBase *bcontainer, Error **errp)
+{
+error_setg_errno(errp, ENOTSUP, "Not supported");
+return -ENOTSUP;
+}
+
+static VFIOUserContainer *vfio_create_user_container(Error **errp)
+{
+VFIOUserContainer *container;
+
+container = VFIO_IOMMU_USER(object_new(TYPE_VFIO_IOMMU_USER));
+return container;
+}
+
+/*
+ * Try to mirror vfio_connect_container() as much as possible.
+ */
+static VFIOUserContainer *
+vfio_connect_user_container(AddressSpace *as, Error **errp)
+{
+VFIOContainerBase *bcontainer;
+VFIOUserContainer *container;
+VFIOAddressSpace *space;
+VFIOIOMMUClass *vioc;
+
+space = vfio_get_address_space(as);
+
+container = vfio_create_user_container(errp);
+if (!container) {
+goto put_space_exit;
+}
+
+bcontainer = &container->bcontainer;
+
+if (!vfio_cpr_register_container(bcontainer, errp)) {
+goto free_container_exit;
+}
+
+vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
+assert(vioc->setup);
+
+if (!vioc->setup(bcontainer, errp)) {
+goto unregister_container_exit;
+}
+
+vfio_address_space_insert(space, bcontainer);
+
+b

[PATCH v8 19/28] vfio-user: set up PCI in vfio_user_pci_realize()

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Re-use PCI setup functions from hw/vfio/pci.c to realize the vfio-user
PCI device.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/pci.c | 42 ++
 hw/vfio/pci.c  | 18 +-
 hw/vfio/pci.h  |  9 +
 3 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index 26d2960985..b8c1cc34c2 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -75,6 +75,7 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error 
**errp)
 AddressSpace *as;
 SocketAddress addr;
 VFIOUserProxy *proxy;
+int ret;
 
 /*
  * TODO: make option parser understand SocketAddress
@@ -127,8 +128,45 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error 
**errp)
 goto error;
 }
 
+if (!vfio_populate_device(vdev, errp)) {
+goto error;
+}
+
+/* Get a copy of config space */
+ret = vbasedev->io->region_read(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, 0,
+   MIN(pci_config_size(pdev), vdev->config_size),
+   pdev->config);
+if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
+error_setg_errno(errp, -ret, "failed to read device config space");
+goto error;
+}
+
+if (!vfio_pci_config_setup(vdev, errp)) {
+goto error;
+}
+
+/*
+ * vfio_pci_config_setup will have registered the device's BARs
+ * and setup any MSIX BARs, so errors after it succeeds must
+ * use out_teardown
+ */
+
+if (!vfio_add_capabilities(vdev, errp)) {
+goto out_teardown;
+}
+
+if (!vfio_interrupt_setup(vdev, errp)) {
+goto out_teardown;
+}
+
+vfio_register_err_notifier(vdev);
+vfio_register_req_notifier(vdev);
+
 return;
 
+out_teardown:
+vfio_teardown_msi(vdev);
+vfio_bars_exit(vdev);
 error:
 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 }
@@ -164,6 +202,10 @@ static void vfio_user_instance_finalize(Object *obj)
 VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
 VFIODevice *vbasedev = &vdev->vbasedev;
 
+vfio_bars_finalize(vdev);
+g_free(vdev->emulated_config_bits);
+g_free(vdev->rom);
+
 vfio_pci_put_device(vdev);
 
 if (vbasedev->proxy != NULL) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 168e331607..f85215417d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1730,7 +1730,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, 
Error **errp)
 return true;
 }
 
-static void vfio_teardown_msi(VFIOPCIDevice *vdev)
+void vfio_teardown_msi(VFIOPCIDevice *vdev)
 {
 msi_uninit(&vdev->pdev);
 
@@ -1831,7 +1831,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev)
 }
 }
 
-static void vfio_bars_exit(VFIOPCIDevice *vdev)
+void vfio_bars_exit(VFIOPCIDevice *vdev)
 {
 int i;
 
@@ -1851,7 +1851,7 @@ static void vfio_bars_exit(VFIOPCIDevice *vdev)
 }
 }
 
-static void vfio_bars_finalize(VFIOPCIDevice *vdev)
+void vfio_bars_finalize(VFIOPCIDevice *vdev)
 {
 int i;
 
@@ -2419,7 +2419,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
 return;
 }
 
-static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
 {
 PCIDevice *pdev = &vdev->pdev;
 
@@ -2768,7 +2768,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
 return true;
 }
 
-static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
 {
 VFIODevice *vbasedev = &vdev->vbasedev;
 struct vfio_region_info *reg_info = NULL;
@@ -2886,7 +2886,7 @@ static void vfio_err_notifier_handler(void *opaque)
  * and continue after disabling error recovery support for the
  * device.
  */
-static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
+void vfio_register_err_notifier(VFIOPCIDevice *vdev)
 {
 Error *err = NULL;
 int32_t fd;
@@ -2945,7 +2945,7 @@ static void vfio_req_notifier_handler(void *opaque)
 }
 }
 
-static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
+void vfio_register_req_notifier(VFIOPCIDevice *vdev)
 {
 struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
   .index = VFIO_PCI_REQ_IRQ_INDEX };
@@ -3001,7 +3001,7 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice 
*vdev)
 vdev->req_enabled = false;
 }
 
-static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
 {
 PCIDevice *pdev = &vdev->pdev;
 VFIODevice *vbasedev = &vdev->vbasedev;
@@ -3094,7 +3094,7 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, 
Error **errp)
 return true;
 }
 
-static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
+bool vfio_interrupt_setup(V

[PATCH v8 20/28] vfio-user: implement VFIO_USER_DEVICE_GET/SET_IRQ*

2025-02-19 Thread John Levon
From: Jagannathan Raman 

IRQ setup uses the same semantics as the traditional vfio path, but we
need to share the corresponding file descriptors with the server as
necessary.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 140 ++
 hw/vfio-user/protocol.h   |  25 +++
 hw/vfio-user/trace-events |   2 +
 3 files changed, 167 insertions(+)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 1a9033af5c..182ef5ab8f 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -1179,6 +1179,122 @@ static int vfio_user_get_region_info(VFIOUserProxy 
*proxy,
 return 0;
 }
 
+static int vfio_user_get_irq_info(VFIOUserProxy *proxy,
+  struct vfio_irq_info *info)
+{
+VFIOUserIRQInfo msg;
+
+memset(&msg, 0, sizeof(msg));
+vfio_user_request_msg(&msg.hdr, VFIO_USER_DEVICE_GET_IRQ_INFO,
+  sizeof(msg), 0);
+msg.argsz = info->argsz;
+msg.index = info->index;
+
+vfio_user_send_wait(proxy, &msg.hdr, NULL, 0);
+if (msg.hdr.flags & VFIO_USER_ERROR) {
+return -msg.hdr.error_reply;
+}
+trace_vfio_user_get_irq_info(msg.index, msg.flags, msg.count);
+
+memcpy(info, &msg.argsz, sizeof(*info));
+return 0;
+}
+
+static int irq_howmany(int *fdp, uint32_t cur, uint32_t max)
+{
+int n = 0;
+
+if (fdp[cur] != -1) {
+do {
+n++;
+} while (n < max && fdp[cur + n] != -1);
+} else {
+do {
+n++;
+} while (n < max && fdp[cur + n] == -1);
+}
+
+return n;
+}
+
+static int vfio_user_set_irqs(VFIOUserProxy *proxy, struct vfio_irq_set *irq)
+{
+g_autofree VFIOUserIRQSet *msgp = NULL;
+uint32_t size, nfds, send_fds, sent_fds, max;
+
+if (irq->argsz < sizeof(*irq)) {
+error_printf("vfio_user_set_irqs argsz too small\n");
+return -EINVAL;
+}
+
+/*
+ * Handle simple case
+ */
+if ((irq->flags & VFIO_IRQ_SET_DATA_EVENTFD) == 0) {
+size = sizeof(VFIOUserHdr) + irq->argsz;
+msgp = g_malloc0(size);
+
+vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_SET_IRQS, size, 0);
+msgp->argsz = irq->argsz;
+msgp->flags = irq->flags;
+msgp->index = irq->index;
+msgp->start = irq->start;
+msgp->count = irq->count;
+trace_vfio_user_set_irqs(msgp->index, msgp->start, msgp->count,
+ msgp->flags);
+
+vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0);
+if (msgp->hdr.flags & VFIO_USER_ERROR) {
+return -msgp->hdr.error_reply;
+}
+
+return 0;
+}
+
+/*
+ * Calculate the number of FDs to send
+ * and adjust argsz
+ */
+nfds = (irq->argsz - sizeof(*irq)) / sizeof(int);
+irq->argsz = sizeof(*irq);
+msgp = g_malloc0(sizeof(*msgp));
+/*
+ * Send in chunks if over max_send_fds
+ */
+for (sent_fds = 0; nfds > sent_fds; sent_fds += send_fds) {
+VFIOUserFDs *arg_fds, loop_fds;
+
+/* must send all valid FDs or all invalid FDs in single msg */
+max = nfds - sent_fds;
+if (max > proxy->max_send_fds) {
+max = proxy->max_send_fds;
+}
+send_fds = irq_howmany((int *)irq->data, sent_fds, max);
+
+vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_SET_IRQS,
+  sizeof(*msgp), 0);
+msgp->argsz = irq->argsz;
+msgp->flags = irq->flags;
+msgp->index = irq->index;
+msgp->start = irq->start + sent_fds;
+msgp->count = send_fds;
+trace_vfio_user_set_irqs(msgp->index, msgp->start, msgp->count,
+ msgp->flags);
+
+loop_fds.send_fds = send_fds;
+loop_fds.recv_fds = 0;
+loop_fds.fds = (int *)irq->data + sent_fds;
+arg_fds = loop_fds.fds[0] != -1 ? &loop_fds : NULL;
+
+vfio_user_send_wait(proxy, &msgp->hdr, arg_fds, 0);
+if (msgp->hdr.flags & VFIO_USER_ERROR) {
+return -msgp->hdr.error_reply;
+}
+}
+
+return 0;
+}
+
 static int vfio_user_region_read(VFIOUserProxy *proxy, uint8_t index,
  off_t offset, uint32_t count, void *data)
 {
@@ -1275,6 +1391,28 @@ static int vfio_user_io_get_region_info(VFIODevice 
*vbasedev,
 return 0;
 }
 
+static int vfio_user_io_get_irq_info(VFIODevice *vbasedev,
+ struct vfio_irq_info *irq)
+{
+int ret;
+
+ret = vfio_user_get_irq_info(vbasedev->proxy, irq);
+if (ret) {
+return ret;
+}
+
+if (irq->index >= vbasedev->num_irqs) {
+return -EINVAL;
+}
+return 0;
+}
+
+static int vfio_user_io_set_irqs(VFIODevice *vbasedev,
+ struct vfio_irq_set *irqs)
+{
+return vfio_user_set_irqs(vbasedev->proxy, irqs);
+

[PATCH v8 24/28] vfio-user: implement VFIO_USER_DMA_MAP/UNMAP

2025-02-19 Thread John Levon
From: John Levon 

When the vfio-user container gets mapping updates, share them with the
vfio-user server by sending a message; this can include the region fd,
allowing the server to directly mmap() the region as needed.

For performance, we only wait for the message responses when we're done
with a series of updates via the listener_commit() callback.

Originally-by: John Johnson 
Signed-off-by: Jagannathan Raman 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c |  89 +++
 hw/vfio-user/common.h |   9 
 hw/vfio-user/container.c  | 107 +-
 hw/vfio-user/protocol.h   |  32 
 hw/vfio-user/trace-events |   4 ++
 5 files changed, 229 insertions(+), 12 deletions(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 160a1f0536..b78b9e57e8 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -44,7 +44,6 @@ static void vfio_user_shutdown(VFIOUserProxy *proxy);
 static int vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg);
 static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
  VFIOUserFDs *fds);
-static VFIOUserFDs *vfio_user_getfds(int numfds);
 static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg);
 
 static void vfio_user_recv(void *opaque);
@@ -57,10 +56,6 @@ static void vfio_user_request(void *opaque);
 static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg);
 static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
  VFIOUserFDs *fds);
-static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
-VFIOUserFDs *fds, int rsize);
-static void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
-  uint32_t size, uint32_t flags);
 
 static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err)
 {
@@ -152,7 +147,7 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, 
VFIOUserMsg *msg)
 QTAILQ_INSERT_HEAD(&proxy->free, msg, next);
 }
 
-static VFIOUserFDs *vfio_user_getfds(int numfds)
+VFIOUserFDs *vfio_user_getfds(int numfds)
 {
 VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int)));
 
@@ -655,8 +650,38 @@ static void vfio_user_send_async(VFIOUserProxy *proxy, 
VFIOUserHdr *hdr,
 }
 }
 
-static void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
-VFIOUserFDs *fds, int rsize)
+/*
+ * nowait send - vfio_wait_reqs() can wait for it later
+ */
+void vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+   VFIOUserFDs *fds, int rsize)
+{
+VFIOUserMsg *msg;
+int ret;
+
+if (hdr->flags & VFIO_USER_NO_REPLY) {
+error_printf("vfio_user_send_nowait on async message\n");
+return;
+}
+
+QEMU_LOCK_GUARD(&proxy->lock);
+
+msg = vfio_user_getmsg(proxy, hdr, fds);
+msg->id = hdr->id;
+msg->rsize = rsize ? rsize : hdr->size;
+msg->type = VFIO_MSG_NOWAIT;
+
+ret = vfio_user_send_queued(proxy, msg);
+if (ret < 0) {
+vfio_user_recycle(proxy, msg);
+return;
+}
+
+proxy->last_nowait = msg;
+}
+
+void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+ VFIOUserFDs *fds, int rsize)
 {
 VFIOUserMsg *msg;
 int ret;
@@ -693,6 +718,50 @@ static void vfio_user_send_wait(VFIOUserProxy *proxy, 
VFIOUserHdr *hdr,
 qemu_mutex_unlock(&proxy->lock);
 }
 
+void vfio_user_wait_reqs(VFIOUserProxy *proxy)
+{
+VFIOUserMsg *msg;
+
+/*
+ * Any DMA map/unmap requests sent in the middle
+ * of a memory region transaction were sent nowait.
+ * Wait for them here.
+ */
+qemu_mutex_lock(&proxy->lock);
+if (proxy->last_nowait != NULL) {
+/*
+ * Change type to WAIT to wait for reply
+ */
+msg = proxy->last_nowait;
+msg->type = VFIO_MSG_WAIT;
+proxy->last_nowait = NULL;
+while (!msg->complete) {
+if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
+VFIOUserMsgQ *list;
+
+list = msg->pending ? &proxy->pending : &proxy->outgoing;
+QTAILQ_REMOVE(list, msg, next);
+error_printf("vfio_wait_reqs - timed out\n");
+break;
+}
+}
+
+if (msg->hdr->flags & VFIO_USER_ERROR) {
+error_printf("vfio_user_wait_reqs - error reply on async ");
+error_printf("request: command %x error %s\n", msg->hdr->command,
+ strerror(msg->hdr->error_reply));
+}
+
+/*
+ * Change type back to NOWAIT to free
+ */
+msg->type = VFIO_MSG_NOWAIT;
+vfio_user_recycle(proxy, msg);
+}
+
+qemu_mutex_unlock(&proxy->lock);
+}
+
 static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets =

[PATCH v8 14/28] vfio-user: implement message receive infrastructure

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Add the basic implementation for receiving vfio-user messages from the
control socket.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 412 +-
 hw/vfio-user/common.h |  10 +
 hw/vfio-user/pci.c|  11 +
 hw/vfio-user/protocol.h   |  54 +
 hw/vfio-user/trace-events |   6 +
 hw/vfio-user/trace.h  |   1 +
 meson.build   |   1 +
 7 files changed, 493 insertions(+), 2 deletions(-)
 create mode 100644 hw/vfio-user/protocol.h
 create mode 100644 hw/vfio-user/trace-events
 create mode 100644 hw/vfio-user/trace.h

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index e829abccec..2c4ee14ede 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -14,21 +14,37 @@
 
 #include "hw/hw.h"
 #include "hw/vfio/vfio-common.h"
+#include "hw/vfio-user/common.h"
+#include "hw/vfio-user/trace.h"
 #include "io/channel.h"
 #include "io/channel-socket.h"
 #include "io/channel-util.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/lockable.h"
+#include "qemu/main-loop.h"
 #include "qemu/sockets.h"
 #include "system/iothread.h"
 
-#include "common.h"
-
 static IOThread *vfio_user_iothread;
 
 static void vfio_user_shutdown(VFIOUserProxy *proxy);
+static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+ VFIOUserFDs *fds);
+static VFIOUserFDs *vfio_user_getfds(int numfds);
+static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg);
+
+static void vfio_user_recv(void *opaque);
+static int vfio_user_recv_one(VFIOUserProxy *proxy);
+static void vfio_user_cb(void *opaque);
+
+static void vfio_user_request(void *opaque);
 
+static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err)
+{
+hdr->flags |= VFIO_USER_ERROR;
+hdr->error_reply = err;
+}
 
 /*
  * Functions called by main, CPU, or iothread threads
@@ -41,10 +57,340 @@ static void vfio_user_shutdown(VFIOUserProxy *proxy)
proxy->ctx, NULL, NULL);
 }
 
+static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
+ VFIOUserFDs *fds)
+{
+VFIOUserMsg *msg;
+
+msg = QTAILQ_FIRST(&proxy->free);
+if (msg != NULL) {
+QTAILQ_REMOVE(&proxy->free, msg, next);
+} else {
+msg = g_malloc0(sizeof(*msg));
+qemu_cond_init(&msg->cv);
+}
+
+msg->hdr = hdr;
+msg->fds = fds;
+return msg;
+}
+
+/*
+ * Recycle a message list entry to the free list.
+ */
+static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg)
+{
+if (msg->type == VFIO_MSG_NONE) {
+error_printf("vfio_user_recycle - freeing free msg\n");
+return;
+}
+
+/* free msg buffer if no one is waiting to consume the reply */
+if (msg->type == VFIO_MSG_NOWAIT || msg->type == VFIO_MSG_ASYNC) {
+g_free(msg->hdr);
+if (msg->fds != NULL) {
+g_free(msg->fds);
+}
+}
+
+msg->type = VFIO_MSG_NONE;
+msg->hdr = NULL;
+msg->fds = NULL;
+msg->complete = false;
+QTAILQ_INSERT_HEAD(&proxy->free, msg, next);
+}
+
+static VFIOUserFDs *vfio_user_getfds(int numfds)
+{
+VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int)));
+
+fds->fds = (int *)((char *)fds + sizeof(*fds));
+
+return fds;
+}
+
 /*
  * Functions only called by iothread
  */
 
+/*
+ * Process a received message.
+ */
+static void vfio_user_process(VFIOUserProxy *proxy, VFIOUserMsg *msg,
+  bool isreply)
+{
+
+/*
+ * Replies signal a waiter, if none just check for errors
+ * and free the message buffer.
+ *
+ * Requests get queued for the BH.
+ */
+if (isreply) {
+msg->complete = true;
+if (msg->type == VFIO_MSG_WAIT) {
+qemu_cond_signal(&msg->cv);
+} else {
+if (msg->hdr->flags & VFIO_USER_ERROR) {
+error_printf("vfio_user_process: error reply on async ");
+error_printf("request command %x error %s\n",
+ msg->hdr->command,
+ strerror(msg->hdr->error_reply));
+}
+/* youngest nowait msg has been ack'd */
+if (proxy->last_nowait == msg) {
+proxy->last_nowait = NULL;
+}
+vfio_user_recycle(proxy, msg);
+}
+} else {
+QTAILQ_INSERT_TAIL(&proxy->incoming, msg, next);
+qemu_bh_schedule(proxy->req_bh);
+}
+}
+
+/*
+ * Complete a partial message read
+ */
+static int vfio_user_complete(VFIOUserProxy *proxy, Error **errp)
+{
+VFIOUserMsg *msg = proxy->part_recv;
+size_t msgleft = proxy->recv_left;
+bool isreply;
+char *data;
+int ret;
+
+data = (char *)msg->hdr + (msg->hdr->size - msgleft);
+while 

[PATCH v8 09/28] vfio: split out VFIOKernelPCIDevice

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Split out code specific to the kernel-side vfio implementation from the
VFIOPCIDevice class into a VFIOKernelPCIDevice. The forthcoming
VFIOUserPCIDevice will share the base VFIOPCIDevice class.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio/helpers.c |   2 +-
 hw/vfio/pci.c | 107 --
 hw/vfio/pci.h |  16 ++-
 3 files changed, 80 insertions(+), 45 deletions(-)

diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 3c923d23b9..94bbc5747c 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -744,7 +744,7 @@ bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error 
**errp)
 VFIODevice *vfio_get_vfio_device(Object *obj)
 {
 if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
-return &VFIO_PCI(obj)->vbasedev;
+return &VFIO_PCI(obj)->device.vbasedev;
 } else {
 return NULL;
 }
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a4f99fc5e0..812743e9dd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -239,7 +239,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, 
PCIINTxRoute *route)
 
 static void vfio_intx_routing_notifier(PCIDevice *pdev)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 PCIINTxRoute route;
 
 if (vdev->interrupt != VFIO_INT_INTx) {
@@ -514,7 +514,7 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, 
MSIMessage msg,
 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
MSIMessage *msg, IOHandler *handler)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 VFIOMSIVector *vector;
 int ret;
 bool resizing = !!(vdev->nr_vectors < nr + 1);
@@ -619,7 +619,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
 
 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 
 trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
@@ -1167,7 +1167,7 @@ static const MemoryRegionOps vfio_vga_ops = {
  */
 static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 VFIORegion *region = &vdev->bars[bar].region;
 MemoryRegion *mmap_mr, *region_mr, *base_mr;
 PCIIORegion *r;
@@ -1213,7 +1213,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice 
*pdev, int bar)
  */
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
 
 memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
@@ -1246,7 +1246,7 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t 
addr, int len)
 void vfio_pci_write_config(PCIDevice *pdev,
uint32_t addr, uint32_t val, int len)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 uint32_t val_le = cpu_to_le32(val);
 
 trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
@@ -3084,7 +3084,7 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, 
Error **errp)
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
 ERRP_GUARD();
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 VFIODevice *vbasedev = &vdev->vbasedev;
 int i, ret;
 char uuid[UUID_STR_LEN];
@@ -3274,7 +3274,7 @@ error:
 
 static void vfio_instance_finalize(Object *obj)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(obj);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
 
 vfio_display_finalize(vdev);
 vfio_bars_finalize(vdev);
@@ -3292,7 +3292,7 @@ static void vfio_instance_finalize(Object *obj)
 
 static void vfio_exitfn(PCIDevice *pdev)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
 VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
@@ -3316,7 +3316,7 @@ static void vfio_exitfn(PCIDevice *pdev)
 
 static void vfio_pci_reset(DeviceState *dev)
 {
-VFIOPCIDevice *vdev = VFIO_PCI(dev);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
 
 trace_vfio_pci_reset(vdev->vbasedev.name);
 
@@ -3356,7 +3356,7 @@ post_reset:
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
-VFIOPCIDevice *vdev = VFIO_PCI(obj);
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
 VFIODevice *vbasedev = &vdev->vbasedev;
 
 device_add_bootindex_property(obj, &vdev->bootindex,
@@ -3377,28 +3377,15 @@ static void vfio_instance_init(Object *obj)
 pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
 }
 
-static const Property vfio_pci_dev_pr

[PATCH v8 08/28] vfio: add region cache

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Instead of requesting region information on demand with
VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become
necessary for performance for vfio-user, where this call becomes a
message over the control socket, so is of higher overhead than the
traditional path.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio/ccw.c |  5 -
 hw/vfio/common.c  | 12 
 hw/vfio/container.c   | 10 ++
 hw/vfio/helpers.c | 21 -
 hw/vfio/igd.c |  8 
 hw/vfio/pci.c |  8 
 include/hw/vfio/vfio-common.h |  1 +
 7 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 67bc137f9b..22378d50bc 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -510,7 +510,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error 
**errp)
 
 vcdev->io_region_offset = info->offset;
 vcdev->io_region = g_malloc0(info->size);
-g_free(info);
 
 /* check for the optional async command region */
 ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW,
@@ -523,7 +522,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error 
**errp)
 }
 vcdev->async_cmd_region_offset = info->offset;
 vcdev->async_cmd_region = g_malloc0(info->size);
-g_free(info);
 }
 
 ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW,
@@ -536,7 +534,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error 
**errp)
 }
 vcdev->schib_region_offset = info->offset;
 vcdev->schib_region = g_malloc(info->size);
-g_free(info);
 }
 
 ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW,
@@ -550,7 +547,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error 
**errp)
 }
 vcdev->crw_region_offset = info->offset;
 vcdev->crw_region = g_malloc(info->size);
-g_free(info);
 }
 
 return true;
@@ -560,7 +556,6 @@ out_err:
 g_free(vcdev->schib_region);
 g_free(vcdev->async_cmd_region);
 g_free(vcdev->io_region);
-g_free(info);
 return false;
 }
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4434e0a0a2..1866b3d3c5 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1569,6 +1569,16 @@ retry:
 return info;
 }
 
+static void vfio_get_all_regions(VFIODevice *vbasedev)
+{
+struct vfio_region_info *info;
+int i;
+
+for (i = 0; i < vbasedev->num_regions; i++) {
+vfio_get_region_info(vbasedev, i, &info);
+}
+}
+
 void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
  VFIOGroup *group, struct vfio_device_info *info)
 {
@@ -1586,6 +1596,8 @@ void vfio_prepare_device(VFIODevice *vbasedev, 
VFIOContainerBase *bcontainer,
 }
 
 QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+
+vfio_get_all_regions(vbasedev);
 }
 
 bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 37a3befbc5..36cd245c92 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -886,6 +886,16 @@ static bool vfio_get_device(VFIOGroup *group, const char 
*name,
 
 static void vfio_put_base_device(VFIODevice *vbasedev)
 {
+if (vbasedev->regions != NULL) {
+int i;
+
+for (i = 0; i < vbasedev->num_regions; i++) {
+g_free(vbasedev->regions[i]);
+}
+g_free(vbasedev->regions);
+vbasedev->regions = NULL;
+}
+
 if (!vbasedev->group) {
 return;
 }
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 4b255d4f3a..3c923d23b9 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -345,7 +345,7 @@ static int vfio_setup_region_sparse_mmaps(VFIORegion 
*region,
 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
   int index, const char *name)
 {
-g_autofree struct vfio_region_info *info = NULL;
+struct vfio_region_info *info = NULL;
 int ret;
 
 ret = vfio_get_region_info(vbasedev, index, &info);
@@ -562,6 +562,17 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
 {
 size_t argsz = sizeof(struct vfio_region_info);
 
+/* create region cache */
+if (vbasedev->regions == NULL) {
+vbasedev->regions = g_new0(struct vfio_region_info *,
+   vbasedev->num_regions);
+}
+/* check cache */
+if (vbasedev->regions[index] != NULL) {
+*info = vbasedev->regions[index];
+return 0;
+}
+
 *info = g_malloc0(argsz);
 
 (*info)->index = index;
@@ -581,6 +592,9 @@ retry:
 goto retry;
 }
 
+/* fill cache */
+vbasedev->regions[index] = *info;
+
 return 0;
 }
 
@@ -599,7 +613,6 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, ui

[PATCH v8 26/28] vfio-user: add 'no-direct-dma' option

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Normally, the vfio-user client will share a region's file descriptor
with the server to allow it directly mmap() the region memory. Add an
option to disable this, so the server must use
VFIO_USER_REGION_READ/WRITE instead.

FIXME: doesn't actually stop sending the fd??

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.h| 1 +
 hw/vfio-user/container.c | 2 +-
 hw/vfio-user/pci.c   | 5 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/vfio-user/common.h b/hw/vfio-user/common.h
index f8c61f2128..72138220ba 100644
--- a/hw/vfio-user/common.h
+++ b/hw/vfio-user/common.h
@@ -84,6 +84,7 @@ typedef struct VFIOUserProxy {
 
 /* VFIOProxy flags */
 #define VFIO_PROXY_CLIENT0x1
+#define VFIO_PROXY_NO_MMAP   0x2
 #define VFIO_PROXY_FORCE_QUEUED  0x4
 #define VFIO_PROXY_NO_POST   0x8
 
diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c
index 3974bc8a8c..3880316238 100644
--- a/hw/vfio-user/container.c
+++ b/hw/vfio-user/container.c
@@ -102,7 +102,7 @@ static int vfio_user_dma_map(const VFIOContainerBase 
*bcontainer, hwaddr iova,
  * vaddr enters as a QEMU process address; make it either a file offset
  * for mapped areas or leave as 0.
  */
-if (fd != -1) {
+if (fd != -1 && !(container->proxy->flags & VFIO_PROXY_NO_MMAP)) {
 msgp->offset = qemu_ram_block_host_offset(mrp->ram_block, vaddr);
 }
 
diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index e65c7eaf02..8a05e69a46 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -36,6 +36,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)
 struct VFIOUserPCIDevice {
 VFIOPCIDevice device;
 char *sock_name;
+bool no_direct_dma; /* disable shared mem for DMA */
 bool send_queued;   /* all sends are queued */
 bool no_post;   /* all regions write are sync */
 };
@@ -264,6 +265,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error 
**errp)
 vbasedev->proxy = proxy;
 vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev);
 
+if (udev->no_direct_dma) {
+proxy->flags |= VFIO_PROXY_NO_MMAP;
+}
 if (udev->send_queued) {
 proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
 }
@@ -402,6 +406,7 @@ static void vfio_user_pci_reset(DeviceState *dev)
 
 static const Property vfio_user_pci_dev_properties[] = {
 DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name),
+DEFINE_PROP_BOOL("no-direct-dma", VFIOUserPCIDevice, no_direct_dma, false),
 DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
 DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
 };
-- 
2.34.1




[PATCH v8 13/28] vfio-user: connect vfio proxy to remote server

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Introduce the vfio-user "proxy": this is the client code responsible for
sending and receiving vfio-user messages across the control socket.

The new files hw/vfio-user/common.[ch] contain some basic plumbing for
managing the proxy; initialize the proxy during realization of the
VFIOUserPCIDevice instance.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 171 ++
 hw/vfio-user/common.h |  78 
 hw/vfio-user/meson.build  |   1 +
 hw/vfio-user/pci.c|  18 
 include/hw/vfio/vfio-common.h |   2 +
 5 files changed, 270 insertions(+)
 create mode 100644 hw/vfio-user/common.c
 create mode 100644 hw/vfio-user/common.h

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
new file mode 100644
index 00..e829abccec
--- /dev/null
+++ b/hw/vfio-user/common.c
@@ -0,0 +1,171 @@
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include 
+#include 
+
+#include "hw/hw.h"
+#include "hw/vfio/vfio-common.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "io/channel-util.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/lockable.h"
+#include "qemu/sockets.h"
+#include "system/iothread.h"
+
+#include "common.h"
+
+static IOThread *vfio_user_iothread;
+
+static void vfio_user_shutdown(VFIOUserProxy *proxy);
+
+
+/*
+ * Functions called by main, CPU, or iothread threads
+ */
+
+static void vfio_user_shutdown(VFIOUserProxy *proxy)
+{
+qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
+qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, NULL,
+   proxy->ctx, NULL, NULL);
+}
+
+/*
+ * Functions only called by iothread
+ */
+
+static void vfio_user_cb(void *opaque)
+{
+VFIOUserProxy *proxy = opaque;
+
+QEMU_LOCK_GUARD(&proxy->lock);
+
+proxy->state = VFIO_PROXY_CLOSED;
+qemu_cond_signal(&proxy->close_cv);
+}
+
+
+/*
+ * Functions called by main or CPU threads
+ */
+
+static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets =
+QLIST_HEAD_INITIALIZER(vfio_user_sockets);
+
+VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp)
+{
+VFIOUserProxy *proxy;
+QIOChannelSocket *sioc;
+QIOChannel *ioc;
+char *sockname;
+
+if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) {
+error_setg(errp, "vfio_user_connect - bad address family");
+return NULL;
+}
+sockname = addr->u.q_unix.path;
+
+sioc = qio_channel_socket_new();
+ioc = QIO_CHANNEL(sioc);
+if (qio_channel_socket_connect_sync(sioc, addr, errp)) {
+object_unref(OBJECT(ioc));
+return NULL;
+}
+qio_channel_set_blocking(ioc, false, NULL);
+
+proxy = g_malloc0(sizeof(VFIOUserProxy));
+proxy->sockname = g_strdup_printf("unix:%s", sockname);
+proxy->ioc = ioc;
+proxy->flags = VFIO_PROXY_CLIENT;
+proxy->state = VFIO_PROXY_CONNECTED;
+
+qemu_mutex_init(&proxy->lock);
+qemu_cond_init(&proxy->close_cv);
+
+if (vfio_user_iothread == NULL) {
+vfio_user_iothread = iothread_create("VFIO user", errp);
+}
+
+proxy->ctx = iothread_get_aio_context(vfio_user_iothread);
+
+QTAILQ_INIT(&proxy->outgoing);
+QTAILQ_INIT(&proxy->incoming);
+QTAILQ_INIT(&proxy->free);
+QTAILQ_INIT(&proxy->pending);
+QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next);
+
+return proxy;
+}
+
+void vfio_user_disconnect(VFIOUserProxy *proxy)
+{
+VFIOUserMsg *r1, *r2;
+
+qemu_mutex_lock(&proxy->lock);
+
+/* our side is quitting */
+if (proxy->state == VFIO_PROXY_CONNECTED) {
+vfio_user_shutdown(proxy);
+if (!QTAILQ_EMPTY(&proxy->pending)) {
+error_printf("vfio_user_disconnect: outstanding requests\n");
+}
+}
+object_unref(OBJECT(proxy->ioc));
+proxy->ioc = NULL;
+
+proxy->state = VFIO_PROXY_CLOSING;
+QTAILQ_FOREACH_SAFE(r1, &proxy->outgoing, next, r2) {
+qemu_cond_destroy(&r1->cv);
+QTAILQ_REMOVE(&proxy->outgoing, r1, next);
+g_free(r1);
+}
+QTAILQ_FOREACH_SAFE(r1, &proxy->incoming, next, r2) {
+qemu_cond_destroy(&r1->cv);
+QTAILQ_REMOVE(&proxy->incoming, r1, next);
+g_free(r1);
+}
+QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) {
+qemu_cond_destroy(&r1->cv);
+QTAILQ_REMOVE(&proxy->pending, r1, next);
+g_free(r1);
+}
+QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) {
+qemu_cond_destroy(&r1->cv);
+QTAILQ_REMOVE(&proxy->free, r1, next);
+g_free(r1);
+}
+
+/*
+ * Make sure the iothread isn't blocking anywhere
+ * wi

[PATCH v8 23/28] vfio-user: implement VFIO_USER_DEVICE_RESET

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Hook this call up to the legacy reset handler for vfio-user-pci.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 12 
 hw/vfio-user/common.h |  1 +
 hw/vfio-user/pci.c| 15 +++
 3 files changed, 28 insertions(+)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 182ef5ab8f..160a1f0536 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -1362,6 +1362,18 @@ static int vfio_user_region_write(VFIOUserProxy *proxy, 
uint8_t index,
 return ret;
 }
 
+void vfio_user_reset(VFIOUserProxy *proxy)
+{
+VFIOUserHdr msg;
+
+vfio_user_request_msg(&msg, VFIO_USER_DEVICE_RESET, sizeof(msg), 0);
+
+vfio_user_send_wait(proxy, &msg, NULL, 0);
+if (msg.flags & VFIO_USER_ERROR) {
+error_printf("reset reply error %d\n", msg.error_reply);
+}
+}
+
 
 /*
  * Socket-based io_ops
diff --git a/hw/vfio-user/common.h b/hw/vfio-user/common.h
index 31b11ed614..c09637dd48 100644
--- a/hw/vfio-user/common.h
+++ b/hw/vfio-user/common.h
@@ -95,6 +95,7 @@ void vfio_user_set_handler(VFIODevice *vbasedev,
void *reqarg);
 bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp);
 int vfio_user_get_info(VFIOUserProxy *proxy, struct vfio_device_info *info);
+void vfio_user_reset(VFIOUserProxy *proxy);
 
 extern VFIODeviceIO vfio_dev_io_sock;
 
diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index cf1e642399..d520b7592b 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -276,6 +276,20 @@ static void vfio_user_instance_finalize(Object *obj)
 }
 }
 
+static void vfio_user_pci_reset(DeviceState *dev)
+{
+VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
+VFIODevice *vbasedev = &vdev->vbasedev;
+
+vfio_pci_pre_reset(vdev);
+
+if (vbasedev->reset_works) {
+vfio_user_reset(vbasedev->proxy);
+}
+
+vfio_pci_post_reset(vdev);
+}
+
 static const Property vfio_user_pci_dev_properties[] = {
 DEFINE_PROP_STRING("socket", VFIOUserPCIDevice, sock_name),
 DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
@@ -287,6 +301,7 @@ static void vfio_user_pci_dev_class_init(ObjectClass 
*klass, void *data)
 DeviceClass *dc = DEVICE_CLASS(klass);
 PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
 
+device_class_set_legacy_reset(dc, vfio_user_pci_reset);
 device_class_set_props(dc, vfio_user_pci_dev_properties);
 dc->desc = "VFIO over socket PCI device assignment";
 pdc->realize = vfio_user_pci_realize;
-- 
2.34.1




[PATCH v8 02/28] vfio/container: pass listener_begin/commit callbacks

2025-02-19 Thread John Levon
From: John Levon 

The vfio-user container will later need to hook into these callbacks;
set up vfio to use them, and optionally pass them through to the
container.

Signed-off-by: John Levon 
---
 hw/vfio/common.c  | 28 +++
 include/hw/vfio/vfio-container-base.h |  2 ++
 2 files changed, 30 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8d3d425c63..6f106167fd 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -557,6 +557,32 @@ static bool vfio_get_section_iova_range(VFIOContainerBase 
*bcontainer,
 return true;
 }
 
+static void vfio_listener_begin(MemoryListener *listener)
+{
+VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
+void (*listener_begin)(VFIOContainerBase *bcontainer);
+
+listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin;
+
+if (listener_begin) {
+listener_begin(bcontainer);
+}
+}
+
+static void vfio_listener_commit(MemoryListener *listener)
+{
+VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
+void (*listener_commit)(VFIOContainerBase *bcontainer);
+
+listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit;
+
+if (listener_commit) {
+listener_commit(bcontainer);
+}
+}
+
 static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp)
 {
 /*
@@ -1396,6 +1422,8 @@ static void vfio_listener_log_sync(MemoryListener 
*listener,
 
 const MemoryListener vfio_memory_listener = {
 .name = "vfio",
+.begin = vfio_listener_begin,
+.commit = vfio_listener_commit,
 .region_add = vfio_listener_region_add,
 .region_del = vfio_listener_region_del,
 .log_global_start = vfio_listener_log_global_start,
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index c9d339383e..0a863df0dc 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -111,6 +111,8 @@ struct VFIOIOMMUClass {
 
 /* basic feature */
 bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
+void (*listener_begin)(VFIOContainerBase *bcontainer);
+void (*listener_commit)(VFIOContainerBase *bcontainer);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly, MemoryRegion *mrp);
-- 
2.34.1




[PATCH v8 00/28] vfio-user client

2025-02-19 Thread John Levon
This is the 8th revision of the vfio-user client implementation. The vfio-user
protocol allows for implementing (PCI) devices in another userspace process;
SPDK is one example, which includes a virtual NVMe implementation.

The vfio-user framework consists of 3 parts:
 1) The VFIO user protocol specification.
 2) A client - the VFIO device in QEMU that encapsulates VFIO messages
and sends them to the server.
 3) A server - a remote process that emulates a device.

This patchset implements parts 1 and 2.

It has been tested against libvfio-user test servers as well as SPDK.

Thanks for previous reviews & comments.

Changes since v7:

 - split up pci patches for easier reviewing
 - fixed lots of device ops error handling
 - vfio-user code now in hw/vfio-user
 - improved commit messages
 - various other small cleanups

Jagannathan Raman (20):
  vfio/container: pass MemoryRegion to DMA operations
  vfio: add region cache
  vfio: split out VFIOKernelPCIDevice
  vfio: add device IO ops vector
  vfio-user: add vfio-user class and container
  vfio-user: connect vfio proxy to remote server
  vfio-user: implement message receive infrastructure
  vfio-user: implement message send infrastructure
  vfio-user: implement VFIO_USER_DEVICE_GET_INFO
  vfio-user: implement VFIO_USER_DEVICE_GET_REGION_INFO
  vfio-user: implement VFIO_USER_REGION_READ/WRITE
  vfio-user: set up PCI in vfio_user_pci_realize()
  vfio-user: implement VFIO_USER_DEVICE_GET/SET_IRQ*
  vfio-user: forward MSI-X PBA BAR accesses to server
  vfio-user: set up container access to the proxy
  vfio-user: implement VFIO_USER_DEVICE_RESET
  vfio-user: implement VFIO_USER_DMA_READ/WRITE
  vfio-user: add 'no-direct-dma' option
  vfio-user: add 'x-msg-timeout' option
  vfio-user: add coalesced posted writes

John Levon (7):
  vfio/container: pass listener_begin/commit callbacks
  vfio/container: support VFIO_DMA_UNMAP_FLAG_ALL
  vfio: add vfio_attach_device_by_iommu_type()
  vfio: add vfio_prepare_device()
  vfio: refactor out vfio_interrupt_setup()
  vfio: refactor out vfio_pci_config_setup()
  vfio-user: implement VFIO_USER_DMA_MAP/UNMAP

Thanos Makatos (1):
  vfio-user: introduce vfio-user protocol specification

 MAINTAINERS   |   10 +-
 docs/devel/index-internals.rst|1 +
 docs/devel/vfio-user.rst  | 1522 ++
 hw/meson.build|1 +
 hw/vfio-user/common.c | 1702 +
 hw/vfio-user/common.h |  123 ++
 hw/vfio-user/container.c  |  358 ++
 hw/vfio-user/container.h  |   24 +
 hw/vfio-user/meson.build  |   10 +
 hw/vfio-user/pci.c|  443 +++
 hw/vfio-user/protocol.h   |  243 
 hw/vfio-user/trace-events |   18 +
 hw/vfio-user/trace.h  |1 +
 hw/vfio/ap.c  |4 +-
 hw/vfio/ccw.c |9 +-
 hw/vfio/common.c  |  137 +-
 hw/vfio/container-base.c  |8 +-
 hw/vfio/container.c   |   78 +-
 hw/vfio/helpers.c |  185 ++-
 hw/vfio/igd.c |8 +-
 hw/vfio/iommufd.c |   31 +-
 hw/vfio/pci.c |  591 +
 hw/vfio/pci.h |   34 +-
 hw/vfio/platform.c|4 +-
 hw/virtio/vhost-vdpa.c|2 +-
 include/exec/memory.h |4 +-
 include/hw/vfio/vfio-common.h |   45 +-
 include/hw/vfio/vfio-container-base.h |   11 +-
 meson.build   |1 +
 meson_options.txt |2 +
 scripts/meson-buildoptions.sh |4 +
 system/memory.c   |7 +-
 32 files changed, 5281 insertions(+), 340 deletions(-)
 create mode 100644 docs/devel/vfio-user.rst
 create mode 100644 hw/vfio-user/common.c
 create mode 100644 hw/vfio-user/common.h
 create mode 100644 hw/vfio-user/container.c
 create mode 100644 hw/vfio-user/container.h
 create mode 100644 hw/vfio-user/meson.build
 create mode 100644 hw/vfio-user/pci.c
 create mode 100644 hw/vfio-user/protocol.h
 create mode 100644 hw/vfio-user/trace-events
 create mode 100644 hw/vfio-user/trace.h

-- 
2.34.1




[PATCH v8 05/28] vfio: add vfio_prepare_device()

2025-02-19 Thread John Levon
Commonize some initialization code shared by the legacy and iommufd vfio
implementations (and later by vfio-user).

Signed-off-by: John Levon 
---
 hw/vfio/common.c  | 19 +++
 hw/vfio/container.c   | 14 +-
 hw/vfio/iommufd.c |  9 +
 include/hw/vfio/vfio-common.h |  2 ++
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index eefd735bc6..4434e0a0a2 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1569,6 +1569,25 @@ retry:
 return info;
 }
 
+void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
+ VFIOGroup *group, struct vfio_device_info *info)
+{
+vbasedev->group = group;
+
+vbasedev->num_irqs = info->num_irqs;
+vbasedev->num_regions = info->num_regions;
+vbasedev->flags = info->flags;
+vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
+
+vbasedev->bcontainer = bcontainer;
+QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
+if (group) {
+QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
+}
+
+QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+}
+
 bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name,
   VFIODevice *vbasedev, AddressSpace *as,
   Error **errp)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 82987063e5..37a3befbc5 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -876,17 +876,11 @@ static bool vfio_get_device(VFIOGroup *group, const char 
*name,
 }
 
 vbasedev->fd = fd;
-vbasedev->group = group;
-QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
 
-vbasedev->num_irqs = info->num_irqs;
-vbasedev->num_regions = info->num_regions;
-vbasedev->flags = info->flags;
+vfio_prepare_device(vbasedev, &group->container->bcontainer, group, info);
 
 trace_vfio_get_device(name, info->flags, info->num_regions, 
info->num_irqs);
 
-vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
-
 return true;
 }
 
@@ -939,7 +933,6 @@ static bool vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 int groupid = vfio_device_groupid(vbasedev, errp);
 VFIODevice *vbasedev_iter;
 VFIOGroup *group;
-VFIOContainerBase *bcontainer;
 
 if (groupid < 0) {
 return false;
@@ -968,11 +961,6 @@ static bool vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 return false;
 }
 
-bcontainer = &group->container->bcontainer;
-vbasedev->bcontainer = bcontainer;
-QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
-QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
-
 return true;
 }
 
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index e295f251c0..85c70eae37 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -604,14 +604,7 @@ found_container:
 iommufd_cdev_ram_block_discard_disable(false);
 }
 
-vbasedev->group = 0;
-vbasedev->num_irqs = dev_info.num_irqs;
-vbasedev->num_regions = dev_info.num_regions;
-vbasedev->flags = dev_info.flags;
-vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
-vbasedev->bcontainer = bcontainer;
-QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
-QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+vfio_prepare_device(vbasedev, bcontainer, NULL, &dev_info);
 
 trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
vbasedev->num_regions, vbasedev->flags);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c40f8de6bc..ae3ecbd9f6 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -250,6 +250,8 @@ void vfio_reset_handler(void *opaque);
 struct vfio_device_info *vfio_get_device_info(int fd);
 bool vfio_device_is_mdev(VFIODevice *vbasedev);
 bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp);
+void vfio_prepare_device(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
+ VFIOGroup *group, struct vfio_device_info *info);
 bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 AddressSpace *as, Error **errp);
 bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name,
-- 
2.34.1




Re: [PATCH] hw/timer/hpet: Detect invalid access to TN registers

2025-02-19 Thread Zhao Liu
On Wed, Feb 19, 2025 at 10:25:40AM +0100, Paolo Bonzini wrote:
> Date: Wed, 19 Feb 2025 10:25:40 +0100
> From: Paolo Bonzini 
> Subject: Re: [PATCH] hw/timer/hpet: Detect invalid access to TN registers
> 
> On 2/18/25 10:07, Philippe Mathieu-Daudé wrote:
> > On 18/2/25 09:53, Paolo Bonzini wrote:
> > > On 2/18/25 08:37, Zhao Liu wrote:
> > > > "addr & 0x18" ignores invalid address, so that the trace in default
> > > > branch (trace_hpet_ram_{read|write}_invalid()) doesn't work.
> > > > 
> > > > Mask addr by "0x1f & ~4", in which 0x1f means to get the complete TN
> > > > registers access and ~4 means to keep any invalid address offset.
> > > 
> > > I think this is less readable.
> > > 
> > > The reason to use !4 in the Rust code is because the initial AND is done
> > > in a separate function, timer_and_addr().
> > 
> > Having a quick look at the model without looking at the specs:
> > 
> > include/hw/timer/hpet.h:20:#define HPET_LEN    0x400
> > 
> > hw/timer/hpet.c:439:static uint64_t hpet_ram_read(...,
> > hw/timer/hpet.c-441-{
> > hw/timer/hpet.c-448-    /*address range of all TN regs*/
> > hw/timer/hpet.c-449-    if (addr >= 0x100 && addr <= 0x3ff) {
> > hw/timer/hpet.c-450-    uint8_t timer_id = (addr - 0x100) / 0x20;
> >      ...
> > hw/timer/hpet.c-469-    } else {
> > hw/timer/hpet.c-470-    switch (addr & ~4) {
> >   ...
> > hw/timer/hpet.c-488-    }
> > hw/timer/hpet.c-489-    }
> > hw/timer/hpet.c-490-    return 0;
> > hw/timer/hpet.c-491-}
> > 
> > hw/timer/hpet.c:699:    memory_region_init_io(&s->iomem, obj,
> >    &hpet_ram_ops, s,
> >    "hpet", HPET_LEN);
> > 
> > I suppose we want to register multiple timers of I/O size 0x20 at 0x100,
> > and the I/O size of 0x20 at 0x000 is a generic control region.
> > 
> > Maybe split hpet_ram_ops in 2 (hpet_cfg_ops and hpet_tmr_ops), mapping
> > the first one once at 0x000 and the other 24 times at 0x100-0x3ff?
> 
> You would have to come up with a way to get the index though.  It seems to
> be adding churn for no particular reason.
> 
> I'd rather look into how to make decoding code *easy* without making
> everything MemoryRegions.

I also came across a register space implementation [1] which stores registers
with ranges (I guess for QEMU, it could map each register to a memory
region) and register-specific callbacks.

I didn't choose this way since it's too complex to quickly develop...

[1]: 
https://github.com/google/crosvm/blob/main/devices/src/register_space/mod.rs

> As I explained yesterday, while I'm not yet sure
> that Rust is going to stay in QEMU, I'd like to have as many examples as
> possible to help tilting the balance one way or the other. And indeed in the
> Rust version of HPET, timer_and_addr() could be extended to something like
> this:
> 
> // Start with the same "enum for registers" pattern that PL011 uses:
> #[derive(qemu_api_macros::TryInto)]
> #[repr(u64)]
> enum TimerRegister {
> CFG = 0,
> CMP = 8,
> ROUTE = 16,
> }
> 
> #[derive(qemu_api_macros::TryInto)]
> #[repr(u64)]
> enum GlobalRegister {
> CAP = 0,
> CFG = 0x10,
> INT_STATUS = 0x20,
> COUNTER = 0xF0,
> }
> 
> // Go one step further and define types for all possible outcomes:
> #[derive(Copy)]
> enum HPETRegister {
> Timer(&BqlRefCell<HPETTimer>, TimerRegister),
> Global(GlobalRegister),
> Unknown(hwaddr),
> }
> 
> struct HPETAddrDecode {
> u32 shift,
> u32 len,
> HPETRegister reg,
> }
> 
> fn decode(&self, addr: hwaddr, size: u32) -> HPETAddrDecode {
> let shift = ((addr & 4) * 8) as u32;
> let len = std::cmp::min(size * 8, 64 - shift);
> 
> addr &= !4;
> let reg = if (0x100..=0x3ff).contains(&addr) {
> let timer_id: usize = ((addr - 0x100) / 0x20) as usize;
> TimerRegister::try_from(addr)
> .map(|reg| HPETRegister::Timer(&self.timers[timer_id], reg))
> } else {
> GlobalRegister::try_from(addr)
> .map(HPETRegister::Global)
> }
> 
> // reg is now a Result<HPETRegister, hwaddr>
> // convert the Err case into HPETRegister as well
> let reg = reg.unwrap_or_else(HPETRegister::Unknown);
> HPETAddrDecode { shift, len, reg }
> }
> 
> (untested).  The read and write functions then can do something like
> 
> let val = match decoded.reg {
> Timer(timer, reg) => timer.borrow_mut().read(decoded),
> Global(GlobalRegister::CAP) => self.capability.get(),
> Global(GlobalRegister::CFG) => self.config.get(),
> ...
> }
> val >> decoded.shift
> 
> and for write:
> 
> match decoded.reg {
> Timer(timer, reg) => timer.borrow_mut().write(decoded, value),
> Global(GlobalRegister::CAP) => {}, // read-only
> Global(GlobalRegister::CFG) => self.set_cfg_reg(decoded, value),
> ...
> }
> 
> 
> The above could be a scheme that new devices could copy.  Overal

[PATCH 1/4] tests/functional: move aarch64 GPU test into own file

2025-02-19 Thread Alex Bennée
I want to expand the number of tests to cover a wide range of
configurations. That starts with splitting off from the normal virt
test from which it doesn't really share much code.

Signed-off-by: Alex Bennée 
---
 tests/functional/meson.build  |   2 +
 tests/functional/test_aarch64_virt.py |  71 ---
 tests/functional/test_aarch64_virt_gpu.py | 102 ++
 3 files changed, 104 insertions(+), 71 deletions(-)
 create mode 100755 tests/functional/test_aarch64_virt_gpu.py

diff --git a/tests/functional/meson.build b/tests/functional/meson.build
index b516d21cba..11b7ca1577 100644
--- a/tests/functional/meson.build
+++ b/tests/functional/meson.build
@@ -19,6 +19,7 @@ test_timeouts = {
   'aarch64_sbsaref_freebsd' : 720,
   'aarch64_tuxrun' : 240,
   'aarch64_virt' : 720,
+  'aarch64_virt_gpu' : 720,
   'acpi_bits' : 420,
   'arm_aspeed_palmetto' : 120,
   'arm_aspeed_romulus' : 120,
@@ -77,6 +78,7 @@ tests_aarch64_system_thorough = [
   'aarch64_tcg_plugins',
   'aarch64_tuxrun',
   'aarch64_virt',
+  'aarch64_virt_gpu',
   'aarch64_xen',
   'aarch64_xlnx_versal',
   'multiprocess',
diff --git a/tests/functional/test_aarch64_virt.py 
b/tests/functional/test_aarch64_virt.py
index 95f5ce8b4c..884aad7af6 100755
--- a/tests/functional/test_aarch64_virt.py
+++ b/tests/functional/test_aarch64_virt.py
@@ -134,77 +134,6 @@ def test_aarch64_virt_gicv2(self):
 self.common_aarch64_virt("virt,gic-version=2")
 
 
-ASSET_VIRT_GPU_KERNEL = Asset(
-'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
-'download?path=%2F&files='
-'Image',
-'89e5099d26166204cc5ca4bb6d1a11b92c217e1f82ec67e3ba363d09157462f6')
-
-ASSET_VIRT_GPU_ROOTFS = Asset(
-'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
-'download?path=%2F&files='
-'rootfs.ext4.zstd',
-'792da7573f5dc2913ddb7c638151d4a6b2d028a4cb2afb38add513c1924bdad4')
-
-@skipIfMissingCommands('zstd')
-def test_aarch64_virt_with_gpu(self):
-# This tests boots with a buildroot test image that contains
-# vkmark and other GPU exercising tools. We run a headless
-# weston that nevertheless still exercises the virtio-gpu
-# backend.
-
-self.set_machine('virt')
-self.require_accelerator("tcg")
-
-kernel_path = self.ASSET_VIRT_GPU_KERNEL.fetch()
-image_path = self.uncompress(self.ASSET_VIRT_GPU_ROOTFS, format="zstd")
-
-self.vm.set_console()
-kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
-   'console=ttyAMA0 root=/dev/vda')
-
-self.vm.add_args("-accel", "tcg")
-self.vm.add_args("-cpu", "neoverse-v1,pauth-impdef=on")
-self.vm.add_args("-machine", "virt,gic-version=max",
- '-kernel', kernel_path,
- '-append', kernel_command_line)
-self.vm.add_args("-smp", "2", "-m", "2048")
-self.vm.add_args("-device",
- "virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on")
-self.vm.add_args("-display", "egl-headless")
-self.vm.add_args("-display", "dbus,gl=on")
-self.vm.add_args("-device", "virtio-blk-device,drive=hd0")
-self.vm.add_args("-blockdev",
- "driver=raw,file.driver=file,"
- "node-name=hd0,read-only=on,"
- f"file.filename={image_path}")
-self.vm.add_args("-snapshot")
-
-try:
-self.vm.launch()
-except VMLaunchFailure as excp:
-if "old virglrenderer, blob resources unsupported" in excp.output:
-self.skipTest("No blob support for virtio-gpu")
-elif "old virglrenderer, venus unsupported" in excp.output:
-self.skipTest("No venus support for virtio-gpu")
-elif "egl: no drm render node available" in excp.output:
-self.skipTest("Can't access host DRM render node")
-elif "'type' does not accept value 'egl-headless'" in excp.output:
-self.skipTest("egl-headless support is not available")
-else:
-self.log.info(f"unhandled launch failure: {excp.output}")
-raise excp
-
-self.wait_for_console_pattern('buildroot login:')
-exec_command(self, 'root')
-exec_command(self, 'export XDG_RUNTIME_DIR=/tmp')
-exec_command_and_wait_for_pattern(self,
-  "weston -B headless "
-  "--renderer gl "
-  "--shell kiosk "
-  "-- vkmark -b:duration=1.0",
-  "vkmark Score")
-
 
 if __name__ == '__main__':
 QemuSystemTest.main()
diff --git a/tests/functional/test_aarch64_virt_gpu.py 
b/tests/functional/test_aarch64_virt_gpu.py
new file mode 100755
index 00..f21a

[PATCH 0/4] testing/next (aarch64 virt gpu tests)

2025-02-19 Thread Alex Bennée
Hi,

As I was looking at the native context patches I realised our existing
GPU testing is a little sparse. I took the opportunity to split the
test from the main virt test and then extend it to exercise the 3
current display modes (virgl, virgl+blobs, vulkan).

I've added some additional validation to ensure we have the devices we
expect before we start. It doesn't currently address the reported
clang issues but hopefully it will help narrow down what fails and
what works.

Once I've built some new buildroot images I'll re-spin with a whole
bunch of additional test binaries available.

Alex.

Alex Bennée (4):
  tests/functional: move aarch64 GPU test into own file
  tests/functional: factor out common code in gpu test
  tests/functional: ensure we have a GPU device for tests
  tests/functional: expand tests to cover virgl

 tests/functional/meson.build  |   2 +
 tests/functional/test_aarch64_virt.py |  71 -
 tests/functional/test_aarch64_virt_gpu.py | 123 ++
 3 files changed, 125 insertions(+), 71 deletions(-)
 create mode 100755 tests/functional/test_aarch64_virt_gpu.py

-- 
2.39.5




[PATCH 4/4] tests/functional: expand tests to cover virgl

2025-02-19 Thread Alex Bennée
Add two more test modes using glmark2-wayland to exercise the OpenGL
pass-through modes with virgl. Virgl can run with or without the
hostmem blob support.

We might want to eventually add more directed tests and individual
features later on but the glmark/vkmark tests are a good general
smoke test for accelerated 3D.

Signed-off-by: Alex Bennée 
---
 tests/functional/test_aarch64_virt_gpu.py | 20 
 1 file changed, 20 insertions(+)

diff --git a/tests/functional/test_aarch64_virt_gpu.py 
b/tests/functional/test_aarch64_virt_gpu.py
index c9463d7285..7a8471d1ca 100755
--- a/tests/functional/test_aarch64_virt_gpu.py
+++ b/tests/functional/test_aarch64_virt_gpu.py
@@ -89,6 +89,26 @@ def _run_virt_gpu_test(self, gpu_device,  weston_cmd, 
weston_pattern):
 full_cmd = f"weston -B headless --renderer gl --shell kiosk -- 
{weston_cmd}"
 exec_command_and_wait_for_pattern(self, full_cmd, weston_pattern)
 
+@skipIfMissingCommands('zstd')
+def test_aarch64_virt_with_virgl_gpu(self):
+
+self.require_device('virtio-gpu-gl-pci')
+
+gpu_device = "virtio-gpu-gl-pci"
+weston_cmd = "glmark2-wayland -b:duration=1.0"
+weston_pattern = "glmark2 Score"
+self._run_virt_gpu_test(gpu_device, weston_cmd, weston_pattern)
+
+@skipIfMissingCommands('zstd')
+def test_aarch64_virt_with_virgl_blobs_gpu(self):
+
+self.require_device('virtio-gpu-gl-pci')
+
+gpu_device = "virtio-gpu-gl-pci,hostmem=4G,blob=on"
+weston_cmd = "glmark2-wayland -b:duration=1.0"
+weston_pattern = "glmark2 Score"
+self._run_virt_gpu_test(gpu_device, weston_cmd, weston_pattern)
+
 @skipIfMissingCommands('zstd')
 def test_aarch64_virt_with_vulkan_gpu(self):
 
-- 
2.39.5




[PATCH v8 07/28] vfio: refactor out vfio_pci_config_setup()

2025-02-19 Thread John Levon
Refactor the PCI config setup code out of vfio_realize(), as we will
later need this for vfio-user too.

Signed-off-by: John Levon 
---
 hw/vfio/pci.c | 176 +++---
 1 file changed, 94 insertions(+), 82 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5fb6c4c4c6..83fe329474 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2957,6 +2957,99 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice 
*vdev)
 vdev->req_enabled = false;
 }
 
+static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIODevice *vbasedev = &vdev->vbasedev;
+
+/* vfio emulates a lot for us, but some bits need extra love */
+vdev->emulated_config_bits = g_malloc0(vdev->config_size);
+
+/* QEMU can choose to expose the ROM or not */
+memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
+/* QEMU can also add or extend BARs */
+memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
+
+/*
+ * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
+ * device ID is managed by the vendor and need only be a 16-bit value.
+ * Allow any 16-bit value for subsystem so they can be hidden or changed.
+ */
+if (vdev->vendor_id != PCI_ANY_ID) {
+if (vdev->vendor_id >= 0xffff) {
+error_setg(errp, "invalid PCI vendor ID provided");
+return false;
+}
+vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
+trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
+} else {
+vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
+}
+
+if (vdev->device_id != PCI_ANY_ID) {
+if (vdev->device_id > 0xffff) {
+error_setg(errp, "invalid PCI device ID provided");
+return false;
+}
+vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
+trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
+} else {
+vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
+}
+
+if (vdev->sub_vendor_id != PCI_ANY_ID) {
+if (vdev->sub_vendor_id > 0xffff) {
+error_setg(errp, "invalid PCI subsystem vendor ID provided");
+return false;
+}
+vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
+   vdev->sub_vendor_id, ~0);
+trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
+  vdev->sub_vendor_id);
+}
+
+if (vdev->sub_device_id != PCI_ANY_ID) {
+if (vdev->sub_device_id > 0xffff) {
+error_setg(errp, "invalid PCI subsystem device ID provided");
+return false;
+}
+vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, 
~0);
+trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
+  vdev->sub_device_id);
+}
+
+/* QEMU can change multi-function devices to single function, or reverse */
+vdev->emulated_config_bits[PCI_HEADER_TYPE] =
+  PCI_HEADER_TYPE_MULTI_FUNCTION;
+
+/* Restore or clear multifunction, this is always controlled by QEMU */
+if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
+} else {
+vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
+}
+
+/*
+ * Clear host resource mapping info.  If we choose not to register a
+ * BAR, such as might be the case with the option ROM, we can get
+ * confusing, unwritable, residual addresses from the host here.
+ */
+memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+vfio_pci_size_rom(vdev);
+
+vfio_bars_prepare(vdev);
+
+if (!vfio_msix_early_setup(vdev, errp)) {
+return false;
+}
+
+vfio_bars_register(vdev);
+
+return true;
+}
+
 static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -3060,91 +3153,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 goto error;
 }
 
-/* vfio emulates a lot for us, but some bits need extra love */
-vdev->emulated_config_bits = g_malloc0(vdev->config_size);
-
-/* QEMU can choose to expose the ROM or not */
-memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
-/* QEMU can also add or extend BARs */
-memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
-
-/*
- * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
- * device ID is managed by the vendor and need only be a 16-bit value.
- * Allow any 16-bit value for subsystem so they can be hidden or changed.
- */
-if (vdev->vendor_id != PCI_ANY_I

[PATCH 2/4] tests/functional: factor out common code in gpu test

2025-02-19 Thread Alex Bennée
In preparation for handling more tests split out the common machine
setup details from the test specific stuff.

Signed-off-by: Alex Bennée 
---
 tests/functional/test_aarch64_virt_gpu.py | 30 +++
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/tests/functional/test_aarch64_virt_gpu.py 
b/tests/functional/test_aarch64_virt_gpu.py
index f21ae18392..06093c6b60 100755
--- a/tests/functional/test_aarch64_virt_gpu.py
+++ b/tests/functional/test_aarch64_virt_gpu.py
@@ -39,12 +39,7 @@ def wait_for_console_pattern(self, success_message, vm=None):
 'rootfs.ext4.zstd',
 '792da7573f5dc2913ddb7c638151d4a6b2d028a4cb2afb38add513c1924bdad4')
 
-@skipIfMissingCommands('zstd')
-def test_aarch64_virt_with_vulkan_gpu(self):
-# This tests boots with a buildroot test image that contains
-# vkmark and other GPU exercising tools. We run a headless
-# weston that nevertheless still exercises the virtio-gpu
-# backend.
+def _run_virt_gpu_test(self, gpu_device,  weston_cmd, weston_pattern):
 
 self.set_machine('virt')
 self.require_accelerator("tcg")
@@ -62,10 +57,10 @@ def test_aarch64_virt_with_vulkan_gpu(self):
  '-kernel', kernel_path,
  '-append', kernel_command_line)
 self.vm.add_args("-smp", "2", "-m", "2048")
-self.vm.add_args("-device",
- "virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on")
-self.vm.add_args("-display", "egl-headless")
-self.vm.add_args("-display", "dbus,gl=on")
+self.vm.add_args("-device", gpu_device)
+for opt in ["egl-headless", "dbus,gl=on"]:
+self.vm.add_args("-display", opt)
+
 self.vm.add_args("-device", "virtio-blk-device,drive=hd0")
 self.vm.add_args("-blockdev",
  "driver=raw,file.driver=file,"
@@ -91,12 +86,15 @@ def test_aarch64_virt_with_vulkan_gpu(self):
 self.wait_for_console_pattern('buildroot login:')
 exec_command(self, 'root')
 exec_command(self, 'export XDG_RUNTIME_DIR=/tmp')
-exec_command_and_wait_for_pattern(self,
-  "weston -B headless "
-  "--renderer gl "
-  "--shell kiosk "
-  "-- vkmark -b:duration=1.0",
-  "vkmark Score")
+full_cmd = f"weston -B headless --renderer gl --shell kiosk -- 
{weston_cmd}"
+exec_command_and_wait_for_pattern(self, full_cmd, weston_pattern)
+
+@skipIfMissingCommands('zstd')
+def test_aarch64_virt_with_vulkan_gpu(self):
+gpu_device = "virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on"
+weston_cmd = "vkmark -b:duration=1.0"
+weston_pattern = "vkmark Score"
+self._run_virt_gpu_test(gpu_device, weston_cmd, weston_pattern)
 
 if __name__ == '__main__':
 QemuSystemTest.main()
-- 
2.39.5




[PATCH 3/4] tests/functional: ensure we have a GPU device for tests

2025-02-19 Thread Alex Bennée
It's possible to build QEMU without support for the GL enabled GPU
devices and we can catch that earlier with an explicit check.

Signed-off-by: Alex Bennée 
---
 tests/functional/test_aarch64_virt_gpu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/functional/test_aarch64_virt_gpu.py 
b/tests/functional/test_aarch64_virt_gpu.py
index 06093c6b60..c9463d7285 100755
--- a/tests/functional/test_aarch64_virt_gpu.py
+++ b/tests/functional/test_aarch64_virt_gpu.py
@@ -91,6 +91,9 @@ def _run_virt_gpu_test(self, gpu_device,  weston_cmd, 
weston_pattern):
 
 @skipIfMissingCommands('zstd')
 def test_aarch64_virt_with_vulkan_gpu(self):
+
+self.require_device('virtio-gpu-gl-pci')
+
 gpu_device = "virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on"
 weston_cmd = "vkmark -b:duration=1.0"
 weston_pattern = "vkmark Score"
-- 
2.39.5




[PATCH v8 25/28] vfio-user: implement VFIO_USER_DMA_READ/WRITE

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Unlike most other messages, this is a server->client message, for when a
server wants to do "DMA"; this is slow, so normally the server has
memory directly mapped instead.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c   |  57 +
 hw/vfio-user/common.h   |   3 ++
 hw/vfio-user/pci.c  | 110 
 hw/vfio-user/protocol.h |  13 -
 4 files changed, 182 insertions(+), 1 deletion(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index b78b9e57e8..38f8eef317 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -377,6 +377,10 @@ static int vfio_user_recv_one(VFIOUserProxy *proxy)
 *msg->hdr = hdr;
 data = (char *)msg->hdr + sizeof(hdr);
 } else {
+if (hdr.size > proxy->max_xfer_size + sizeof(VFIOUserDMARW)) {
+error_setg(&local_err, "vfio_user_recv request larger than max");
+goto err;
+}
 buf = g_malloc0(hdr.size);
 memcpy(buf, &hdr, sizeof(hdr));
 data = buf + sizeof(hdr);
@@ -762,6 +766,59 @@ void vfio_user_wait_reqs(VFIOUserProxy *proxy)
 qemu_mutex_unlock(&proxy->lock);
 }
 
+/*
+ * Reply to an incoming request.
+ */
+void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size)
+{
+
+if (size < sizeof(VFIOUserHdr)) {
+error_printf("vfio_user_send_reply - size too small\n");
+g_free(hdr);
+return;
+}
+
+/*
+ * convert header to associated reply
+ */
+hdr->flags = VFIO_USER_REPLY;
+hdr->size = size;
+
+vfio_user_send_async(proxy, hdr, NULL);
+}
+
+/*
+ * Send an error reply to an incoming request.
+ */
+void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error)
+{
+
+/*
+ * convert header to associated reply
+ */
+hdr->flags = VFIO_USER_REPLY;
+hdr->flags |= VFIO_USER_ERROR;
+hdr->error_reply = error;
+hdr->size = sizeof(*hdr);
+
+vfio_user_send_async(proxy, hdr, NULL);
+}
+
+/*
+ * Close FDs erroneously received in an incoming request.
+ */
+void vfio_user_putfds(VFIOUserMsg *msg)
+{
+VFIOUserFDs *fds = msg->fds;
+int i;
+
+for (i = 0; i < fds->recv_fds; i++) {
+close(fds->fds[i]);
+}
+g_free(fds);
+msg->fds = NULL;
+}
+
 static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets =
 QLIST_HEAD_INITIALIZER(vfio_user_sockets);
 
diff --git a/hw/vfio-user/common.h b/hw/vfio-user/common.h
index f7cc02d2e7..f8c61f2128 100644
--- a/hw/vfio-user/common.h
+++ b/hw/vfio-user/common.h
@@ -105,6 +105,9 @@ void vfio_user_send_nowait(VFIOUserProxy *proxy, 
VFIOUserHdr *hdr,
VFIOUserFDs *fds, int rsize);
 void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
  VFIOUserFDs *fds, int rsize);
+void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size);
+void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error);
+void vfio_user_putfds(VFIOUserMsg *msg);
 
 extern VFIODeviceIO vfio_dev_io_sock;
 
diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index d520b7592b..e65c7eaf02 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -96,6 +96,95 @@ static void vfio_user_msix_teardown(VFIOPCIDevice *vdev)
 vdev->msix->pba_region = NULL;
 }
 
+static void vfio_user_dma_read(VFIOPCIDevice *vdev, VFIOUserDMARW *msg)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOUserProxy *proxy = vdev->vbasedev.proxy;
+VFIOUserDMARW *res;
+MemTxResult r;
+size_t size;
+
+if (msg->hdr.size < sizeof(*msg)) {
+vfio_user_send_error(proxy, &msg->hdr, EINVAL);
+return;
+}
+if (msg->count > proxy->max_xfer_size) {
+vfio_user_send_error(proxy, &msg->hdr, E2BIG);
+return;
+}
+
+/* switch to our own message buffer */
+size = msg->count + sizeof(VFIOUserDMARW);
+res = g_malloc0(size);
+memcpy(res, msg, sizeof(*res));
+g_free(msg);
+
+r = pci_dma_read(pdev, res->offset, &res->data, res->count);
+
+switch (r) {
+case MEMTX_OK:
+if (res->hdr.flags & VFIO_USER_NO_REPLY) {
+g_free(res);
+return;
+}
+vfio_user_send_reply(proxy, &res->hdr, size);
+break;
+case MEMTX_ERROR:
+vfio_user_send_error(proxy, &res->hdr, EFAULT);
+break;
+case MEMTX_DECODE_ERROR:
+vfio_user_send_error(proxy, &res->hdr, ENODEV);
+break;
+case MEMTX_ACCESS_ERROR:
+vfio_user_send_error(proxy, &res->hdr, EPERM);
+break;
+default:
+error_printf("vfio_user_dma_read unknown error %d\n", r);
+vfio_user_send_error(vdev->vbasedev.proxy, &res->hdr, EINVAL);
+}
+}
+
+static void vfio_user_dma_write(VFIOPCIDevice *vdev, VFIOUserDMARW *msg)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOUserProxy *proxy = vdev->vbasedev.pro

[PATCH v8 27/28] vfio-user: add 'x-msg-timeout' option

2025-02-19 Thread John Levon
From: Jagannathan Raman 

By default, the vfio-user subsystem will wait 5 seconds for a message
reply from the server. Add an option to allow this to be configurable.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 7 ---
 hw/vfio-user/common.h | 1 +
 hw/vfio-user/pci.c| 4 
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index 38f8eef317..e44c8a2568 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -37,7 +37,6 @@
 #define VFIO_USER_MAX_REGIONS   100
 #define VFIO_USER_MAX_IRQS  50
 
-static int wait_time = 5000;   /* wait up to 5 sec for busy servers */
 static IOThread *vfio_user_iothread;
 
 static void vfio_user_shutdown(VFIOUserProxy *proxy);
@@ -707,7 +706,8 @@ void vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr 
*hdr,
 
 if (ret == 0) {
 while (!msg->complete) {
-if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
+if (!qemu_cond_timedwait(&msg->cv, &proxy->lock,
+ proxy->wait_time)) {
 VFIOUserMsgQ *list;
 
 list = msg->pending ? &proxy->pending : &proxy->outgoing;
@@ -740,7 +740,8 @@ void vfio_user_wait_reqs(VFIOUserProxy *proxy)
 msg->type = VFIO_MSG_WAIT;
 proxy->last_nowait = NULL;
 while (!msg->complete) {
-if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
+if (!qemu_cond_timedwait(&msg->cv, &proxy->lock,
+ proxy->wait_time)) {
 VFIOUserMsgQ *list;
 
 list = msg->pending ? &proxy->pending : &proxy->outgoing;
diff --git a/hw/vfio-user/common.h b/hw/vfio-user/common.h
index 72138220ba..9acf634ca7 100644
--- a/hw/vfio-user/common.h
+++ b/hw/vfio-user/common.h
@@ -62,6 +62,7 @@ typedef struct VFIOUserProxy {
 uint64_t max_bitmap;
 uint64_t migr_pgsize;
 int flags;
+uint32_t wait_time;
 QemuCond close_cv;
 AioContext *ctx;
 QEMUBH *req_bh;
diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c
index 8a05e69a46..fe096cc7a2 100644
--- a/hw/vfio-user/pci.c
+++ b/hw/vfio-user/pci.c
@@ -39,6 +39,7 @@ struct VFIOUserPCIDevice {
 bool no_direct_dma; /* disable shared mem for DMA */
 bool send_queued;   /* all sends are queued */
 bool no_post;   /* all regions write are sync */
+uint32_t wait_time; /* timeout for message replies */
 };
 
 /*
@@ -274,6 +275,8 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error 
**errp)
 if (udev->no_post) {
 proxy->flags |= VFIO_PROXY_NO_POST;
 }
+/* user specified or 5 sec default */
+proxy->wait_time = udev->wait_time;
 
 if (!vfio_user_validate_version(proxy, errp)) {
 goto error;
@@ -409,6 +412,7 @@ static const Property vfio_user_pci_dev_properties[] = {
 DEFINE_PROP_BOOL("no-direct-dma", VFIOUserPCIDevice, no_direct_dma, false),
 DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
 DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
+DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time, 5000),
 };
 
 static void vfio_user_pci_dev_class_init(ObjectClass *klass, void *data)
-- 
2.34.1




Re: [PATCH trivial] Makefile: "make dist" generates a .xz, not .bz2

2025-02-19 Thread Philippe Mathieu-Daudé

On 19/2/25 14:00, Michael Tokarev wrote:

Fixes: 9bc9e9511944 (make-release: switch to .xz format by default)
Signed-off-by: Michael Tokarev 
---
  Makefile | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PULL 23/32] tests/functional: extend test_aarch64_virt with vulkan test

2025-02-19 Thread Philippe Mathieu-Daudé

On 19/2/25 14:43, Thomas Huth wrote:

On 19/02/2025 14.25, Philippe Mathieu-Daudé wrote:

(+Markus for CLI)

On 10/1/25 14:17, Alex Bennée wrote:

Now that we have virtio-gpu Vulkan support, let's add a test for it.
Currently this is using images built by buildroot:

   https://lists.buildroot.org/pipermail/buildroot/2024- 
December/768196.html


Reviewed-by: Thomas Huth 
Signed-off-by: Alex Bennée 
Message-Id: <20250108121054.1126164-24-alex.ben...@linaro.org>

diff --git a/tests/functional/test_aarch64_virt.py b/tests/ 
functional/ test_aarch64_virt.py

index 201c5ed023..6b2336a28d 100755
--- a/tests/functional/test_aarch64_virt.py
+++ b/tests/functional/test_aarch64_virt.py
@@ -13,10 +13,12 @@
  import logging
  from subprocess import check_call, DEVNULL
+from qemu.machine.machine import VMLaunchFailure
+
  from qemu_test import QemuSystemTest, Asset
-from qemu_test import exec_command_and_wait_for_pattern
+from qemu_test import exec_command, exec_command_and_wait_for_pattern
  from qemu_test import wait_for_console_pattern
-from qemu_test import get_qemu_img
+from qemu_test import skipIfMissingCommands, get_qemu_img
  class Aarch64VirtMachine(QemuSystemTest):
@@ -132,5 +134,73 @@ def test_aarch64_virt_gicv2(self):
  self.common_aarch64_virt("virt,gic-version=2")
+    ASSET_VIRT_GPU_KERNEL = Asset(
+    'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
+    'download?path=%2F&files='
+    'Image',
+
'89e5099d26166204cc5ca4bb6d1a11b92c217e1f82ec67e3ba363d09157462f6')

+
+    ASSET_VIRT_GPU_ROOTFS = Asset(
+    'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
+    'download?path=%2F&files='
+    'rootfs.ext4.zstd',
+
'792da7573f5dc2913ddb7c638151d4a6b2d028a4cb2afb38add513c1924bdad4')

+
+    @skipIfMissingCommands('zstd')
+    def test_aarch64_virt_with_gpu(self):
+    # This tests boots with a buildroot test image that contains
+    # vkmark and other GPU exercising tools. We run a headless
+    # weston that nevertheless still exercises the virtio-gpu
+    # backend.
+
+    self.set_machine('virt')
+    self.require_accelerator("tcg")
+
+    kernel_path = self.ASSET_VIRT_GPU_KERNEL.fetch()
+    image_path = self.uncompress(self.ASSET_VIRT_GPU_ROOTFS, 
format="zstd")

+
+    self.vm.set_console()
+    kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
+   'console=ttyAMA0 root=/dev/vda')
+
+    self.vm.add_args("-accel", "tcg")
+    self.vm.add_args("-cpu", "neoverse-v1,pauth-impdef=on")
+    self.vm.add_args("-machine", "virt,gic-version=max",
+ '-kernel', kernel_path,
+ '-append', kernel_command_line)
+    self.vm.add_args("-smp", "2", "-m", "2048")
+    self.vm.add_args("-device",
+ "virtio-gpu-gl- 
pci,hostmem=4G,blob=on,venus=on")

+    self.vm.add_args("-display", "egl-headless")
+    self.vm.add_args("-display", "dbus,gl=on")


[*]


+    self.vm.add_args("-device", "virtio-blk-device,drive=hd0")
+    self.vm.add_args("-blockdev",
+ "driver=raw,file.driver=file,"
+ "node-name=hd0,read-only=on,"
+ f"file.filename={image_path}")
+    self.vm.add_args("-snapshot")
+
+    try:
+    self.vm.launch()
+    except VMLaunchFailure as excp:
+    if "old virglrenderer, blob resources unsupported" in 
excp.output:

+    self.skipTest("No blob support for virtio-gpu")
+    elif "old virglrenderer, venus unsupported" in excp.output:
+    self.skipTest("No venus support for virtio-gpu")


This seems dependent on the order of the CLI arguments, as I got:

qemu-system-aarch64: -device virtio-gpu-gl- 
pci,hostmem=4G,blob=on,venus=on: 'virtio-gpu-gl-pci' is not a valid 
device model name


I understand it is too complex to check this device's availability with
meson in order to avoid running the test.

Can we use device introspection instead, like we do in QTest with
qtest_qom_has_concrete_type() for accelerators? Maybe in the lines of:

   @skipIfMissingQOMType('virtio-gpu-gl-pci')


We already have "self.require_device('...')" that can be used to check 
for the availability of devices and skip the test if it is not built 
in ... would that be suitable here?


Yes, perfect, thanks!



  Thomas






Re: [PATCH v6 2/4] migration: enable multifd and postcopy together

2025-02-19 Thread Prasad Pandit
Hello Fabiano,

On Tue, 18 Feb 2025 at 19:52, Fabiano Rosas  wrote:
> Do you concede that this code has a hidden assumption? Either that
> migrate_multifd() != migrate_postcopy_preempt() or that multifd channels
> must be set up before postcopy preempt channel? Because that is enough
> for us to have to do something about it. Either restructuring or a
> comment explaining.

* Not a hidden assumption, but it is an observation that 'main' and
'multifd' channels are established before 'postcopy' ones. And for new
migration to start, it is necessary that 'main' and 'multifd' channels
(when enabled) are established before migration starts.

> > * When does postcopy preempt channel creation race with the multifd
> > channel creation?
>
> For instance, postcopy_do_resume() has this comment:
> /*
>  * If preempt is enabled, re-establish the preempt channel.  Note that
>  * we do it after resume prepare to make sure the main channel will be
>  * created before the preempt channel.  E.g. with weak network, the
>  * dest QEMU may get messed up with the preempt and main channels on
>  * the order of connection setup.  This guarantees the correct order.
>  */
> It looks like if the main channel can race, so do the multifd channels,
> no? In any case, I'm fine with just documenting any assumption for now.

* The first requirement for this race to occur is that two types of
channels are created together at the same time. Let's see:

   * Postcopy migration:  without multifd enabled
  - 'main' channel is created before the migration starts. And
'postcopy' channels are created towards the end of precopy migration,
when the Postcopy phase starts. So in this scenario the race does not
happen.

   * Postcopy resume: without multifd enabled
  - As described in the comment above, preempt channel is created
_after_ the 'main' channel to avoid the race condition.

   * Postcopy migration: with multifd enabled
  - 'main' and 'multifd' channels are created before migration
starts. And 'postcopy' channels are created towards the end of precopy
migration, when the Postcopy phase starts. No race occurs.

   * Postcopy resume: with multifd enabled
  - 'multifd' channels are shutdown before Postcopy starts, ie. no
'multifd' channels exist during Postcopy resume. So no race between
'postcopy' and 'multifd' channels.
  - And 'postcopy' channels are created after the 'main' channel
to avoid the race between them.
  - postcopy_do_resume() does not seem to create 'multifd' channels.

   * Multifd migration: without Postcopy enabled
  - 'main' and 'multifd' channels are created before the migration
starts. They both send 'magic value' bytes, so are easier to
differentiate. No race occurs.


> > * migration_needs_multiple_sockets() => return migrate_multifd() ||
> > migrate_postcopy_preempt();
> >
> Nope, this is just saying whether a single channel is expected, or more
> than one.

* If we read it as a question:
- migration_needs_multiple_sockets() ? True => Yes, migration
needs multiple sockets.
- migration_needs_multiple_sockets() ? False => No, migration does
not need multiple sockets.

Then it should return 'True' when both migrate_multifd() and
postcopy_preempt() are enabled.

>That's why I think it would be a good gate for this peeking
> code. Since postcopy preempt could be a peekable channel, it's
> misleading to put it all behind QIO_CHANNEL_FEATURE_READ_MSG_PEEK
> only. This is a time-bomb for the next person to refactor this code.

* Postcopy preempt could be a peekable channel ? Currently it does not
send magic value, does it?

> Right, but that's not what we have today. Changing this requires
> figuring out how to keep the stream compatible when channels now start
> sending extra stuff at the start. It's not trivial. There's also
> mapped-ram which is asynchronous and there might be something special to
> be done about the TLS handshake, I'm not sure.

* True, it's not trivial.

> Well, aside from preempt, they're *not* dependent on the order. That's
> the point of having to do all of this dance. In fact we might be better
> off if we could serialize the connections somehow.
>
> I havent't followed this series closely, could you point me to the
> discussion that led to the channels concept being introduced?

* Channels concept was not introduced in this series. It has been
there since the beginning, no?

> Yes. They *can* be used without multifd. The comment would explain that
> at that point in the code, these are the only types possible. So as to
> not mislead future readers that whenever tls/file, then multifd must be
> used.

> See? Multifd mutually exclusive with postcopy preempt. You carried that
> assumption (well done), but made it more subtle (not good), since
> if/else is by definition showing the relationship between the two while
> migration_has_main_and_multifd_channels() makes it hidden under the
> multifd check allowing the last return true to happ

Re: [PATCH v2 09/11] rust/block: Add read support for block drivers

2025-02-19 Thread Kevin Wolf
Am 19.02.2025 um 07:11 hat Paolo Bonzini geschrieben:
> On 2/18/25 19:20, Kevin Wolf wrote:
> > +/// The described blocks are stored in a child node.
> > +Data {
> > +/// Child node in which the data is stored
> > +node: Arc,
> 
> Having Arc<> here shouldn't be necessary, since the BdrvChild is already
> reference counted.  Since the code is called under the bdrv_graph_rdlock
> there's no risk of the BdrvChild going away, and you can just make it a
> &BdrvChild.

That would mean that you need keep the BlockDriver borrowed as long as
you're using the mapping. It would work today, but as soon as I want to
cache mappings, it won't any more.

> Likewise, even BochsImage should not need a standard Rust Arc.
> However you need to add your own block::Arc and map Clone/Drop to
> bdrv_ref/bdrv_unref.  Then BochsImage can use block::Arc; this
> makes it even clearer that Mapping should not use the Arc<> wrapper, because
> bdrv_ref is GLOBAL_STATE_CODE() and would abort if run from a non-main
> thread.

It's not BdrvChild that is refcounted on the C side, but
BlockDriverState. We definitely don't bdrv_ref()/unref() for each
request on the C side and we shouldn't on the Rust side either. The
refcount only changes when you modify the graph.

I'm not entirely sure how your block::Arc is supposed to work. It
would be tied to one specific type (BlockDriverState), not generic.
Which probably means that it can't be a separate pointer type, but
BlockDriverState itself should just implement Clone with bdrv_ref().

Though that doesn't help here, obviously, because we have a BdrvChild.

> That said, I'm not sure how to include "block graph lock must be taken" into
> the types, yet.  That has to be taken into account too, sooner or later.
> You probably have a lot of items like this one so it'd be nice to have TODO
> comments as much as you can.

Actually, I'm not aware of that many items. But yes, there is a TODO
item for the graph lock.

I think I'll have something like:

pub struct BdrvChild {
child: GraphLock<*mut bindings::BdrvChild>,
}

where you can access the inner object either by calling a lock function,
or passing another graph lock guard that you already own. And for the
FFI boundary unsafe functions like "I promise I already own the lock".

> (This boundary is where you get an unholy mix of C and Rust concepts. It
> takes a while to get used to, and it teaches you a lot of the parts of Rust
> that you usually take for granted.  So while it's not hard, it's unusual and
> it does feel like water and oil in the beginning).
> 
> > +) -> std::os::raw::c_int {
> > +let s = unsafe { &mut *((*bs).opaque as *mut D) };
> 
> &mut is not safe here (don't worry, we went through the same thing for
> devices :)).  You can only get an & unless you go through an UnsafeCell (or
> something that contains one).

Right, we can have multiple requests in flight.

The fix is easy here: Even though bindgen gives us a *mut, we only want
a immutable reference.

> You'll need to split the mutable and immutable parts of BochsImage in
> separate structs, and embed the former into the latter.  Long term you
> there should be a qemu_api::coroutine::CoMutex<>, but for the short
> term you can just use a BqlRefCell<> or a standard Rust RefCell<>.
> You can see how PL011Registers is included into PL011State in
> rust/hw/char/pl011/src/device.rs, and a small intro is also present in
> docs/devel/rust.rst.

There is no mutable part in BochsImage, which makes this easy. The only
thing is the *mut bindings::BdrvChild, but we never dereference that in
Rust. It is also essentially interior mutability protected by the graph
lock, even though this isn't explicit yet.

But if we were to introduce a mutable part (I think we will add write
support to it sooner or later), then BqlRefCell or RefCell are
definitely not right. They would only turn the UB into a safe panic when
you have more than one request in flight. (Or actually, BqlRefCell
should already panic with just one request from an iothread, because we
don't actually hold the BQL.)

> Anyway, the BdrvChild needs to remain in BochsImage, so that it is
> accessible outside the CoMutex critical section and can be placed into
> the Mapping.
> 
> > +let mut offset = offset as u64;
> > +let mut bytes = bytes as u64;
> > +
> > +while bytes > 0 {
> > +let req = Request::Read { offset, len: bytes };
> > +let mapping = match qemu_co_run_future(s.map(&req)) {
> > +Ok(mapping) => mapping,
> > +Err(e) => return -i32::from(Errno::from(e).0),
> 
> This is indeed not great, but it's partly so because you're doing a
> lot (for some definition of "a lot") in the function.  While it would
> be possible to use a trait, I wrote the API thinking of minimal glue
> code that only does the C<->Rust conversion.
> 
> In this case, because you have a lot more code than just a call into
> the BlockDriver trait, you'd have something like
>

Re: [PATCH 2/2] [NOT-FOR-MERGE] Add qtest for migration over RDMA

2025-02-19 Thread Fabiano Rosas
Peter Xu  writes:

> On Wed, Feb 19, 2025 at 05:33:26AM +, Zhijian Li (Fujitsu) wrote:
>> 
>> 
>> On 19/02/2025 06:40, Peter Xu wrote:
>> > On Tue, Feb 18, 2025 at 06:03:48PM -0300, Fabiano Rosas wrote:
>> >> Li Zhijian via  writes:
>> >>
>> >>> This qtest requires that there is an RXE link in the host.
>> >>>
>> >>> Here is an example to show how to add this RXE link:
>> >>> $ ./new-rdma-link.sh
>> >>> 192.168.22.93
>> >>>
>> >>> Signed-off-by: Li Zhijian 
>> >>> ---
>> >>> The RDMA migration was broken again...due to lack of sufficient 
>> >>> test/qtest.
>> >>>
>> >>> It's ugly to add and execute a script to establish an RDMA link in
>> >>> the C program. If anyone has a better suggestion, please let me know.
>> >>>
>> >>> $ cat ./new-rdma-link.sh
>> >>> get_ipv4_addr() {
>> >>>  ip -4 -o addr show dev "$1" |
>> >>>  sed -n 
>> >>> 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
>> >>> }
>> >>>
>> >>> has_soft_rdma() {
>> >>>  rdma link | grep -q " netdev $1[[:blank:]]*\$"
>> >>> }
>> >>>
>> >>> start_soft_rdma() {
>> >>>  local type
>> >>>
>> >>>  modprobe rdma_rxe || return $?
>> >>>  type=rxe
>> >>>  (
>> >>>  cd /sys/class/net &&
>> >>>  for i in *; do
>> >>>  [ -e "$i" ] || continue
>> >>>  [ "$i" = "lo" ] && continue
>> >>>  [ "$(<"$i/addr_len")" = 6 ] || continue
>> >>>  [ "$(<"$i/carrier")" = 1 ] || continue
>> >>>  has_soft_rdma "$i" && break
>> >>>  rdma link add "${i}_$type" type $type 
>> >>> netdev "$i" && break
>> >>>  done
>> >>>  has_soft_rdma "$i" && echo $i
>> >>>  )
>> >>>
>> >>> }
>> >>>
>> >>> rxe_link=$(start_soft_rdma)
>> >>> [[ "$rxe_link" ]] && get_ipv4_addr $rxe_link
>> >>>
>> >>> Signed-off-by: Li Zhijian 
>> >>> ---
>> >>>   tests/qtest/migration/new-rdma-link.sh |  34 
>> >>>   tests/qtest/migration/precopy-tests.c  | 103 +
>> >>>   2 files changed, 137 insertions(+)
>> >>>   create mode 100644 tests/qtest/migration/new-rdma-link.sh
>> >>>
>> >>> diff --git a/tests/qtest/migration/new-rdma-link.sh 
>> >>> b/tests/qtest/migration/new-rdma-link.sh
>> >>> new file mode 100644
>> >>> index 000..ca20594eaae
>> >>> --- /dev/null
>> >>> +++ b/tests/qtest/migration/new-rdma-link.sh
>> >>> @@ -0,0 +1,34 @@
>> >>> +#!/bin/bash
>> >>> +
>> >>> +# Copied from blktests
>> >>> +get_ipv4_addr() {
>> >>> +ip -4 -o addr show dev "$1" |
>> >>> +sed -n 
>> >>> 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p'
>> >>> +}
>> >>> +
>> >>> +has_soft_rdma() {
>> >>> +rdma link | grep -q " netdev $1[[:blank:]]*\$"
>> >>> +}
>> >>> +
>> >>> +start_soft_rdma() {
>> >>> +local type
>> >>> +
>> >>> +modprobe rdma_rxe || return $?
>> >>> +type=rxe
>> >>> +(
>> >>> +cd /sys/class/net &&
>> >>> +for i in *; do
>> >>> +[ -e "$i" ] || continue
>> >>> +[ "$i" = "lo" ] && continue
>> >>> +[ "$(<"$i/addr_len")" = 6 ] || continue
>> >>> +[ "$(<"$i/carrier")" = 1 ] || continue
>> >>> +has_soft_rdma "$i" && break
>> >>> +rdma link add "${i}_$type" type $type 
>> >>> netdev "$i" && break
>> >>> +done
>> >>> +has_soft_rdma "$i" && echo $i
>> >>> +)
>> >>> +
>> >>> +}
>> >>> +
>> >>> +rxe_link=$(start_soft_rdma)
>> >>> +[[ "$rxe_link" ]] && get_ipv4_addr $rxe_link
>> >>> diff --git a/tests/qtest/migration/precopy-tests.c 
>> >>> b/tests/qtest/migration/precopy-tests.c
>> >>> index 162fa695318..d2a1c9c9438 100644
>> >>> --- a/tests/qtest/migration/precopy-tests.c
>> >>> +++ b/tests/qtest/migration/precopy-tests.c
>> >>> @@ -98,6 +98,105 @@ static void test_precopy_unix_dirty_ring(void)
>> >>>   test_precopy_common(&args);
>> >>>   }
>> >>>   
>> >>> +static int new_rdma_link(char *buffer) {
>> >>> +// Copied from blktests
>> >>> +const char *script =
>> >>> +"#!/bin/bash\n"
>> >>> +"\n"
>> >>> +"get_ipv4_addr() {\n"
>> >>> +"ip -4 -o addr show dev \"$1\" |\n"
>> >>> +"sed -n 
>> >>> 's/.*[[:blank:]]inet[[:blank:]]*\\([^[:blank:]/]*\\).*/\\1/p'\n"
>> >>> +"}\n"
>> >>> +"\n"
>> >>> +"has_soft_rdma() {\n"
>> >>> +"rdma link | grep -q \" netdev $1[[:blank:]]*\\$\"\n"
>> >>> +"}\n"
>> >>> +"\n"
>> >>> +"start_soft_rdma() {\n"
>> >>> +"local type\n"
>> >>> +"\n"
>> >>> +"modprobe rdma_rxe || return $?\n"
>> >>> +"type=rxe\n"
>> >>> + 

[PATCH trivial] Makefile: "make dist" generates a .xz, not .bz2

2025-02-19 Thread Michael Tokarev
Fixes: 9bc9e9511944 (make-release: switch to .xz format by default)
Signed-off-by: Michael Tokarev 
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index b65b0bd41a..c92a3cf785 100644
--- a/Makefile
+++ b/Makefile
@@ -207,10 +207,10 @@ clean: recurse-clean
 
 VERSION = $(shell cat $(SRC_PATH)/VERSION)
 
-dist: qemu-$(VERSION).tar.bz2
+dist: qemu-$(VERSION).tar.xz
 
-qemu-%.tar.bz2:
-   $(SRC_PATH)/scripts/make-release "$(SRC_PATH)" "$(patsubst 
qemu-%.tar.bz2,%,$@)"
+qemu-%.tar.xz:
+   $(SRC_PATH)/scripts/make-release "$(SRC_PATH)" "$(patsubst 
qemu-%.tar.xz,%,$@)"
 
 distclean: clean recurse-distclean
-$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) -t clean -g || 
:
-- 
2.39.5




Re: [PATCH 00/42] docs: add sphinx-domain rST generator to qapidoc

2025-02-19 Thread Markus Armbruster
John Snow  writes:

> "The text handler you add looks just like the existing latex handler. Does
> LaTeX output lack "little headings", too?"
>
> Yes, almost certainly. Can you let me know which output formats we actually
> "care about"? I'll have to test them all.

As far as I can tell, our build system runs sphinx-build -b html and -b
man.

I run it with -b text manually all the time to hunt for and review
changes in output.  I'd prefer to keep it working if practical.

For what it's worth, there is a bit of LaTeX configuration in
docs/conf.py.

>   In the meantime, I upgraded my
> patch so that the text translator properly handles branches with headings
> that delineate the different branches so that the text output is fully
> reasonable. I will need to do the same for any format we care about.
>
> I've re-pushed as of "about 30 minutes before I wrote this email" --
> https://gitlab.com/jsnow/qemu/-/commits/sphinx-domain-blergh2
>
> This branch includes the text generator fixes (which technically belong
> with the predecessor series we skipped, but I'll refactor that later.)
> it also includes fixes to the branch inliner, generated return statements,
> and generated out-of-band feature sections.

I'll fetch it, thanks!

> (Long story short: inserting new sections in certain spots was broken
> because of cache. Oops. We can discuss more why I wrote that part of the
> code like I did in review for the patch that introduced that problem. It's
> the "basic inliner" patch.)
>
> Below, I'm going to try a new communication approach where I explicitly say
> if I have added something to my tasklist or not so that it's clear to you
> what I believe is actionable (and what I am agreeing to change) and what I
> believe needs stronger input from you before I do anything. Apologies if it
> seems a little robotic, just trying new things O:-)
>
> On that note: not added to tasklist: do we need the LaTeX handler? Do we
> need any others? Please confirm O:-)

See above.

> On Fri, Feb 14, 2025 at 7:05 AM Markus Armbruster  wrote:
>
>> I started to eyeball old and new generated output side by side.
>>
>> New table of contents shows one level, old two.  No objection; the
>> navigation thingie on the left is more useful anyway.
>>
>
> Unintentional, but if you like it, it's fine by me. Nothing added to my
> tasklist.

Mention in a commit message.

>> The new generator elides unreferenced types.  Generally good, but two
>> observations:
>>
>> * QapiErrorClass is unreferenced, but its members are mentioned in
>>   Errors sections.  QapiErrorClass serves as better than nothing error
>>   code documentation, but it's gone in the new doc.  So this is a minor
>>   regression.  We can figure out what to do about it later.
>>
>
> Right. I debated making the members references to that class, but recalled
> that you disliked this class and figured you'd not like such a change, so I
> just left it alone. I do not have cross-references for individual members
> of objects at all yet anyway, so this is definitely more work regardless.
>
> We could always create a pragma of some sort (or just hardcode a list) of
> items that must be documented regardless of if they're referenced or not.
> Please let me know your preference and I will add a "ticket" on my personal
> tasklist for this project to handle that at /some point/. Nothing added to
> my tasklist just yet.

Suggest to add something like "compensate for the loss of QapiErrorClass
documentation in the QEMU QMP Reference Manual".

>> * Section "QMP errors" is empty in the new doc, because its entire
>>   contents is elided.  I guess we should elide the section as well, but
>>   it's fine to leave that for later.
>>
>
> Adding to tasklist to elide empty modules, but "for later".

ACK

>> Old doc shows a definition's since information like any other section.
>> New doc has it in the heading box.  Looks prettier and uses much less
>> space.  Not sure the heading box is the best place, but it'll do for
>> now, we can always move it around later.
>>
>
> Agree, it's a strict improvement - there may be further improvements, but
> that is always true anyway. When we tackle "autogenerated since
> information" we can tackle the since display issues more meticulously. Or
> maybe we'll need do sooner because of conflicting info in branches or
> whatever else. I dunno, I'll burn that bridge when I get to it. Nothing
> added to tasklist.

ACK

>> The new doc's headings use "Struct" or "Union" where the old one uses
>> just "Object".  Let's keep "Object", please.
>>
>
> I was afraid you'd ask for this. OK, I think it's an easy change. Can I
> keep the index page segmented by object type still, though?
>
> I do find knowing the *type* of object to be helpful as a developer,

Can you explain why and how struct vs. union matters to you as a
developer?

>  though
> I understand that from the point 

Re: [PATCH 1/2] migration: Prioritize RDMA in ram_save_target_page()

2025-02-19 Thread Peter Xu
On Wed, Feb 19, 2025 at 09:39:38AM +, Zhijian Li (Fujitsu) wrote:
> 
> 
> On 19/02/2025 06:03, Peter Xu wrote:
> > On Tue, Feb 18, 2025 at 05:30:40PM -0300, Fabiano Rosas wrote:
> >> Li Zhijian via  writes:
> >>
> >>> Address an error in RDMA-based migration by ensuring RDMA is prioritized
> >>> when saving pages in `ram_save_target_page()`.
> >>>
> >>> Previously, the RDMA protocol's page-saving step was placed after other
> >>> protocols due to a refactoring in commit bc38dc2f5f3. This led to 
> >>> migration
> >>> failures characterized by unknown control messages and state loading 
> >>> errors
> >>> destination:
> >>> (qemu) qemu-system-x86_64: Unknown control message QEMU FILE
> >>> qemu-system-x86_64: error while loading state section id 1(ram)
> >>> qemu-system-x86_64: load of migration failed: Operation not permitted
> >>> source:
> >>> (qemu) qemu-system-x86_64: RDMA is in an error state waiting migration to 
> >>> abort!
> >>> qemu-system-x86_64: failed to save SaveStateEntry with id(name): 1(ram): 
> >>> -1
> >>> qemu-system-x86_64: rdma migration: recv polling control error!
> >>> qemu-system-x86_64: warning: Early error. Sending error.
> >>> qemu-system-x86_64: warning: rdma migration: send polling control error
> >>>
> >>> RDMA migration implemented its own protocol/method to send pages to
> >>> destination side, hand over to RDMA first to prevent pages being saved by
> >>> other protocol.
> >>>
> >>> Fixes: bc38dc2f5f3 ("migration: refactor ram_save_target_page functions")
> >>> Signed-off-by: Li Zhijian 
> >>> ---
> >>>   migration/ram.c | 9 +
> >>>   1 file changed, 5 insertions(+), 4 deletions(-)
> >>>
> >>> diff --git a/migration/ram.c b/migration/ram.c
> >>> index 6f460fd22d2..635a2fe443a 100644
> >>> --- a/migration/ram.c
> >>> +++ b/migration/ram.c
> >>> @@ -1964,6 +1964,11 @@ static int ram_save_target_page(RAMState *rs, 
> >>> PageSearchStatus *pss)
> >>>   ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
> >>>   int res;
> >>>   
> >>> +/* Hand over to RDMA first */
> >>> +if (control_save_page(pss, offset, &res)) {
> >>> +return res;
> >>> +}
> >>> +
> >>
> >> Can we hoist that migrate_rdma() from inside the function? Since the
> >> other paths already check first before calling their functions.
> > 
> 
> Yeah, it sounds good to me.
> 
> 
> > If we're talking about hoist and stuff.. and if we want to go slightly
> > further, I wonder if we could also drop RAM_SAVE_CONTROL_NOT_SUPP.
> > 
> >  if (!migrate_rdma() || migration_in_postcopy()) {
> >  return RAM_SAVE_CONTROL_NOT_SUPP;
> >  }
> > 
> > We should make sure rdma_control_save_page() won't get invoked at all in
> > either case above..  
> 
> > For postcopy, maybe we could fail in the QMP migrate /
> > migrate_incoming cmd, at migration_channels_and_transport_compatible()
> 
> I tried to kill RAM_SAVE_CONTROL_NOT_SUPP, but It seems it doesn't need to 
> touch any postcopy logic
> "in the QMP migrate / migrate_incoming cmd, at 
> migration_channels_and_transport_compatible()"
> 
> Is there something I might have overlooked?

Yes it looks almost good.  What I meant is (please see below):

> 
> A whole draft diff would be like below:
> It includes 3 parts:
> 
> migration/rdma: Remove unnecessary RAM_SAVE_CONTROL_NOT_SUPP check in 
> rdma_control_save_page()
> migration: kill RAM_SAVE_CONTROL_NOT_SUPP
> migration: open control_save_page() to ram_save_target_page()
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 589b6505eb2..fc6a964fd64 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1143,32 +1143,6 @@ static int save_zero_page(RAMState *rs, 
> PageSearchStatus *pss,
>   return len;
>   }
>   
> -/*
> - * @pages: the number of pages written by the control path,
> - *< 0 - error
> - *> 0 - number of pages written
> - *
> - * Return true if the pages has been saved, otherwise false is returned.
> - */
> -static bool control_save_page(PageSearchStatus *pss,
> -  ram_addr_t offset, int *pages)
> -{
> -int ret;
> -
> -ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, 
> offset,
> - TARGET_PAGE_SIZE);
> -if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
> -return false;
> -}
> -
> -if (ret == RAM_SAVE_CONTROL_DELAYED) {
> -*pages = 1;
> -return true;
> -}
> -*pages = ret;
> -return true;
> -}
> -
>   /*
>* directly send the page to the stream
>*
> @@ -1964,6 +1938,16 @@ static int ram_save_target_page(RAMState *rs, 
> PageSearchStatus *pss)
>   ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
>   int res;
>   
> +if (migrate_rdma() && !migration_in_postcopy()) {

Here instead of bypassing postcopy, we should fail the migrate cmd early if
postcopy ever enabled:

diff --git a/migration/migration.c b/migration/migration.c
index 862f469ea7..3a82e71437 100644
--- a/migra

Re: [PULL 23/32] tests/functional: extend test_aarch64_virt with vulkan test

2025-02-19 Thread Philippe Mathieu-Daudé

(+Markus for CLI)

On 10/1/25 14:17, Alex Bennée wrote:

Now that we have virtio-gpu Vulkan support, let's add a test for it.
Currently this is using images built by buildroot:

   https://lists.buildroot.org/pipermail/buildroot/2024-December/768196.html

Reviewed-by: Thomas Huth 
Signed-off-by: Alex Bennée 
Message-Id: <20250108121054.1126164-24-alex.ben...@linaro.org>

diff --git a/tests/functional/test_aarch64_virt.py 
b/tests/functional/test_aarch64_virt.py
index 201c5ed023..6b2336a28d 100755
--- a/tests/functional/test_aarch64_virt.py
+++ b/tests/functional/test_aarch64_virt.py
@@ -13,10 +13,12 @@
  import logging
  from subprocess import check_call, DEVNULL
  
+from qemu.machine.machine import VMLaunchFailure

+
  from qemu_test import QemuSystemTest, Asset
-from qemu_test import exec_command_and_wait_for_pattern
+from qemu_test import exec_command, exec_command_and_wait_for_pattern
  from qemu_test import wait_for_console_pattern
-from qemu_test import get_qemu_img
+from qemu_test import skipIfMissingCommands, get_qemu_img
  
  
  class Aarch64VirtMachine(QemuSystemTest):

@@ -132,5 +134,73 @@ def test_aarch64_virt_gicv2(self):
  self.common_aarch64_virt("virt,gic-version=2")
  
  
+ASSET_VIRT_GPU_KERNEL = Asset(

+'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
+'download?path=%2F&files='
+'Image',
+'89e5099d26166204cc5ca4bb6d1a11b92c217e1f82ec67e3ba363d09157462f6')
+
+ASSET_VIRT_GPU_ROOTFS = Asset(
+'https://fileserver.linaro.org/s/ce5jXBFinPxtEdx/'
+'download?path=%2F&files='
+'rootfs.ext4.zstd',
+'792da7573f5dc2913ddb7c638151d4a6b2d028a4cb2afb38add513c1924bdad4')
+
+@skipIfMissingCommands('zstd')
+def test_aarch64_virt_with_gpu(self):
+# This tests boots with a buildroot test image that contains
+# vkmark and other GPU exercising tools. We run a headless
+# weston that nevertheless still exercises the virtio-gpu
+# backend.
+
+self.set_machine('virt')
+self.require_accelerator("tcg")
+
+kernel_path = self.ASSET_VIRT_GPU_KERNEL.fetch()
+image_path = self.uncompress(self.ASSET_VIRT_GPU_ROOTFS, format="zstd")
+
+self.vm.set_console()
+kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
+   'console=ttyAMA0 root=/dev/vda')
+
+self.vm.add_args("-accel", "tcg")
+self.vm.add_args("-cpu", "neoverse-v1,pauth-impdef=on")
+self.vm.add_args("-machine", "virt,gic-version=max",
+ '-kernel', kernel_path,
+ '-append', kernel_command_line)
+self.vm.add_args("-smp", "2", "-m", "2048")
+self.vm.add_args("-device",
+ "virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on")
+self.vm.add_args("-display", "egl-headless")
+self.vm.add_args("-display", "dbus,gl=on")


[*]


+self.vm.add_args("-device", "virtio-blk-device,drive=hd0")
+self.vm.add_args("-blockdev",
+ "driver=raw,file.driver=file,"
+ "node-name=hd0,read-only=on,"
+ f"file.filename={image_path}")
+self.vm.add_args("-snapshot")
+
+try:
+self.vm.launch()
+except VMLaunchFailure as excp:
+if "old virglrenderer, blob resources unsupported" in excp.output:
+self.skipTest("No blob support for virtio-gpu")
+elif "old virglrenderer, venus unsupported" in excp.output:
+self.skipTest("No venus support for virtio-gpu")


This seems dependent on the order of the CLI arguments, as I got:

qemu-system-aarch64: -device 
virtio-gpu-gl-pci,hostmem=4G,blob=on,venus=on: 'virtio-gpu-gl-pci' is 
not a valid device model name


I understand it is too complex to check this device availability with
meson, in order to avoid running the test.

Can we use device introspection instead, like we do in QTest with
qtest_qom_has_concrete_type() for accelerators? Maybe in the lines of:

  @skipIfMissingQOMType('virtio-gpu-gl-pci')

Or having a skipIfMissingDevice -> skipIfMissingQOMType alias:

  @skipIfMissingDevice('virtio-gpu-gl-pci')

tests/functional/test_virtio_version.py already uses some of it:

 def devtype_implements(vm, devtype, implements):
 return devtype in [d['name'] for d in
vm.cmd('qom-list-types', implements=implements)]


+else:
+self.log.info("unhandled launch failure: {excp.output}")
+raise excp
+
+self.wait_for_console_pattern('buildroot login:')
+exec_command(self, 'root')
+exec_command(self, 'export XDG_RUNTIME_DIR=/tmp')
+exec_command_and_wait_for_pattern(self,
+  "weston -B headless "
+  "--renderer gl "
+  "--shell kiosk "
+ 

Re: [PATCH] target/arm: Fix signed integer overflow undefined behavior.

2025-02-19 Thread Peter Maydell
On Tue, 18 Feb 2025 at 22:22, Stephen Longfield  wrote:
>
> The problem is internal to t32_expandimm_imm, the imm intermediate
> immediate value. This value is sourced from x, which always comes from
> the return of a deposit32 call, which returns uint32_t already.
>
> It's extracted via: int imm = extract32(x, 0, 8);, so the value will be
> between 0-255
>
> It is then multiplied by one of 1, 0x00010001, 0x01000100, 0x01010101,
> or 0x80.
>
> Values between 128-255 multiplied by 0x01000100 or 0x01010101 will cause
> the upper bit to get set, which is a signed integer overflow. From
> Chapter 6.5, paragraph 5 of the C11 spec:
> https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1548.pdf this is
> undefined behavior.

QEMU always compiles with -fwrapv. This means that this integer
overflow is not undefined behaviour in our dialect of C.

> Though this is a minor undefined behavior, I'd like to see this fixed,
> since the error is showing up when I enable clang's sanitizers while
> looking for other issues.

If clang's sanitizer reports the overflow as UB when built with
-fwrapv, that is a bug in the sanitizer and you should get
it fixed in clang.

We use and rely on 2s complement handling of signed integers
in a lot of places, so if you try to find and fix them
all you're going to be playing a pointless game of whackamole.

> Signed-off-by: Stephen Longfield 
> Signed-off-by: Roque Arcudia Hernandez 
> ---
>  target/arm/tcg/translate.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
> index 68ac393415..8770f0ce1c 100644
> --- a/target/arm/tcg/translate.c
> +++ b/target/arm/tcg/translate.c
> @@ -3508,9 +3508,9 @@ static int t32_expandimm_rot(DisasContext *s, int x)
>  }
>
>  /* Return the unrotated immediate from T32ExpandImm.  */
> -static int t32_expandimm_imm(DisasContext *s, int x)
> +static uint32_t t32_expandimm_imm(DisasContext *s, uint32_t x)

This function is following the API for decodetree !function
filters, which return 'int', not 'uint32_t'.

>  {
> -int imm = extract32(x, 0, 8);
> +uint32_t imm = extract32(x, 0, 8);

Given what we're doing in the function, it is reasonable
to make this a uint32_t, though.

>
>  switch (extract32(x, 8, 4)) {
>  case 0: /* XY */

thanks
-- PMM



Re: [PATCH 1/2] rust: subprojects: add libc crate

2025-02-19 Thread Zhao Liu
On Thu, Feb 13, 2025 at 03:32:15PM +0100, Paolo Bonzini wrote:
> Date: Thu, 13 Feb 2025 15:32:15 +0100
> From: Paolo Bonzini 
> Subject: [PATCH 1/2] rust: subprojects: add libc crate
> X-Mailer: git-send-email 2.48.1
> 
> This allows access to errno values.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  rust/Cargo.lock   |  7 
>  rust/qemu-api/Cargo.toml  |  1 +
>  scripts/archive-source.sh |  2 +-
>  scripts/make-release  |  2 +-
>  subprojects/libc-0.2-rs.wrap  |  7 
>  .../packagefiles/libc-0.2-rs/meson.build  | 36 +++
>  6 files changed, 53 insertions(+), 2 deletions(-)
>  create mode 100644 subprojects/libc-0.2-rs.wrap
>  create mode 100644 subprojects/packagefiles/libc-0.2-rs/meson.build

Missing to update subprojects/.gitignore. With this nit fixed,

Reviewed-by: Zhao Liu 




Re: [PATCH] microvm: do not use the lastest cpu version

2025-02-19 Thread Igor Mammedov
On Thu, 13 Feb 2025 15:11:00 +0530
Ani Sinha  wrote:

> Microvm machines are not versioned and therefore there is no requirement to 
> use
> the latest cpu model by default. Let microvms use the non-versioned cpu model.
> Those users who need specific cpu versions can use explicit commandline to
> select the cpu version desired.

Above says why we can do this but,
I'd also add here a reason why we are doing that to begin with. 

> 
> CC: imamm...@redhat.com
> CC: zhao1@intel.com
> Signed-off-by: Ani Sinha 

Reviewed-by: Igor Mammedov 

> ---
>  hw/i386/microvm.c |  2 +-
>  target/i386/cpu.c | 15 ---
>  target/i386/cpu.h |  4 
>  3 files changed, 1 insertion(+), 20 deletions(-)
> 
> Pipeline passes:
> https://gitlab.com/anisinha/qemu/-/pipelines/1669159835
> See also Igor's comment on
> https://patchwork.ozlabs.org/project/qemu-devel/patch/20250128035526.3750043-1-anisi...@redhat.com/
> 
> diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
> index a8d354aabe..b8be1542ff 100644
> --- a/hw/i386/microvm.c
> +++ b/hw/i386/microvm.c
> @@ -458,7 +458,7 @@ static void microvm_machine_state_init(MachineState 
> *machine)
>  
>  microvm_memory_init(mms);
>  
> -x86_cpus_init(x86ms, CPU_VERSION_LATEST);
> +x86_cpus_init(x86ms, 1);
>  
>  microvm_devices_init(mms);
>  }
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index b5dd60d281..6d251c0025 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -5513,18 +5513,6 @@ void x86_cpu_set_default_version(X86CPUVersion version)
>  default_cpu_version = version;
>  }
>  
> -static X86CPUVersion x86_cpu_model_last_version(const X86CPUModel *model)
> -{
> -int v = 0;
> -const X86CPUVersionDefinition *vdef =
> -x86_cpu_def_get_versions(model->cpudef);
> -while (vdef->version) {
> -v = vdef->version;
> -vdef++;
> -}
> -return v;
> -}
> -
>  /* Return the actual version being used for a specific CPU model */
>  static X86CPUVersion x86_cpu_model_resolve_version(const X86CPUModel *model)
>  {
> @@ -5532,9 +5520,6 @@ static X86CPUVersion 
> x86_cpu_model_resolve_version(const X86CPUModel *model)
>  if (v == CPU_VERSION_AUTO) {
>  v = default_cpu_version;
>  }
> -if (v == CPU_VERSION_LATEST) {
> -return x86_cpu_model_last_version(model);
> -}
>  return v;
>  }
>  
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index c67b42d34f..71f150a05f 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -2701,10 +2701,6 @@ void apic_handle_tpr_access_report(DeviceState *d, 
> target_ulong ip,
> TPRAccess access);
>  
>  /* Special values for X86CPUVersion: */
> -
> -/* Resolve to latest CPU version */
> -#define CPU_VERSION_LATEST -1
> -
>  /*
>   * Resolve to version defined by current machine type.
>   * See x86_cpu_set_default_version()




Re: [PATCH] target/arm: Fix signed integer overflow undefined behavior.

2025-02-19 Thread Stephen Longfield
On Wed, Feb 19, 2025 at 7:26 AM Peter Maydell 
wrote:

> On Tue, 18 Feb 2025 at 22:22, Stephen Longfield 
> wrote:
> >
> > The problem is internal to t32_expandimm_imm, the imm intermediate
> > immediate value. This value is sourced from x, which always comes from
> > the return of a deposit32 call, which returns uint32_t already.
> >
> > It's extracted via: int imm = extract32(x, 0, 8);, so the value will be
> > between 0-255
> >
> > It is then multiplied by one of 1, 0x00010001, 0x01000100, 0x01010101,
> > or 0x80.
> >
> > Values between 128-255 multiplied by 0x01000100 or 0x01010101 will cause
> > the upper bit to get set, which is a signed integer overflow. From
> > Chapter 6.5, paragraph 5 of the C11 spec:
> > https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1548.pdf this is
> > undefined behavior.
>
> QEMU always compiles with -fwrapv. This means that this integer
> overflow is not undefined behaviour in our dialect of C.
>
> > Though this is a minor undefined behavior, I'd like to see this fixed,
> > since the error is showing up when I enable clang's sanitizers while
> > looking for other issues.
>
> If clang's sanitizer reports the overflow as UB when built with
> -fwrapv, that is a bug in the sanitizer and you should get
> it fixed in clang.
> We use and rely on 2s complement handling of signed integers
> in a lot of places, so if you try to find and fix them
> all you're going to be playing a pointless game of whackamole.
>

Yeah, I was running with `-ftrapv` instead of `-fwrapv` looking for errors
in other code.
This was the only place that got flagged, but sounds like that's likely
just an artifact
of the test I was running. (Though, there is a vanishingly small amount of
math
done on `int`s in target/arm/tcg/translate.c, which also probably helps.)


> > Signed-off-by: Stephen Longfield 
> > Signed-off-by: Roque Arcudia Hernandez 
> > ---
> >  target/arm/tcg/translate.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
> > index 68ac393415..8770f0ce1c 100644
> > --- a/target/arm/tcg/translate.c
> > +++ b/target/arm/tcg/translate.c
> > @@ -3508,9 +3508,9 @@ static int t32_expandimm_rot(DisasContext *s, int
> x)
> >  }
> >
> >  /* Return the unrotated immediate from T32ExpandImm.  */
> > -static int t32_expandimm_imm(DisasContext *s, int x)
> > +static uint32_t t32_expandimm_imm(DisasContext *s, uint32_t x)
>
> This function is following the API for decodetree !function
> filters, which return 'int', not 'uint32_t'.
>
> >  {
> > -int imm = extract32(x, 0, 8);
> > +uint32_t imm = extract32(x, 0, 8);
>
> Given what we're doing in the function, it is reasonable
> to make this a uint32_t, though.
>

Changing this to uint32_t is sufficient for me.

I'll send out a v2 of the patch.


> >
> >  switch (extract32(x, 8, 4)) {
> >  case 0: /* XY */
>
> thanks
> -- PMM
>

Thank you for your comprehensive and quick feedback!

--Stephen


Re: [PATCH 1/1] qapi/char.json: minor doc rewording for `hub` device

2025-02-19 Thread Markus Armbruster
Roman Penyaev  writes:

> Refine documentation for the hub device, specify the maximum.
>
> Signed-off-by: Roman Penyaev 
> Cc: Marc-André Lureau 
> Cc: Markus Armbruster 
> Cc: qemu-devel@nongnu.org
> ---
>  qapi/char.json | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/qapi/char.json b/qapi/char.json
> index f02b66c06b3e..dde2f9538f81 100644
> --- a/qapi/char.json
> +++ b/qapi/char.json
> @@ -337,7 +337,7 @@
>  #
>  # Configuration info for hub chardevs.
>  #
> -# @chardevs: List of chardev IDs, which should be added to this hub
> +# @chardevs: IDs to be added to this hub (maximum 4 devices).
>  #
>  # Since: 10.0
>  ##

Reviewed-by: Markus Armbruster 

Thank you, Roman!




[PATCH v2] target/arm: Fix signed integer overflow undefined behavior.

2025-02-19 Thread Stephen Longfield
The problem is internal to t32_expandimm_imm, the imm intermediate
immediate value.

It's extracted via: int imm = extract32(x, 0, 8);, so the value will be
between 0-255

It is then multiplied by one of 1, 0x00010001, 0x01000100, 0x01010101,
or 0x80.

Values between 128-255 multiplied by 0x01000100 or 0x01010101 will cause
the upper bit to get set, which is a signed integer overflow. From
Chapter 6.5, paragraph 5 of the C11 spec:
https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1548.pdf this is
undefined behavior.

Though this is a minor undefined behavior, I'd like to see this fixed,
since the error is showing up when I enable clang's sanitizers while
looking for other issues.

Changes from v1: From peter.mayd...@linaro.org's review, only changing
the internal representation from int to uint32_t, and leaving the API
types the same.

Signed-off-by: Stephen Longfield 
Signed-off-by: Roque Arcudia Hernandez 
---
 target/arm/tcg/translate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index 68ac393415..d8225b77c8 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -3510,7 +3510,7 @@ static int t32_expandimm_rot(DisasContext *s, int x)
 /* Return the unrotated immediate from T32ExpandImm.  */
 static int t32_expandimm_imm(DisasContext *s, int x)
 {
-int imm = extract32(x, 0, 8);
+uint32_t imm = extract32(x, 0, 8);

 switch (extract32(x, 8, 4)) {
 case 0: /* XY */
--
2.48.1.601.g30ceb7b040-goog




[PATCH v8 22/28] vfio-user: set up container access to the proxy

2025-02-19 Thread John Levon
From: Jagannathan Raman 

The user container will shortly need access to the underlying vfio-user
proxy; set this up.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/container.c | 43 +++-
 hw/vfio-user/container.h |  1 +
 hw/vfio/container.c  |  4 +++-
 3 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c
index c079d6f89b..0c487dbb92 100644
--- a/hw/vfio-user/container.c
+++ b/hw/vfio-user/container.c
@@ -55,15 +55,28 @@ static int vfio_user_query_dirty_bitmap(const 
VFIOContainerBase *bcontainer,
 
 static bool vfio_user_setup(VFIOContainerBase *bcontainer, Error **errp)
 {
-error_setg_errno(errp, ENOTSUP, "Not supported");
-return -ENOTSUP;
+VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer,
+bcontainer);
+
+assert(container->proxy->dma_pgsizes != 0);
+bcontainer->pgsizes = container->proxy->dma_pgsizes;
+bcontainer->dma_max_mappings = container->proxy->max_dma;
+
+/* No live migration support yet. */
+bcontainer->dirty_pages_supported = false;
+bcontainer->max_dirty_bitmap_size = container->proxy->max_bitmap;
+bcontainer->dirty_pgsizes = container->proxy->migr_pgsize;
+
+return true;
 }
 
-static VFIOUserContainer *vfio_create_user_container(Error **errp)
+static VFIOUserContainer *vfio_create_user_container(VFIODevice *vbasedev,
+ Error **errp)
 {
 VFIOUserContainer *container;
 
 container = VFIO_IOMMU_USER(object_new(TYPE_VFIO_IOMMU_USER));
+container->proxy = vbasedev->proxy;
 return container;
 }
 
@@ -71,16 +84,18 @@ static VFIOUserContainer *vfio_create_user_container(Error 
**errp)
  * Try to mirror vfio_connect_container() as much as possible.
  */
 static VFIOUserContainer *
-vfio_connect_user_container(AddressSpace *as, Error **errp)
+vfio_connect_user_container(AddressSpace *as, VFIODevice *vbasedev,
+Error **errp)
 {
 VFIOContainerBase *bcontainer;
 VFIOUserContainer *container;
 VFIOAddressSpace *space;
 VFIOIOMMUClass *vioc;
+int ret;
 
 space = vfio_get_address_space(as);
 
-container = vfio_create_user_container(errp);
+container = vfio_create_user_container(vbasedev, errp);
 if (!container) {
 goto put_space_exit;
 }
@@ -91,11 +106,17 @@ vfio_connect_user_container(AddressSpace *as, Error **errp)
 goto free_container_exit;
 }
 
+ret = ram_block_uncoordinated_discard_disable(true);
+if (ret) {
+error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
+goto unregister_container_exit;
+}
+
 vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
 assert(vioc->setup);
 
 if (!vioc->setup(bcontainer, errp)) {
-goto unregister_container_exit;
+goto enable_discards_exit;
 }
 
 vfio_address_space_insert(space, bcontainer);
@@ -120,6 +141,9 @@ listener_release_exit:
 vioc->release(bcontainer);
 }
 
+enable_discards_exit:
+ram_block_uncoordinated_discard_disable(false);
+
 unregister_container_exit:
 vfio_cpr_unregister_container(bcontainer);
 
@@ -136,14 +160,15 @@ static void 
vfio_disconnect_user_container(VFIOUserContainer *container)
 {
 VFIOContainerBase *bcontainer = &container->bcontainer;
 VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
+VFIOAddressSpace *space = bcontainer->space;
+
+ram_block_uncoordinated_discard_disable(false);
 
 memory_listener_unregister(&bcontainer->listener);
 if (vioc->release) {
 vioc->release(bcontainer);
 }
 
-VFIOAddressSpace *space = bcontainer->space;
-
 vfio_cpr_unregister_container(bcontainer);
 object_unref(container);
 
@@ -177,7 +202,7 @@ static bool vfio_user_attach_device(const char *name, 
VFIODevice *vbasedev,
 {
 VFIOUserContainer *container;
 
-container = vfio_connect_user_container(as, errp);
+container = vfio_connect_user_container(as, vbasedev, errp);
 if (container == NULL) {
 error_prepend(errp, "failed to connect proxy");
 return false;
diff --git a/hw/vfio-user/container.h b/hw/vfio-user/container.h
index 24ce13bc2d..8a033d5598 100644
--- a/hw/vfio-user/container.h
+++ b/hw/vfio-user/container.h
@@ -16,6 +16,7 @@
 /* MMU container sub-class for vfio-user. */
 typedef struct VFIOUserContainer {
 VFIOContainerBase bcontainer;
+VFIOUserProxy *proxy;
 } VFIOUserContainer;
 
 OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserContainer, VFIO_IOMMU_USER);
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ddb86edb65..797707d0fd 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -909,7 +909,9 @@ void vfio_put_base_device(VFIODevice *vbasedev)
 QLIST_REMOVE(vbasedev, next);
 vbasedev->group = NUL

[PATCH v8 04/28] vfio: add vfio_attach_device_by_iommu_type()

2025-02-19 Thread John Levon
Allow attachment by explicitly passing a TYPE_VFIO_IOMMU_* string;
vfio-user will use this later.

Signed-off-by: John Levon 
---
 hw/vfio/common.c  | 30 +++---
 include/hw/vfio/vfio-common.h |  3 +++
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index b49aafc40c..eefd735bc6 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1569,25 +1569,20 @@ retry:
 return info;
 }
 
-bool vfio_attach_device(char *name, VFIODevice *vbasedev,
-AddressSpace *as, Error **errp)
+bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name,
+  VFIODevice *vbasedev, AddressSpace *as,
+  Error **errp)
 {
-const VFIOIOMMUClass *ops =
-VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
 HostIOMMUDevice *hiod = NULL;
-
-if (vbasedev->iommufd) {
-ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
-}
-
-assert(ops);
-
+const VFIOIOMMUClass *ops =
+VFIO_IOMMU_CLASS(object_class_by_name(iommu_type));
 
 if (!vbasedev->mdev) {
 hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
 vbasedev->hiod = hiod;
 }
 
+
 if (!ops->attach_device(name, vbasedev, as, errp)) {
 object_unref(hiod);
 vbasedev->hiod = NULL;
@@ -1597,6 +1592,19 @@ bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 return true;
 }
 
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+   AddressSpace *as, Error **errp)
+{
+const char *iommu_type = TYPE_VFIO_IOMMU_LEGACY;
+
+if (vbasedev->iommufd) {
+iommu_type = TYPE_VFIO_IOMMU_IOMMUFD;
+}
+
+return vfio_attach_device_by_iommu_type(iommu_type, name, vbasedev,
+as, errp);
+}
+
 void vfio_detach_device(VFIODevice *vbasedev)
 {
 if (!vbasedev->bcontainer) {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f4f08eb8a6..c40f8de6bc 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -252,6 +252,9 @@ bool vfio_device_is_mdev(VFIODevice *vbasedev);
 bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp);
 bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 AddressSpace *as, Error **errp);
+bool vfio_attach_device_by_iommu_type(const char *iommu_type, char *name,
+  VFIODevice *vbasedev, AddressSpace *as,
+  Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
 VFIODevice *vfio_get_vfio_device(Object *obj);
 
-- 
2.34.1




[PATCH v8 28/28] vfio-user: add coalesced posted writes

2025-02-19 Thread John Levon
From: Jagannathan Raman 

Add new message to send multiple writes to server in a single message.
Prevents the outgoing queue from overflowing when a long latency
operation is followed by a series of posted writes.

Originally-by: John Johnson 
Signed-off-by: Elena Ufimtseva 
Signed-off-by: Jagannathan Raman 
Signed-off-by: John Levon 
---
 hw/vfio-user/common.c | 131 +-
 hw/vfio-user/common.h |   7 ++
 hw/vfio-user/protocol.h   |  21 ++
 hw/vfio-user/trace-events |   1 +
 4 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/hw/vfio-user/common.c b/hw/vfio-user/common.c
index e44c8a2568..809c8e6614 100644
--- a/hw/vfio-user/common.c
+++ b/hw/vfio-user/common.c
@@ -20,6 +20,7 @@
 #include "io/channel-socket.h"
 #include "io/channel-util.h"
 #include "qapi/error.h"
+#include "qobject/qbool.h"
 #include "qobject/qdict.h"
 #include "qobject/qjson.h"
 #include "qobject/qstring.h"
@@ -55,6 +56,7 @@ static void vfio_user_request(void *opaque);
 static int vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg);
 static void vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
  VFIOUserFDs *fds);
+static void vfio_user_flush_multi(VFIOUserProxy *proxy);
 
 static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err)
 {
@@ -459,6 +461,11 @@ static void vfio_user_send(void *opaque)
 }
 qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx,
vfio_user_recv, NULL, NULL, proxy);
+
+/* queue empty - send any pending multi write msgs */
+if (proxy->wr_multi != NULL) {
+vfio_user_flush_multi(proxy);
+}
 }
 }
 
@@ -479,6 +486,7 @@ static int vfio_user_send_one(VFIOUserProxy *proxy)
 }
 
 QTAILQ_REMOVE(&proxy->outgoing, msg, next);
+proxy->num_outgoing--;
 if (msg->type == VFIO_MSG_ASYNC) {
 vfio_user_recycle(proxy, msg);
 } else {
@@ -586,11 +594,18 @@ static int vfio_user_send_queued(VFIOUserProxy *proxy, 
VFIOUserMsg *msg)
 {
 int ret;
 
+/* older coalesced writes go first */
+if (proxy->wr_multi != NULL &&
+((msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REQUEST)) {
+vfio_user_flush_multi(proxy);
+}
+
 /*
  * Unsent outgoing msgs - add to tail
  */
 if (!QTAILQ_EMPTY(&proxy->outgoing)) {
 QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next);
+proxy->num_outgoing++;
 return 0;
 }
 
@@ -604,6 +619,7 @@ static int vfio_user_send_queued(VFIOUserProxy *proxy, 
VFIOUserMsg *msg)
 }
 if (ret == QIO_CHANNEL_ERR_BLOCK) {
 QTAILQ_INSERT_HEAD(&proxy->outgoing, msg, next);
+proxy->num_outgoing = 1;
 qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx,
vfio_user_recv, proxy->ctx,
vfio_user_send, proxy);
@@ -1119,12 +1135,27 @@ static bool check_migr(VFIOUserProxy *proxy, QObject 
*qobj, Error **errp)
 return caps_parse(proxy, qdict, caps_migr, errp);
 }
 
+static bool check_multi(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
+{
+QBool *qb = qobject_to(QBool, qobj);
+
+if (qb == NULL) {
+error_setg(errp, "malformed %s", VFIO_USER_CAP_MULTI);
+return false;
+}
+if (qbool_get_bool(qb)) {
+proxy->flags |= VFIO_PROXY_USE_MULTI;
+}
+return true;
+}
+
 static struct cap_entry caps_cap[] = {
 { VFIO_USER_CAP_MAX_FDS, check_max_fds },
 { VFIO_USER_CAP_MAX_XFER, check_max_xfer },
 { VFIO_USER_CAP_PGSIZES, check_pgsizes },
 { VFIO_USER_CAP_MAP_MAX, check_max_dma },
 { VFIO_USER_CAP_MIGR, check_migr },
+{ VFIO_USER_CAP_MULTI, check_multi },
 { NULL }
 };
 
@@ -1183,6 +1214,7 @@ static GString *caps_json(void)
 qdict_put_int(capdict, VFIO_USER_CAP_MAX_XFER, VFIO_USER_DEF_MAX_XFER);
 qdict_put_int(capdict, VFIO_USER_CAP_PGSIZES, VFIO_USER_DEF_PGSIZE);
 qdict_put_int(capdict, VFIO_USER_CAP_MAP_MAX, VFIO_USER_DEF_MAP_MAX);
+qdict_put_bool(capdict, VFIO_USER_CAP_MULTI, true);
 
 qdict_put_obj(dict, VFIO_USER_CAP, QOBJECT(capdict));
 
@@ -1451,19 +1483,114 @@ static int vfio_user_region_read(VFIOUserProxy *proxy, 
uint8_t index,
 return msgp->count;
 }
 
+static void vfio_user_flush_multi(VFIOUserProxy *proxy)
+{
+VFIOUserMsg *msg;
+VFIOUserWRMulti *wm = proxy->wr_multi;
+int ret;
+
+proxy->wr_multi = NULL;
+
+/* adjust size for actual # of writes */
+wm->hdr.size -= (VFIO_USER_MULTI_MAX - wm->wr_cnt) * sizeof(VFIOUserWROne);
+
+msg = vfio_user_getmsg(proxy, &wm->hdr, NULL);
+msg->id = wm->hdr.id;
+msg->rsize = 0;
+msg->type = VFIO_MSG_ASYNC;
+trace_vfio_user_wrmulti("flush", wm->wr_cnt);
+
+ret = vfio_user_send_queued(proxy, msg);
+if (ret < 0) {
+vfio_user_recycle(proxy, msg);
+}
+}
+
+static void vfio_user_create_multi(VFIOUserProxy *proxy)
+{
+VFIOUserWRMu

  1   2   3   >