[PATCH] migration/dirtyrate: Show sample pages only in page-sampling mode

2022-11-28 Thread Zhenzhong Duan
The value of "Sample Pages" is confusing in modes other than page-sampling.
See below:

(qemu) calc_dirty_rate -b 10 520
(qemu) info dirty_rate
Status: measuring
Start Time: 11646834 (ms)
Sample Pages: 520 (per GB)
Period: 10 (sec)
Mode: dirty-bitmap
Dirty rate: (not ready)

(qemu) info dirty_rate
Status: measured
Start Time: 11646834 (ms)
Sample Pages: 0 (per GB)
Period: 10 (sec)
Mode: dirty-bitmap
Dirty rate: 2 (MB/s)

Since the value is useless in dirty-ring and dirty-bitmap modes, show it
only in page-sampling mode.

Signed-off-by: Zhenzhong Duan 
---
 migration/dirtyrate.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index d6f1e01a7001..5041c2558c62 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -720,8 +720,8 @@ void qmp_calc_dirty_rate(int64_t calc_time,
 mode =  DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
 }
 
-if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
-error_setg(errp, "either sample-pages or dirty-ring can be 
specified.");
+if (has_sample_pages && mode != DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
+error_setg(errp, "sample-pages is used only in page-sampling mode");
 return;
 }
 
@@ -791,8 +791,10 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict)
DirtyRateStatus_str(info->status));
 monitor_printf(mon, "Start Time: %"PRIi64" (ms)\n",
info->start_time);
-monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
-   info->sample_pages);
+if (info->mode == DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
+monitor_printf(mon, "Sample Pages: %"PRIu64" (per GB)\n",
+   info->sample_pages);
+}
 monitor_printf(mon, "Period: %"PRIi64" (sec)\n",
info->calc_time);
 monitor_printf(mon, "Mode: %s\n",
-- 
2.25.1




[PATCH v2] softmmu/physmem: Fix input parameters for flatview_access_allowed()

2022-07-22 Thread Zhenzhong Duan
The comment of flatview_access_allowed() suggests passing an address
within that memory region, but this isn't true at some call sites.

This makes the qemu log in flatview_access_allowed() confusing, and is a
potential risk if the input parameters are checked in the future.

Signed-off-by: Zhenzhong Duan 
Reviewed-by: Peter Xu 
Reviewed-by: David Hildenbrand 
---
v2: Fix typo and removed Fixed-by per David

 softmmu/physmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index fb16be57a6c6..214cb04c8fc3 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -2850,7 +2850,7 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr 
addr, MemTxAttrs attrs,
 
 l = len;
 mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
-if (!flatview_access_allowed(mr, attrs, addr, len)) {
+if (!flatview_access_allowed(mr, attrs, addr1, l)) {
 return MEMTX_ACCESS_ERROR;
 }
 return flatview_write_continue(fv, addr, attrs, buf, len,
@@ -2917,7 +2917,7 @@ static MemTxResult flatview_read(FlatView *fv, hwaddr 
addr,
 
 l = len;
 mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
-if (!flatview_access_allowed(mr, attrs, addr, len)) {
+if (!flatview_access_allowed(mr, attrs, addr1, l)) {
 return MEMTX_ACCESS_ERROR;
 }
 return flatview_read_continue(fv, addr, attrs, buf, len,
-- 
2.25.1




[PATCH v1 02/11] backends/iommufd: Introduce IOMMUFDDevice

2024-02-27 Thread Zhenzhong Duan
IOMMUFDDevice represents a device in iommufd and can be used as
a communication interface between devices (i.e., VFIO, VDPA) and
vIOMMU.

Currently it includes only public iommufd handle and device id
which could be used by vIOMMU to get hw IOMMU information.

There will also be some elements in private field in future,
i.e., capability bits for dirty tracking; when nested translation
is supported in future, vIOMMU is going to have more iommufd related
operations like allocate hwpt for a device, attach/detach hwpt, etc.
So IOMMUFDDevice will be further extended with those needs.

IOMMUFDDevice is intentionally not a QOM object because we don't want
it to be visible from the user interface.

Introduce a helper iommufd_device_init to initialize IOMMUFDDevice.

Originally-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 15 +++
 backends/iommufd.c   |  9 +
 2 files changed, 24 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..d509ff88ef 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +34,18 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+
+/* Abstraction of host IOMMUFD device */
+typedef struct IOMMUFDDevice {
+/* private: */
+HostIOMMUDevice base;
+
+/* public: */
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+} IOMMUFDDevice;
+
+void iommufd_device_init(IOMMUFDDevice *idev,
+ IOMMUFDBackend *iommufd, int devid);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 1ef683c7b0..6d280e4aea 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -231,3 +231,12 @@ static void register_types(void)
 }
 
 type_init(register_types);
+
+void iommufd_device_init(IOMMUFDDevice *idev,
+ IOMMUFDBackend *iommufd, int devid)
+{
+host_iommu_base_device_init(&idev->base, HID_IOMMUFD,
+sizeof(IOMMUFDDevice));
+idev->iommufd = iommufd;
+idev->devid = devid;
+}
-- 
2.34.1




[PATCH v1 01/11] Introduce a common abstract struct HostIOMMUDevice

2024-02-27 Thread Zhenzhong Duan
HostIOMMUDevice will be inherited by two sub classes,
legacy and iommufd currently.

Introduce a helper function host_iommu_base_device_init to initialize it.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 22 ++
 1 file changed, 22 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..fe80ab25fb
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,22 @@
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+typedef enum HostIOMMUDevice_Type {
+HID_LEGACY,
+HID_IOMMUFD,
+HID_MAX,
+} HostIOMMUDevice_Type;
+
+typedef struct HostIOMMUDevice {
+HostIOMMUDevice_Type type;
+size_t size;
+} HostIOMMUDevice;
+
+static inline void host_iommu_base_device_init(HostIOMMUDevice *dev,
+   HostIOMMUDevice_Type type,
+   size_t size)
+{
+dev->type = type;
+dev->size = size;
+}
+#endif
-- 
2.34.1




[PATCH v1 06/11] vfio/container: Implement host_iommu_device_create callback in legacy mode

2024-02-27 Thread Zhenzhong Duan
This callback will be used to initialize base and public elements in
IOMMULegacyDevice.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index bd25b9fbad..2e8ff32284 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1120,6 +1120,14 @@ out_single:
 return ret;
 }
 
+static void vfio_legacy_host_iommu_device_create(VFIODevice *vbasedev)
+{
+vbasedev->base_hdev = g_malloc0(sizeof(IOMMULegacyDevice));
+
+host_iommu_base_device_init(vbasedev->base_hdev, HID_LEGACY,
+sizeof(IOMMULegacyDevice));
+}
+
 static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
@@ -1132,6 +1140,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
 vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
+vioc->host_iommu_device_create = vfio_legacy_host_iommu_device_create;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v1 00/11] Add a host IOMMU device abstraction

2024-02-27 Thread Zhenzhong Duan
Hi,

Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
is further split into a host IOMMU device abstraction part and a vIOMMU
check/sync part. This series implements the 1st part.

This split also facilitates the dirty tracking series [2] and virtio-iommu
series [3] depending on the 1st part.

PATCH1-3: Introduce HostIOMMUDevice and two subclasses
PATCH4: Define HostIOMMUDevice handle in VFIODevice
PATCH5-8: Introduce host_iommu_device_create callback to allocate and
initialize HostIOMMUDevice
PATCH9-10: Introduce set/unset_iommu_device to pass HostIOMMUDevice to vIOMMU
PATCH11: a helper to get host IOMMU info

Because the community's suggestions have become clear, I'd like to remove
the RFC tag from this version.

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part1_v1

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com/
[2] 
https://lore.kernel.org/qemu-devel/20240212135643.5858-1-joao.m.mart...@oracle.com/
[3] 
https://lore.kernel.org/qemu-devel/20240117080414.316890-1-eric.au...@redhat.com/

Thanks
Zhenzhong

Changelog:
v1:
- use HostIOMMUDevice handle instead of union in VFIODevice (Eric)
- change host_iommu_device_init to host_iommu_device_create
- allocate HostIOMMUDevice in host_iommu_device_create callback
  and set the VFIODevice base_hdev handle (Eric)
- refine pci_device_set/unset_iommu_device doc (Eric)
- use HostIOMMUDevice handle instead of union in VTDHostIOMMUDevice (Eric)

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (1):
  hw/pci: Introduce pci_device_set/unset_iommu_device()

Zhenzhong Duan (10):
  Introduce a common abstract struct HostIOMMUDevice
  backends/iommufd: Introduce IOMMUFDDevice
  vfio: Introduce IOMMULegacyDevice
  vfio: Add HostIOMMUDevice handle into VFIODevice
  vfio: Introduce host_iommu_device_create callback
  vfio/container: Implement host_iommu_device_create callback in legacy
mode
  vfio/iommufd: Implement host_iommu_device_create callback in iommufd
mode
  vfio/pci: Allocate and initialize HostIOMMUDevice after attachment
  vfio: Pass HostIOMMUDevice to vIOMMU
  backends/iommufd: Introduce helper function iommufd_device_get_info()

 include/hw/pci/pci.h  | 38 +++-
 include/hw/vfio/vfio-common.h |  8 
 include/hw/vfio/vfio-container-base.h |  1 +
 include/sysemu/host_iommu_device.h| 22 ++
 include/sysemu/iommufd.h  | 19 
 backends/iommufd.c| 32 +-
 hw/pci/pci.c  | 62 +--
 hw/vfio/common.c  |  8 
 hw/vfio/container.c   |  9 
 hw/vfio/iommufd.c | 10 +
 hw/vfio/pci.c | 24 ---
 11 files changed, 223 insertions(+), 10 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h

-- 
2.34.1




[PATCH v1 08/11] vfio/pci: Allocate and initialize HostIOMMUDevice after attachment

2024-02-27 Thread Zhenzhong Duan
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 4fa387f043..6cc7de5d10 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3006,6 +3006,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 goto error;
 }
 
+/* Allocate and initialize HostIOMMUDevice after attachment succeed */
+host_iommu_device_create(vbasedev);
+
 vfio_populate_device(vdev, &err);
 if (err) {
 error_propagate(errp, err);
@@ -3244,6 +3247,7 @@ static void vfio_instance_finalize(Object *obj)
 
 vfio_display_finalize(vdev);
 vfio_bars_finalize(vdev);
+g_free(vdev->vbasedev.base_hdev);
 g_free(vdev->emulated_config_bits);
 g_free(vdev->rom);
 /*
-- 
2.34.1




[PATCH v1 03/11] vfio: Introduce IOMMULegacyDevice

2024-02-27 Thread Zhenzhong Duan
Similar to IOMMUFDDevice, IOMMULegacyDevice represents a device in
legacy mode and can be used as a communication interface between
devices (i.e., VFIO, VDPA) and vIOMMU.

Currently it includes nothing legacy specific, but could be extended
with any wanted info of legacy mode when necessary.

IOMMULegacyDevice is intentionally not a QOM object because we don't want
it to be visible from the user interface.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9b7ef7d02b..8bfb9cbe94 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -97,6 +98,11 @@ typedef struct VFIOIOMMUFDContainer {
 uint32_t ioas_id;
 } VFIOIOMMUFDContainer;
 
+/* Abstraction of host IOMMU legacy device */
+typedef struct IOMMULegacyDevice {
+HostIOMMUDevice base;
+} IOMMULegacyDevice;
+
 typedef struct VFIODeviceOps VFIODeviceOps;
 
 typedef struct VFIODevice {
-- 
2.34.1




[PATCH v1 10/11] vfio: Pass HostIOMMUDevice to vIOMMU

2024-02-27 Thread Zhenzhong Duan
Support both iommufd and legacy backend.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6cc7de5d10..ed9f386fde 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3112,11 +3112,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-ret = vfio_add_capabilities(vdev, errp);
+ret = pci_device_set_iommu_device(pdev, vbasedev->base_hdev, errp);
 if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+ret = vfio_add_capabilities(vdev, errp);
+if (ret) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3133,7 +3139,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3142,13 +3148,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
 g_free(opregion);
 if (ret) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3234,6 +3240,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3263,6 +3271,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3277,7 +3286,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1




[PATCH v1 09/11] hw/pci: Introduce pci_device_set/unset_iommu_device()

2024-02-27 Thread Zhenzhong Duan
From: Yi Liu 

This adds pci_device_set/unset_iommu_device() to set/unset
HostIOMMUDevice for a given PCIe device. Caller of set
should fail if set operation fails.

Extract out pci_device_get_iommu_bus_devfn() to facilitate
implementation of pci_device_set/unset_iommu_device().

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 38 ++-
 hw/pci/pci.c | 62 +---
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index fa6313aabc..8fe6f746d7 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -384,10 +385,45 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * Return true if HostIOMMUDevice is attached, or else return false
+ * with errp set.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *base_dev,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 76080af580..8078307963 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2672,11 +2672,14 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **aliased_bus,
+   PCIBus **piommu_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2717,13 +2720,66 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
 return &address_space_memory;
 }
 
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *base_dev,
+Error **errp)
+{
+PCIBus *iommu_bus;
+
+pci_device_get_iommu_bus_devfn(dev, NULL, &iommu_bus, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) {
+return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev),
+  iommu_bus->iommu_opaque,
+  

[PATCH v1 11/11] backends/iommufd: Introduce helper function iommufd_device_get_info()

2024-02-27 Thread Zhenzhong Duan
Introduce a helper function iommufd_device_get_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  4 
 backends/iommufd.c   | 23 ++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index d509ff88ef..518b97bfed 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include 
 #include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
@@ -48,4 +49,7 @@ typedef struct IOMMUFDDevice {
 
 void iommufd_device_init(IOMMUFDDevice *idev,
  IOMMUFDBackend *iommufd, int devid);
+int iommufd_device_get_info(IOMMUFDDevice *idev,
+enum iommu_hw_info_type *type,
+uint32_t len, void *data, Error **errp);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 6d280e4aea..69f3f75ea5 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -20,7 +20,6 @@
 #include "monitor/monitor.h"
 #include "trace.h"
 #include 
-#include 
 
 static void iommufd_backend_init(Object *obj)
 {
@@ -240,3 +239,25 @@ void iommufd_device_init(IOMMUFDDevice *idev,
 idev->iommufd = iommufd;
 idev->devid = devid;
 }
+
+int iommufd_device_get_info(IOMMUFDDevice *idev,
+enum iommu_hw_info_type *type,
+uint32_t len, void *data, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = idev->devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(idev->iommufd->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+*type = info.out_data_type;
+}
+
+return ret;
+}
-- 
2.34.1




[PATCH v1 04/11] vfio: Add HostIOMMUDevice handle into VFIODevice

2024-02-27 Thread Zhenzhong Duan
This handle points to either an IOMMULegacyDevice or an IOMMUFDDevice
variant, never both.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8bfb9cbe94..b6676c9f79 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -130,6 +130,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *base_hdev;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
-- 
2.34.1




[PATCH v1 05/11] vfio: Introduce host_iommu_device_create callback

2024-02-27 Thread Zhenzhong Duan
Introduce host_iommu_device_create callback and a wrapper for it.

This callback is used to allocate a host iommu device instance and
initialize it based on type.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 include/hw/vfio/vfio-container-base.h | 1 +
 hw/vfio/common.c  | 8 
 3 files changed, 10 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b6676c9f79..9fefea4b89 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -208,6 +208,7 @@ struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
+void host_iommu_device_create(VFIODevice *vbasedev);
 
 int vfio_kvm_device_add_fd(int fd, Error **errp);
 int vfio_kvm_device_del_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index b2813b0c11..dc003f6eb2 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -120,6 +120,7 @@ struct VFIOIOMMUClass {
 int (*attach_device)(const char *name, VFIODevice *vbasedev,
  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
+void (*host_iommu_device_create)(VFIODevice *vbasedev);
 /* migration feature */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
bool start);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 059bfdc07a..41e9031c59 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1521,3 +1521,11 @@ void vfio_detach_device(VFIODevice *vbasedev)
 }
 vbasedev->bcontainer->ops->detach_device(vbasedev);
 }
+
+void host_iommu_device_create(VFIODevice *vbasedev)
+{
+const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops;
+
+assert(ops->host_iommu_device_create);
+ops->host_iommu_device_create(vbasedev);
+}
-- 
2.34.1




[PATCH v1 07/11] vfio/iommufd: Implement host_iommu_device_create callback in iommufd mode

2024-02-27 Thread Zhenzhong Duan
This callback will be used to initialize base and public elements
in IOMMUFDDevice.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 9bfddc1360..1c2f5da0d0 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -619,6 +619,15 @@ out_single:
 return ret;
 }
 
+static void vfio_cdev_host_iommu_device_create(VFIODevice *vbasedev)
+{
+IOMMUFDDevice *idev = g_malloc0(sizeof(IOMMUFDDevice));
+
+vbasedev->base_hdev = &idev->base;
+
+iommufd_device_init(idev, vbasedev->iommufd, vbasedev->devid);
+}
+
 static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
@@ -628,6 +637,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->attach_device = iommufd_cdev_attach;
 vioc->detach_device = iommufd_cdev_detach;
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
+vioc->host_iommu_device_create = vfio_cdev_host_iommu_device_create;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v1 5/6] intel_iommu: Use mgaw instead of s->aw_bits

2024-02-28 Thread Zhenzhong Duan
Because vIOMMU mgaw can be updated based on host IOMMU mgaw, s->aw_bits
doesn't necessarily represent the final mgaw now, but the mgaw field in
s->cap does.

Replace references to s->aw_bits with a macro S_AW_BITS that fetches mgaw
from s->cap. There are two exceptions to this: the aw_bits value sanity
check and the s->cap initialization.

ACPI DMAR table is also updated with right mgaw value.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/acpi-build.c  |  3 ++-
 hw/i386/intel_iommu.c | 44 ++-
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index edc979379c..6467157686 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2159,7 +2159,8 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, 
const char *oem_id,
 
 acpi_table_begin(&table, table_data);
 /* Host Address Width */
-build_append_int_noprefix(table_data, intel_iommu->aw_bits - 1, 1);
+build_append_int_noprefix(table_data,
+  VTD_MGAW_FROM_CAP(intel_iommu->cap), 1);
 build_append_int_noprefix(table_data, dmar_flags, 1); /* Flags */
 g_array_append_vals(table_data, rsvd10, sizeof(rsvd10)); /* Reserved */
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2a55268538..e474284e43 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -42,6 +42,8 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 
+#define S_AW_BITS (VTD_MGAW_FROM_CAP(s->cap) + 1)
+
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
@@ -1410,13 +1412,13 @@ static int 
vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
 {
 /* Legacy Mode reserved bits check */
 if (!s->root_scalable &&
-(re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits
+(re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(S_AW_BITS
 goto rsvd_err;
 
 /* Scalable Mode reserved bits check */
 if (s->root_scalable &&
-((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
- (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits
+((re->lo & VTD_ROOT_ENTRY_RSVD(S_AW_BITS)) ||
+ (re->hi & VTD_ROOT_ENTRY_RSVD(S_AW_BITS
 goto rsvd_err;
 
 return 0;
@@ -1433,7 +1435,7 @@ static inline int 
vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
 {
 if (!s->root_scalable &&
 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
- ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
+ ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(S_AW_BITS))) {
 error_report_once("%s: invalid context entry: hi=%"PRIx64
   ", lo=%"PRIx64" (reserved nonzero)",
   __func__, ce->hi, ce->lo);
@@ -1441,7 +1443,7 @@ static inline int 
vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
 }
 
 if (s->root_scalable &&
-(ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
+(ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(S_AW_BITS) ||
  ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
  ce->val[2] ||
  ce->val[3])) {
@@ -1572,7 +1574,7 @@ static int 
vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
 .hook_fn = vtd_sync_shadow_page_hook,
 .private = (void *)&vtd_as->iommu,
 .notify_unmap = true,
-.aw = s->aw_bits,
+.aw = S_AW_BITS,
 .as = vtd_as,
 .domain_id = vtd_get_domain_id(s, ce, vtd_as->pasid),
 };
@@ -1991,7 +1993,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 }
 
 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
-   &reads, &writes, s->aw_bits, pasid);
+   &reads, &writes, S_AW_BITS, pasid);
 if (ret_fr) {
 vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
  addr, is_write, pasid != PCI_NO_PASID, pasid);
@@ -2005,7 +2007,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 out:
 vtd_iommu_unlock(s);
 entry->iova = addr & page_mask;
-entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
+entry->translated_addr = vtd_get_slpte_addr(slpte, S_AW_BITS) & page_mask;
 entry->addr_mask = ~page_mask;
 entry->perm = access_flags;
 return true;
@@ -2022,7 +2024,7 @@ error:
 static void vtd_root_table_setup(IntelIOMMUState *s)
 {
 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
-s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
+s->root &= VTD_RTADDR_ADDR_MASK(S_AW_BITS);
 
 vtd_update_scalab

[PATCH v1 6/6] intel_iommu: Block migration if cap is updated

2024-02-28 Thread Zhenzhong Duan
When there is a VFIO device and the vIOMMU cap/ecap is updated based on
the host IOMMU cap/ecap, migration should be blocked.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e474284e43..9ca47dbf9a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -40,6 +40,7 @@
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
+#include "migration/blocker.h"
 #include "trace.h"
 
 #define S_AW_BITS (VTD_MGAW_FROM_CAP(s->cap) + 1)
@@ -3830,6 +3831,8 @@ static int vtd_check_legacy_hdev(IntelIOMMUState *s,
 return 0;
 }
 
+static Error *vtd_mig_blocker;
+
 static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
   IOMMUFDDevice *idev,
   Error **errp)
@@ -3861,8 +3864,17 @@ static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
 tmp_cap |= VTD_CAP_MGAW(host_mgaw + 1);
 }
 
-s->cap = tmp_cap;
-return 0;
+if (s->cap != tmp_cap) {
+if (vtd_mig_blocker == NULL) {
+error_setg(&vtd_mig_blocker,
+   "cap/ecap update from host IOMMU block migration");
+ret = migrate_add_blocker(&vtd_mig_blocker, errp);
+}
+if (!ret) {
+s->cap = tmp_cap;
+}
+}
+return ret;
 }
 
 static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
-- 
2.34.1




[PATCH v1 0/6] Check and sync host IOMMU cap/ecap with vIOMMU

2024-02-28 Thread Zhenzhong Duan
Hi,

Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
is further split into a host IOMMU device abstraction part [2] and a vIOMMU
check/sync part. This series implements the second part.

This enables vIOMMU to get host IOMMU cap/ecap information by implementing
a new set/unset_iommu_device interface, then vIOMMU could check or sync
with vIOMMU's own cap/ecap config.

It works by having the device side, i.e. VFIO, register either an
IOMMULegacyDevice or an IOMMUFDDevice with the vIOMMU, which includes the data
necessary to achieve that. Currently only VFIO devices are supported, but it
could also be used for other devices, e.g., VDPA.

For a coldplugged device, we can get its host IOMMU cap/ecap during QEMU init,
then check it and sync it into the vIOMMU cap/ecap.
For a hotplugged device, the vIOMMU cap/ecap is frozen, so we can only check
against the vIOMMU cap/ecap and are not allowed to update it. If the check
fails, the hotplug fails.

This is also a prerequisite for incoming iommufd nesting series:
'intel_iommu: Enable stage-1 translation'.

I didn't implement cap/ecap sync for the legacy VFIO backend; I would like to
see what Eric wants to put in IOMMULegacyDevice for virtio-iommu and whether I
can utilize some of it.

Because the design has become clearer following the community's suggestions,
I'd like to drop the RFC tag from this version.

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part2_v1

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com
[2] https://lists.gnu.org/archive/html/qemu-devel/2024-02/msg06314.html

Thanks
Zhenzhong

Changelog:
v1:
- convert HostIOMMUDevice to sub object pointer in vtd_check_hdev

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (2):
  intel_iommu: Add set/unset_iommu_device callback
  intel_iommu: Add a framework to check and sync host IOMMU cap/ecap

Zhenzhong Duan (4):
  intel_iommu: Extract out vtd_cap_init to initialize cap/ecap
  intel_iommu: Implement check and sync mechanism in iommufd mode
  intel_iommu: Use mgaw instead of s->aw_bits
  intel_iommu: Block migration if cap is updated

 hw/i386/intel_iommu_internal.h |   9 ++
 include/hw/i386/intel_iommu.h  |   4 +
 hw/i386/acpi-build.c   |   3 +-
 hw/i386/intel_iommu.c  | 287 ++---
 4 files changed, 245 insertions(+), 58 deletions(-)

-- 
2.34.1




[PATCH v1 1/6] intel_iommu: Add set/unset_iommu_device callback

2024-02-28 Thread Zhenzhong Duan
From: Yi Liu 

This adds set/unset_iommu_device() implementation in Intel vIOMMU.
In set call, a pointer to host IOMMU device info is stored in hash
table indexed by PCI BDF.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  8 
 include/hw/i386/intel_iommu.h  |  2 +
 hw/i386/intel_iommu.c  | 74 ++
 3 files changed, 84 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..becafd03c1 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -537,4 +537,12 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+HostIOMMUDevice *dev;
+QLIST_ENTRY(VTDHostIOMMUDevice) next;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..bbc7b96add 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -292,6 +292,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1a07faddb4..9b62441439 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -237,6 +237,13 @@ static gboolean vtd_as_equal(gconstpointer v1, 
gconstpointer v2)
(key1->pasid == key2->pasid);
 }
 
+static gboolean vtd_as_idev_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_as_key *key1 = v1;
+const struct vtd_as_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
 /*
  * Note that we use pointer to PCIBus as the key, so hashing/shifting
  * based on the pointer value is intended. Note that we deal with
@@ -3812,6 +3819,68 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *base_dev, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(base_dev);
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+
+if (vtd_hdev) {
+error_setg(errp, "IOMMUFD device already exist");
+vtd_iommu_unlock(s);
+return -EEXIST;
+}
+
+vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hdev->bus = bus;
+vtd_hdev->devfn = (uint8_t)devfn;
+vtd_hdev->iommu_state = s;
+vtd_hdev->dev = base_dev;
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hdev);
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hdev) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4107,6 +4176,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4230,6 +4301,9 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
   g_free, g_free);
+s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_as_hash,
+  vtd_as_idev_equal,
+  g_free, g_free);
 vtd_init(s);
 pci_setup_iommu(bus, &vtd_iommu_ops, dev);
 /* Pseudo address space under root PCI bus. */
-- 
2.34.1




[PATCH v1 3/6] intel_iommu: Add a framework to check and sync host IOMMU cap/ecap

2024-02-28 Thread Zhenzhong Duan
From: Yi Liu 

Add a framework to check and synchronize host IOMMU cap/ecap with
vIOMMU cap/ecap.

The sequence will be:

vtd_cap_init() initializes iommu->cap/ecap.
vtd_check_hdev() updates iommu->cap/ecap based on host cap/ecap.
iommu->cap_frozen is set when machine creation is done; iommu->cap/ecap then
become read-only.

Implementation details for different backends will be in following patches.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 50 ++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index bbc7b96add..c71a133820 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -283,6 +283,7 @@ struct IntelIOMMUState {
 
 uint64_t cap;   /* The value of capability reg */
 uint64_t ecap;  /* The value of extended capability reg */
+bool cap_frozen;/* cap/ecap become read-only after frozen 
*/
 
 uint32_t context_cache_gen; /* Should be in [1,MAX] */
 GHashTable *iotlb;  /* IOTLB */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ffa1ad6429..a9f9dfd6a7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,8 @@
 #include "sysemu/kvm.h"
 #include "sysemu/dma.h"
 #include "sysemu/sysemu.h"
+#include "hw/vfio/vfio-common.h"
+#include "sysemu/iommufd.h"
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
@@ -3819,6 +3821,38 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_check_legacy_hdev(IntelIOMMUState *s,
+ IOMMULegacyDevice *ldev,
+ Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
+  IOMMUFDDevice *idev,
+  Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+  Error **errp)
+{
+HostIOMMUDevice *base_dev = vtd_hdev->dev;
+IOMMUFDDevice *idev;
+
+if (base_dev->type == HID_LEGACY) {
+IOMMULegacyDevice *ldev = container_of(base_dev,
+   IOMMULegacyDevice, base);
+
+return vtd_check_legacy_hdev(s, ldev, errp);
+}
+
+idev = container_of(base_dev, IOMMUFDDevice, base);
+
+return vtd_check_iommufd_hdev(s, idev, errp);
+}
+
 static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
 HostIOMMUDevice *base_dev, Error **errp)
 {
@@ -3829,6 +3863,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 .devfn = devfn,
 };
 struct vtd_as_key *new_key;
+int ret;
 
 assert(base_dev);
 
@@ -3848,6 +3883,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 vtd_hdev->iommu_state = s;
 vtd_hdev->dev = base_dev;
 
+ret = vtd_check_hdev(s, vtd_hdev, errp);
+if (ret) {
+g_free(vtd_hdev);
+vtd_iommu_unlock(s);
+return ret;
+}
+
 new_key = g_malloc(sizeof(*new_key));
 new_key->bus = bus;
 new_key->devfn = devfn;
@@ -4083,7 +4125,9 @@ static void vtd_init(IntelIOMMUState *s)
 s->iq_dw = false;
 s->next_frcd_reg = 0;
 
-vtd_cap_init(s);
+if (!s->cap_frozen) {
+vtd_cap_init(s);
+}
 
 /*
  * Rsvd field masks for spte
@@ -4254,6 +4298,10 @@ static int vtd_machine_done_notify_one(Object *child, 
void *unused)
 
 static void vtd_machine_done_hook(Notifier *notifier, void *unused)
 {
+IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
+
+iommu->cap_frozen = true;
+
 object_child_foreach_recursive(object_get_root(),
vtd_machine_done_notify_one, NULL);
 }
-- 
2.34.1




[PATCH v1 2/6] intel_iommu: Extract out vtd_cap_init to initialize cap/ecap

2024-02-28 Thread Zhenzhong Duan
This is a prerequisite for host cap/ecap sync.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9b62441439..ffa1ad6429 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4003,30 +4003,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -4043,27 +4023,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4096,6 +4055,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1




[PATCH v1 4/6] intel_iommu: Implement check and sync mechanism in iommufd mode

2024-02-28 Thread Zhenzhong Duan
We use cap_frozen to mark cap/ecap as read/writable or read-only.
At the init stage, we allow updating cap/ecap based on the host IOMMU
cap/ecap, but once machine creation is done, cap_frozen is set and
we only allow checking cap/ecap for compatibility.

Currently only stage-2 translation is supported, which is backed by a
shadow page table on the host side. So we don't need an exact match of
each bit of cap/ecap between the vIOMMU and the host. However, we can still
at least ensure compatibility of the host's and the vIOMMU's address widths,
i.e., vIOMMU's mgaw <= host IOMMU mgaw, a check which was missing before.

When stage-1 translation is supported in future, a.k.a. scalable
modern mode, this mechanism will be further extended to check more
bits.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  1 +
 include/hw/i386/intel_iommu.h  |  1 +
 hw/i386/intel_iommu.c  | 28 
 3 files changed, 30 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index becafd03c1..72a5cb0859 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -204,6 +204,7 @@
 #define VTD_DOMAIN_ID_MASK  ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
 #define VTD_CAP_ND  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
 #define VTD_ADDRESS_SIZE(aw)(1ULL << (aw))
+#define VTD_CAP_MGAW_MASK   (0x3fULL << 16)
 #define VTD_CAP_MGAW(aw)aw) - 1) & 0x3fULL) << 16)
 #define VTD_MAMV18ULL
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index c71a133820..a0b530ebc6 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  (((cap >> 16) & 0x3fULL) + 1)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a9f9dfd6a7..2a55268538 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3832,6 +3832,34 @@ static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
   IOMMUFDDevice *idev,
   Error **errp)
 {
+struct iommu_hw_info_vtd vtd;
+enum iommu_hw_info_type type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
+long host_mgaw, viommu_mgaw = VTD_MGAW_FROM_CAP(s->cap);
+uint64_t tmp_cap = s->cap;
+int ret;
+
+ret = iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd, errp);
+if (ret) {
+return ret;
+}
+
+if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) {
+error_setg(errp, "IOMMU hardware is not compatible");
+return -EINVAL;
+}
+
+host_mgaw = VTD_MGAW_FROM_CAP(vtd.cap_reg);
+if (viommu_mgaw > host_mgaw) {
+if (s->cap_frozen) {
+error_setg(errp, "mgaw %" PRId64 " > host mgaw %" PRId64,
+   viommu_mgaw, host_mgaw);
+return -EINVAL;
+}
+tmp_cap &= ~VTD_CAP_MGAW_MASK;
+tmp_cap |= VTD_CAP_MGAW(host_mgaw + 1);
+}
+
+s->cap = tmp_cap;
 return 0;
 }
 
-- 
2.34.1




[PATCH 0/2] Optimize user_creatable_add_type error path

2024-02-28 Thread Zhenzhong Duan
Hi,

This is a simple optimization to user_creatable_add_type error path.
Removed local_err and its check in err path, use *errp instead.

Thanks
Zhenzhong

Zhenzhong Duan (2):
  qom/object_interfaces: Remove unnecessary local error check
  qom/object_interfaces: Remove local_err in user_creatable_add_type

 qom/object_interfaces.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

-- 
2.34.1




[PATCH 2/2] qom/object_interfaces: Remove local_err in user_creatable_add_type

2024-02-28 Thread Zhenzhong Duan
In user_creatable_add_type, there is mixed usage of ERRP_GUARD and
local_err. This prevents error_abort from taking effect in those callee
functions to which local_err is passed.

Now that we already have ERRP_GUARD, remove local_err and use *errp
instead.

Signed-off-by: Zhenzhong Duan 
---
 qom/object_interfaces.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index 255a7bf659..165cd433e7 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -81,7 +81,6 @@ Object *user_creatable_add_type(const char *type, const char 
*id,
 ERRP_GUARD();
 Object *obj;
 ObjectClass *klass;
-Error *local_err = NULL;
 
 if (id != NULL && !id_wellformed(id)) {
 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id", "an identifier");
@@ -109,20 +108,20 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 
 assert(qdict);
 obj = object_new(type);
-object_set_properties_from_qdict(obj, qdict, v, &local_err);
-if (local_err) {
+object_set_properties_from_qdict(obj, qdict, v, errp);
+if (*errp) {
 goto out;
 }
 
 if (id != NULL) {
 object_property_try_add_child(object_get_objects_root(),
-  id, obj, &local_err);
-if (local_err) {
+  id, obj, errp);
+if (*errp) {
 goto out;
 }
 }
 
-if (!user_creatable_complete(USER_CREATABLE(obj), &local_err)) {
+if (!user_creatable_complete(USER_CREATABLE(obj), errp)) {
 if (id != NULL) {
 object_property_del(object_get_objects_root(), id);
 }
@@ -130,7 +129,6 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 }
 return obj;
 out:
-error_propagate(errp, local_err);
 object_unref(obj);
 return NULL;
 }
-- 
2.34.1




[PATCH 1/2] qom/object_interfaces: Remove unnecessary local_err check

2024-02-28 Thread Zhenzhong Duan
In the error return path, local_err is always set, no need to check it.

Signed-off-by: Zhenzhong Duan 
---
 qom/object_interfaces.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index e0833c8bfe..255a7bf659 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -128,13 +128,11 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 }
 goto out;
 }
-out:
-if (local_err) {
-error_propagate(errp, local_err);
-object_unref(obj);
-return NULL;
-}
 return obj;
+out:
+error_propagate(errp, local_err);
+object_unref(obj);
+return NULL;
 }
 
 void user_creatable_add_qapi(ObjectOptions *options, Error **errp)
-- 
2.34.1




[PATCH rfcv2 04/18] vfio: Add host iommu device instance into VFIODevice

2024-01-31 Thread Zhenzhong Duan
Embed either an IOMMULegacyDevice or an IOMMUFDDevice in VFIODevice, never
both.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8bfb9cbe94..1bbad003ee 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
 #include "sysemu/host_iommu_device.h"
+#include "sysemu/iommufd.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -132,8 +133,18 @@ typedef struct VFIODevice {
 bool dirty_tracking;
 int devid;
 IOMMUFDBackend *iommufd;
+union {
+HostIOMMUDevice base_hdev;
+IOMMULegacyDevice legacy_dev;
+IOMMUFDDevice iommufd_dev;
+};
 } VFIODevice;
 
+QEMU_BUILD_BUG_ON(offsetof(VFIODevice, legacy_dev.base) !=
+  offsetof(VFIODevice, base_hdev));
+QEMU_BUILD_BUG_ON(offsetof(VFIODevice, iommufd_dev.base) !=
+  offsetof(VFIODevice, base_hdev));
+
 struct VFIODeviceOps {
 void (*vfio_compute_needs_reset)(VFIODevice *vdev);
 int (*vfio_hot_reset_multi)(VFIODevice *vdev);
-- 
2.34.1




[PATCH rfcv2 02/18] backends/iommufd: Introduce IOMMUFDDevice

2024-01-31 Thread Zhenzhong Duan
IOMMUFDDevice represents a device in iommufd and can be used as
a communication interface between devices (i.e., VFIO, VDPA) and
vIOMMU.

Currently it includes only public iommufd handle and device id
which could be used by vIOMMU to get hw IOMMU information.

In the future there will also be some elements in the private section,
e.g., capability bits for dirty tracking; when nested translation
is supported, the vIOMMU is going to have more iommufd-related
operations such as allocating a hwpt for a device, attaching/detaching
a hwpt, etc. So IOMMUFDDevice will be further extended to meet those needs.

IOMMUFDDevice is deliberately not a QOM object because we don't want
it to be visible from the user interface.

Introduce a helper iommufd_device_init to initialize IOMMUFDDevice.

Originally-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 14 ++
 backends/iommufd.c   |  6 ++
 2 files changed, 20 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..c3f3469760 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +34,17 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+
+/* Abstraction of host IOMMUFD device */
+typedef struct IOMMUFDDevice {
+HostIOMMUDevice base;
+/* private: */
+
+/* public: */
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+} IOMMUFDDevice;
+
+void iommufd_device_init(IOMMUFDDevice *idev);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 1ef683c7b0..d92791bba9 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -231,3 +231,9 @@ static void register_types(void)
 }
 
 type_init(register_types);
+
+void iommufd_device_init(IOMMUFDDevice *idev)
+{
+host_iommu_base_device_init(&idev->base, HID_IOMMUFD,
+sizeof(IOMMUFDDevice));
+}
-- 
2.34.1




[PATCH rfcv2 06/18] vfio: Introduce host_iommu_device_init callback

2024-01-31 Thread Zhenzhong Duan
Introduce host_iommu_device_init callback and a wrapper for it.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 include/hw/vfio/vfio-container-base.h | 1 +
 hw/vfio/common.c  | 8 
 3 files changed, 10 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 24e3eaaf3d..9c4b60c906 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -216,6 +216,7 @@ struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
+void host_iommu_device_init(VFIODevice *vbasedev);
 
 int vfio_kvm_device_add_fd(int fd, Error **errp);
 int vfio_kvm_device_del_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index b2813b0c11..c71f4abb2d 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -120,6 +120,7 @@ struct VFIOIOMMUClass {
 int (*attach_device)(const char *name, VFIODevice *vbasedev,
  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
+void (*host_iommu_device_init)(VFIODevice *vbasedev);
 /* migration feature */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
bool start);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8b3b575c9d..f7f85160be 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1521,3 +1521,11 @@ void vfio_detach_device(VFIODevice *vbasedev)
 }
 vbasedev->bcontainer->ops->detach_device(vbasedev);
 }
+
+void host_iommu_device_init(VFIODevice *vbasedev)
+{
+const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops;
+
+assert(ops->host_iommu_device_init);
+ops->host_iommu_device_init(vbasedev);
+}
-- 
2.34.1




[PATCH rfcv2 08/18] vfio/iommufd: Implement host_iommu_device_init callback in iommufd mode

2024-01-31 Thread Zhenzhong Duan
This callback will be used to initialize base and public elements
in IOMMUFDDevice, with the exception of iommufd and devid which
are initialized early in attachment.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 5d50549713..7d39d7a5fa 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -621,6 +621,11 @@ out_single:
 return ret;
 }
 
+static void vfio_cdev_host_iommu_device_init(VFIODevice *vbasedev)
+{
+iommufd_device_init(&vbasedev->iommufd_dev);
+}
+
 static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
@@ -630,6 +635,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->attach_device = iommufd_cdev_attach;
 vioc->detach_device = iommufd_cdev_detach;
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
+vioc->host_iommu_device_init = vfio_cdev_host_iommu_device_init;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH rfcv2 09/18] vfio/pci: Initialize host iommu device instance after attachment

2024-01-31 Thread Zhenzhong Duan
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d1e1b8cb89..dedb64fc08 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3006,6 +3006,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 goto error;
 }
 
+/* Initialize host iommu device after attachment succeed */
+host_iommu_device_init(vbasedev);
+
 vfio_populate_device(vdev, &err);
 if (err) {
 error_propagate(errp, err);
-- 
2.34.1




[PATCH rfcv2 12/18] vfio: Initialize host IOMMU device and pass to vIOMMU

2024-01-31 Thread Zhenzhong Duan
Initialize host IOMMU device in vfio and pass to vIOMMU, so that vIOMMU
could get hw IOMMU information.

Support both iommufd and legacy backend.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index dedb64fc08..b23c5ea790 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3112,11 +3112,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-ret = vfio_add_capabilities(vdev, errp);
+ret = pci_device_set_iommu_device(pdev, &vbasedev->base_hdev, errp);
 if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+ret = vfio_add_capabilities(vdev, errp);
+if (ret) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3133,7 +3139,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3142,13 +3148,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
 g_free(opregion);
 if (ret) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3234,6 +3240,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3262,6 +3270,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3276,7 +3285,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1




[PATCH rfcv2 00/18] Check and sync host IOMMU cap/ecap with vIOMMU

2024-01-31 Thread Zhenzhong Duan
Hi,

This enables vIOMMU to get host IOMMU cap/ecap information by introducing
a new set/unset_iommu_device interface, then vIOMMU could check or sync
with vIOMMU's own cap/ecap config.

It works by having the device side, i.e. VFIO, register either an
IOMMULegacyDevice or IOMMUFDDevice to vIOMMU, which includes the necessary
data to achieve that. Currently only VFIO devices are supported, but it could
also be used for other devices, e.g., VDPA.

For a coldplugged device, we can get its host IOMMU cap/ecap during QEMU init,
then check and sync it into the vIOMMU cap/ecap.
For a hotplugged device, the vIOMMU cap/ecap is frozen, so we can only check
against the vIOMMU cap/ecap and are not allowed to update it. If the check
fails, the hotplug fails.

This is also a prerequisite for incoming iommufd nesting series:
'intel_iommu: Enable stage-1 translation'.

I didn't implement cap/ecap sync for the legacy VFIO backend; I would like to
see what Eric wants to put in IOMMULegacyDevice for virtio-iommu and whether
I can utilize some of it.

PATCH1-3: Introduce HostIOMMUDevice and two subclasses
PATCH4-5: Define HostIOMMUDevice instance in VFIODevice
PATCH6-9: Introduce host_iommu_device_init callback to initialize HostIOMMUDevice
PATCH10-12: Introduce set/unset_iommu_device to pass HostIOMMUDevice to vIOMMU
PATCH13-18: Implement cap/ecap check and sync in intel_iommu

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_rfcv2

Thanks
Zhenzhong


Changelog:
rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (3):
  hw/pci: Introduce pci_device_set/unset_iommu_device()
  intel_iommu: Add set/unset_iommu_device callback
  intel_iommu: Add a framework to check and sync host IOMMU cap/ecap

Zhenzhong Duan (15):
  Introduce a common abstract struct HostIOMMUDevice
  backends/iommufd: Introduce IOMMUFDDevice
  vfio: Introduce IOMMULegacyDevice
  vfio: Add host iommu device instance into VFIODevice
  vfio: Remove redundant iommufd and devid elements in VFIODevice
  vfio: Introduce host_iommu_device_init callback
  vfio/container: Implement host_iommu_device_init callback in legacy
mode
  vfio/iommufd: Implement host_iommu_device_init callback in iommufd
mode
  vfio/pci: Initialize host iommu device instance after attachment
  vfio: Initialize host IOMMU device and pass to vIOMMU
  intel_iommu: Extract out vtd_cap_init to initialize cap/ecap
  backends/iommufd: Introduce helper function iommufd_device_get_info()
  intel_iommu: Implement check and sync mechanism in iommufd mode
  intel_iommu: Use mgaw instead of s->aw_bits
  intel_iommu: Block migration if cap is updated

 hw/i386/intel_iommu_internal.h|  15 ++
 include/hw/i386/intel_iommu.h |   4 +
 include/hw/pci/pci.h  |  38 +++-
 include/hw/vfio/vfio-common.h |  20 +-
 include/hw/vfio/vfio-container-base.h |   1 +
 include/sysemu/host_iommu_device.h|  22 ++
 include/sysemu/iommufd.h  |  18 ++
 backends/iommufd.c|  31 ++-
 hw/i386/acpi-build.c  |   3 +-
 hw/i386/intel_iommu.c | 279 --
 hw/pci/pci.c  |  62 +-
 hw/vfio/ap.c  |   2 +-
 hw/vfio/ccw.c |   2 +-
 hw/vfio/common.c  |  10 +-
 hw/vfio/container.c   |   7 +
 hw/vfio/helpers.c |   2 +-
 hw/vfio/iommufd.c |  32 +--
 hw/vfio/pci.c |  25 ++-
 hw/vfio/platform.c|   3 +-
 19 files changed, 488 insertions(+), 88 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h

-- 
2.34.1




[PATCH rfcv2 15/18] backends/iommufd: Introduce helper function iommufd_device_get_info()

2024-01-31 Thread Zhenzhong Duan
Introduce a helper function iommufd_device_get_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  4 
 backends/iommufd.c   | 25 -
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index c3f3469760..ec8b80d8d9 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include 
 #include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
@@ -47,4 +48,7 @@ typedef struct IOMMUFDDevice {
 } IOMMUFDDevice;
 
 void iommufd_device_init(IOMMUFDDevice *idev);
+int iommufd_device_get_info(IOMMUFDDevice *idev,
+enum iommu_hw_info_type *type,
+uint32_t len, void *data, Error **errp);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index d92791bba9..1b0b991747 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -20,7 +20,6 @@
 #include "monitor/monitor.h"
 #include "trace.h"
 #include 
-#include 
 
 static void iommufd_backend_init(Object *obj)
 {
@@ -237,3 +236,27 @@ void iommufd_device_init(IOMMUFDDevice *idev)
 host_iommu_base_device_init(&idev->base, HID_IOMMUFD,
 sizeof(IOMMUFDDevice));
 }
+
+int iommufd_device_get_info(IOMMUFDDevice *idev,
+enum iommu_hw_info_type *type,
+uint32_t len, void *data, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.flags = 0,
+.dev_id = idev->devid,
+.data_len = len,
+.__reserved = 0,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(idev->iommufd->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+*type = info.out_data_type;
+}
+
+return ret;
+}
-- 
2.34.1




[PATCH rfcv2 07/18] vfio/container: Implement host_iommu_device_init callback in legacy mode

2024-01-31 Thread Zhenzhong Duan
This callback will be used to initialize base and public elements
in IOMMULegacyDevice.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index bd25b9fbad..8fafd4b4e5 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1120,6 +1120,12 @@ out_single:
 return ret;
 }
 
+static void vfio_legacy_host_iommu_device_init(VFIODevice *vbasedev)
+{
+host_iommu_base_device_init(&vbasedev->base_hdev, HID_LEGACY,
+sizeof(IOMMULegacyDevice));
+}
+
 static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
@@ -1132,6 +1138,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
 vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
+vioc->host_iommu_device_init = vfio_legacy_host_iommu_device_init;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH rfcv2 17/18] intel_iommu: Use mgaw instead of s->aw_bits

2024-01-31 Thread Zhenzhong Duan
Because vIOMMU mgaw can be updated based on host IOMMU mgaw, s->aw_bits
doesn't necessarily represent the final mgaw now, but the mgaw field in
s->cap does.

Replace references to s->aw_bits with a macro S_AW_BITS that fetches mgaw
from s->cap. There are two exceptions to this: the aw_bits value sanity
check and the s->cap initialization.

ACPI DMAR table is also updated with right mgaw value.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/acpi-build.c  |  3 ++-
 hw/i386/intel_iommu.c | 44 ++-
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index edc979379c..6467157686 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2159,7 +2159,8 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, 
const char *oem_id,
 
 acpi_table_begin(&table, table_data);
 /* Host Address Width */
-build_append_int_noprefix(table_data, intel_iommu->aw_bits - 1, 1);
+build_append_int_noprefix(table_data,
+  VTD_MGAW_FROM_CAP(intel_iommu->cap), 1);
 build_append_int_noprefix(table_data, dmar_flags, 1); /* Flags */
 g_array_append_vals(table_data, rsvd10, sizeof(rsvd10)); /* Reserved */
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 409f8a59c3..72cc8b2c71 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -41,6 +41,8 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 
+#define S_AW_BITS (VTD_MGAW_FROM_CAP(s->cap) + 1)
+
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
 ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
@@ -1409,13 +1411,13 @@ static int 
vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
 {
 /* Legacy Mode reserved bits check */
 if (!s->root_scalable &&
-(re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits
+(re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(S_AW_BITS
 goto rsvd_err;
 
 /* Scalable Mode reserved bits check */
 if (s->root_scalable &&
-((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
- (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits
+((re->lo & VTD_ROOT_ENTRY_RSVD(S_AW_BITS)) ||
+ (re->hi & VTD_ROOT_ENTRY_RSVD(S_AW_BITS
 goto rsvd_err;
 
 return 0;
@@ -1432,7 +1434,7 @@ static inline int 
vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
 {
 if (!s->root_scalable &&
 (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
- ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
+ ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(S_AW_BITS))) {
 error_report_once("%s: invalid context entry: hi=%"PRIx64
   ", lo=%"PRIx64" (reserved nonzero)",
   __func__, ce->hi, ce->lo);
@@ -1440,7 +1442,7 @@ static inline int 
vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
 }
 
 if (s->root_scalable &&
-(ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
+(ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(S_AW_BITS) ||
  ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
  ce->val[2] ||
  ce->val[3])) {
@@ -1571,7 +1573,7 @@ static int 
vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
 .hook_fn = vtd_sync_shadow_page_hook,
 .private = (void *)&vtd_as->iommu,
 .notify_unmap = true,
-.aw = s->aw_bits,
+.aw = S_AW_BITS,
 .as = vtd_as,
 .domain_id = vtd_get_domain_id(s, ce, vtd_as->pasid),
 };
@@ -1990,7 +1992,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 }
 
 ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
-   &reads, &writes, s->aw_bits, pasid);
+   &reads, &writes, S_AW_BITS, pasid);
 if (ret_fr) {
 vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
  addr, is_write, pasid != PCI_NO_PASID, pasid);
@@ -2004,7 +2006,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 out:
 vtd_iommu_unlock(s);
 entry->iova = addr & page_mask;
-entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
+entry->translated_addr = vtd_get_slpte_addr(slpte, S_AW_BITS) & page_mask;
 entry->addr_mask = ~page_mask;
 entry->perm = access_flags;
 return true;
@@ -2021,7 +2023,7 @@ error:
 static void vtd_root_table_setup(IntelIOMMUState *s)
 {
 s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
-s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
+s->root &= VTD_RTADDR_ADDR_MASK(S_AW_BITS);
 
 vtd_update_scalab

[PATCH rfcv2 03/18] vfio: Introduce IOMMULegacyDevice

2024-01-31 Thread Zhenzhong Duan
Similar to IOMMUFDDevice, IOMMULegacyDevice represents a device in
legacy mode and can be used as a communication interface between
devices (e.g., VFIO, VDPA) and vIOMMU.

Currently it includes nothing legacy specific, but could be extended
with any wanted info of legacy mode when necessary.

IOMMULegacyDevice is intentionally not a QOM object because we don't want
it to be visible from the user interface.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9b7ef7d02b..8bfb9cbe94 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -97,6 +98,11 @@ typedef struct VFIOIOMMUFDContainer {
 uint32_t ioas_id;
 } VFIOIOMMUFDContainer;
 
+/* Abstraction of host IOMMU legacy device */
+typedef struct IOMMULegacyDevice {
+HostIOMMUDevice base;
+} IOMMULegacyDevice;
+
 typedef struct VFIODeviceOps VFIODeviceOps;
 
 typedef struct VFIODevice {
-- 
2.34.1




[PATCH rfcv2 14/18] intel_iommu: Add a framework to check and sync host IOMMU cap/ecap

2024-01-31 Thread Zhenzhong Duan
From: Yi Liu 

Add a framework to check and synchronize host IOMMU cap/ecap with
vIOMMU cap/ecap.

The sequence will be:

vtd_cap_init() initializes iommu->cap/ecap.
vtd_check_hdev() updates iommu->cap/ecap based on host cap/ecap.
iommu->cap_frozen is set when machine creation is done; iommu->cap/ecap then
become read-only.

Implementation details for different backends will be in following patches.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 41 ++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index bbc7b96add..c71a133820 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -283,6 +283,7 @@ struct IntelIOMMUState {
 
 uint64_t cap;   /* The value of capability reg */
 uint64_t ecap;  /* The value of extended capability reg */
+bool cap_frozen;/* cap/ecap become read-only after frozen 
*/
 
 uint32_t context_cache_gen; /* Should be in [1,MAX] */
 GHashTable *iotlb;  /* IOTLB */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ffa1ad6429..7ed2b79669 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3819,6 +3819,31 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_check_legacy_hdev(IntelIOMMUState *s,
+ IOMMULegacyDevice *ldev,
+ Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
+  IOMMUFDDevice *idev,
+  Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+  Error **errp)
+{
+HostIOMMUDevice *base_dev = vtd_hdev->dev;
+
+if (base_dev->type == HID_LEGACY) {
+return vtd_check_legacy_hdev(s, vtd_hdev->ldev, errp);
+}
+return vtd_check_iommufd_hdev(s, vtd_hdev->idev, errp);
+}
+
 static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
 HostIOMMUDevice *base_dev, Error **errp)
 {
@@ -3829,6 +3854,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 .devfn = devfn,
 };
 struct vtd_as_key *new_key;
+int ret;
 
 assert(base_dev);
 
@@ -3848,6 +3874,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 vtd_hdev->iommu_state = s;
 vtd_hdev->dev = base_dev;
 
+ret = vtd_check_hdev(s, vtd_hdev, errp);
+if (ret) {
+g_free(vtd_hdev);
+vtd_iommu_unlock(s);
+return ret;
+}
+
 new_key = g_malloc(sizeof(*new_key));
 new_key->bus = bus;
 new_key->devfn = devfn;
@@ -4083,7 +4116,9 @@ static void vtd_init(IntelIOMMUState *s)
 s->iq_dw = false;
 s->next_frcd_reg = 0;
 
-vtd_cap_init(s);
+if (!s->cap_frozen) {
+vtd_cap_init(s);
+}
 
 /*
  * Rsvd field masks for spte
@@ -4254,6 +4289,10 @@ static int vtd_machine_done_notify_one(Object *child, 
void *unused)
 
 static void vtd_machine_done_hook(Notifier *notifier, void *unused)
 {
+IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
+
+iommu->cap_frozen = true;
+
 object_child_foreach_recursive(object_get_root(),
vtd_machine_done_notify_one, NULL);
 }
-- 
2.34.1




[PATCH rfcv2 10/18] hw/pci: Introduce pci_device_set/unset_iommu_device()

2024-01-31 Thread Zhenzhong Duan
From: Yi Liu 

This adds pci_device_set/unset_iommu_device() to set/unset
HostIOMMUDevice for a given PCIe device. Caller of set
should fail if set operation fails.

Extract out pci_device_get_iommu_bus_devfn() to facilitate
implementation of pci_device_set/unset_iommu_device().

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 38 ++-
 hw/pci/pci.c | 62 +---
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index fa6313aabc..5b471fd380 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -384,10 +385,45 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: set iommufd device for a PCI device to vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * utilize iommufd specific features.
+ *
+ * Return 0 if the iommufd device is accepted, or else return a non-zero
+ * error code with errp set.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host assigned device.
+ *
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: unset iommufd device for a PCI device from vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *base_dev,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 76080af580..8078307963 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2672,11 +2672,14 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **aliased_bus,
+   PCIBus **piommu_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2717,13 +2720,66 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
 return &address_space_memory;
 }
 
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *base_dev,
+Error **errp)
+{
+PCIBus *iommu_bus;
+
+pci_device_get_iommu_bus_devfn(dev, NULL, &iommu_bus, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) {
+return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev),
+  iommu_bus->iommu_opaque,
+  dev-&

[PATCH rfcv2 05/18] vfio: Remove redundant iommufd and devid elements in VFIODevice

2024-01-31 Thread Zhenzhong Duan
iommufd and devid in VFIODevice are redundant with the ones
in IOMMUFDDevice, so remove them.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  2 --
 hw/vfio/ap.c  |  2 +-
 hw/vfio/ccw.c |  2 +-
 hw/vfio/common.c  |  2 +-
 hw/vfio/helpers.c |  2 +-
 hw/vfio/iommufd.c | 26 ++
 hw/vfio/pci.c |  2 +-
 hw/vfio/platform.c|  3 ++-
 8 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 1bbad003ee..24e3eaaf3d 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -131,8 +131,6 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
-int devid;
-IOMMUFDBackend *iommufd;
 union {
 HostIOMMUDevice base_hdev;
 IOMMULegacyDevice legacy_dev;
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index e157aa1ff7..11526d93d4 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -198,7 +198,7 @@ static void vfio_ap_unrealize(DeviceState *dev)
 static Property vfio_ap_properties[] = {
 DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev),
 #ifdef CONFIG_IOMMUFD
-DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd,
+DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd_dev.iommufd,
  TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
 #endif
 DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 90e4a53437..b1b75ffa2a 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -667,7 +667,7 @@ static Property vfio_ccw_properties[] = {
 DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev),
 DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false),
 #ifdef CONFIG_IOMMUFD
-DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd,
+DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd_dev.iommufd,
  TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
 #endif
 DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 059bfdc07a..8b3b575c9d 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1505,7 +1505,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
 
-if (vbasedev->iommufd) {
+if (vbasedev->iommufd_dev.iommufd) {
 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
 }
 
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 6789870802..e5457ca326 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -626,7 +626,7 @@ int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
 vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
 }
 } else {
-if (!vbasedev->iommufd) {
+if (!vbasedev->iommufd_dev.iommufd) {
 error_setg(errp, "Use FD passing only with iommufd backend");
 return -EINVAL;
 }
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 9bfddc1360..5d50549713 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -65,7 +65,7 @@ static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev)
 
 static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
 {
-IOMMUFDBackend *iommufd = vbasedev->iommufd;
+IOMMUFDBackend *iommufd = vbasedev->iommufd_dev.iommufd;
 struct vfio_device_bind_iommufd bind = {
 .argsz = sizeof(bind),
 .flags = 0,
@@ -96,9 +96,10 @@ static int iommufd_cdev_connect_and_bind(VFIODevice 
*vbasedev, Error **errp)
 goto err_bind;
 }
 
-vbasedev->devid = bind.out_devid;
+vbasedev->iommufd_dev.devid = bind.out_devid;
 trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
-vbasedev->fd, vbasedev->devid);
+vbasedev->fd,
+vbasedev->iommufd_dev.devid);
 return ret;
 err_bind:
 iommufd_cdev_kvm_device_del(vbasedev);
@@ -111,7 +112,7 @@ static void iommufd_cdev_unbind_and_disconnect(VFIODevice 
*vbasedev)
 {
 /* Unbind is automatically conducted when device fd is closed */
 iommufd_cdev_kvm_device_del(vbasedev);
-iommufd_backend_disconnect(vbasedev->iommufd);
+iommufd_backend_disconnect(vbasedev->iommufd_dev.iommufd);
 }
 
 static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp)
@@ -181,7 +182,7 @@ out_free_path:
 static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
  Error **errp)
 {
-int ret, iommufd = vbasedev->iommufd->fd;
+int ret, io

[PATCH rfcv2 01/18] Introduce a common abstract struct HostIOMMUDevice

2024-01-31 Thread Zhenzhong Duan
HostIOMMUDevice will be inherited by two sub classes,
legacy and iommufd currently.

Introduce a helper function host_iommu_base_device_init to initialize it.

Suggested-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 22 ++
 1 file changed, 22 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..fe80ab25fb
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,22 @@
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+typedef enum HostIOMMUDevice_Type {
+HID_LEGACY,
+HID_IOMMUFD,
+HID_MAX,
+} HostIOMMUDevice_Type;
+
+typedef struct HostIOMMUDevice {
+HostIOMMUDevice_Type type;
+size_t size;
+} HostIOMMUDevice;
+
+static inline void host_iommu_base_device_init(HostIOMMUDevice *dev,
+   HostIOMMUDevice_Type type,
+   size_t size)
+{
+dev->type = type;
+dev->size = size;
+}
+#endif
-- 
2.34.1




[PATCH rfcv2 18/18] intel_iommu: Block migration if cap is updated

2024-01-31 Thread Zhenzhong Duan
When there is a VFIO device and the vIOMMU cap/ecap is updated based on the
host IOMMU cap/ecap, migration should be blocked.

Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 72cc8b2c71..7f9ff653b2 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -39,6 +39,7 @@
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
+#include "migration/blocker.h"
 #include "trace.h"
 
 #define S_AW_BITS (VTD_MGAW_FROM_CAP(s->cap) + 1)
@@ -3829,6 +3830,8 @@ static int vtd_check_legacy_hdev(IntelIOMMUState *s,
 return 0;
 }
 
+static Error *vtd_mig_blocker;
+
 static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
   IOMMUFDDevice *idev,
   Error **errp)
@@ -3860,8 +3863,17 @@ static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
 tmp_cap |= VTD_CAP_MGAW(host_mgaw + 1);
 }
 
-s->cap = tmp_cap;
-return 0;
+if (s->cap != tmp_cap) {
+if (vtd_mig_blocker == NULL) {
+error_setg(&vtd_mig_blocker,
+   "cap/ecap update from host IOMMU block migration");
+ret = migrate_add_blocker(&vtd_mig_blocker, errp);
+}
+if (!ret) {
+s->cap = tmp_cap;
+}
+}
+return ret;
 }
 
 static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
-- 
2.34.1




[PATCH rfcv2 16/18] intel_iommu: Implement check and sync mechanism in iommufd mode

2024-01-31 Thread Zhenzhong Duan
We use cap_frozen to mark cap/ecap as either writable or read-only.
At the init stage, we allow updating cap/ecap based on host IOMMU
cap/ecap, but once machine creation is done, cap_frozen is set and
we only allow checking cap/ecap for compatibility.

Currently only stage-2 translation is supported which is backed by
shadow page table on host side. So we don't need exact matching of
each bit of cap/ecap between vIOMMU and host. However, we can still
ensure compatibility of host and vIOMMU's address width at least,
i.e., vIOMMU's mgaw <= host IOMMU mgaw, which was missing before.

When stage-1 translation is supported in future, a.k.a. scalable
modern mode, this mechanism will be further extended to check more
bits.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  1 +
 include/hw/i386/intel_iommu.h  |  1 +
 hw/i386/intel_iommu.c  | 29 +
 3 files changed, 31 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 3301f54b35..33d2298dce 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -206,6 +206,7 @@
 #define VTD_DOMAIN_ID_MASK  ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
 #define VTD_CAP_ND  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
 #define VTD_ADDRESS_SIZE(aw)(1ULL << (aw))
+#define VTD_CAP_MGAW_MASK   (0x3fULL << 16)
 #define VTD_CAP_MGAW(aw)aw) - 1) & 0x3fULL) << 16)
 #define VTD_MAMV18ULL
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index c71a133820..a0b530ebc6 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  (((cap >> 16) & 0x3fULL) + 1)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7ed2b79669..409f8a59c3 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "sysemu/dma.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/iommufd.h"
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
@@ -3830,6 +3831,34 @@ static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
   IOMMUFDDevice *idev,
   Error **errp)
 {
+struct iommu_hw_info_vtd vtd;
+enum iommu_hw_info_type type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
+long host_mgaw, viommu_mgaw = VTD_MGAW_FROM_CAP(s->cap);
+uint64_t tmp_cap = s->cap;
+int ret;
+
+ret = iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd, errp);
+if (ret) {
+return ret;
+}
+
+if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) {
+error_setg(errp, "IOMMU hardware is not compatible");
+return -EINVAL;
+}
+
+host_mgaw = VTD_MGAW_FROM_CAP(vtd.cap_reg);
+if (viommu_mgaw > host_mgaw) {
+if (s->cap_frozen) {
+error_setg(errp, "mgaw %" PRId64 " > host mgaw %" PRId64,
+   viommu_mgaw, host_mgaw);
+return -EINVAL;
+}
+tmp_cap &= ~VTD_CAP_MGAW_MASK;
+tmp_cap |= VTD_CAP_MGAW(host_mgaw + 1);
+}
+
+s->cap = tmp_cap;
 return 0;
 }
 
-- 
2.34.1




[PATCH rfcv2 11/18] intel_iommu: Add set/unset_iommu_device callback

2024-01-31 Thread Zhenzhong Duan
From: Yi Liu 

This adds set/unset_iommu_device() implementation in Intel vIOMMU.
In set call, a pointer to host IOMMU device info is stored in hash
table indexed by PCI BDF.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h | 14 +++
 include/hw/i386/intel_iommu.h  |  2 +
 hw/i386/intel_iommu.c  | 74 ++
 3 files changed, 90 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..3301f54b35 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -28,6 +28,8 @@
 #ifndef HW_I386_INTEL_IOMMU_INTERNAL_H
 #define HW_I386_INTEL_IOMMU_INTERNAL_H
 #include "hw/i386/intel_iommu.h"
+#include "sysemu/host_iommu_device.h"
+#include "hw/vfio/vfio-common.h"
 
 /*
  * Intel IOMMU register specification
@@ -537,4 +539,16 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+union {
+HostIOMMUDevice *dev;
+IOMMULegacyDevice *ldev;
+IOMMUFDDevice *idev;
+};
+QLIST_ENTRY(VTDHostIOMMUDevice) next;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..bbc7b96add 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -292,6 +292,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1a07faddb4..9b62441439 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -237,6 +237,13 @@ static gboolean vtd_as_equal(gconstpointer v1, 
gconstpointer v2)
(key1->pasid == key2->pasid);
 }
 
+static gboolean vtd_as_idev_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_as_key *key1 = v1;
+const struct vtd_as_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
 /*
  * Note that we use pointer to PCIBus as the key, so hashing/shifting
  * based on the pointer value is intended. Note that we deal with
@@ -3812,6 +3819,68 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *base_dev, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(base_dev);
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+
+if (vtd_hdev) {
+error_setg(errp, "IOMMUFD device already exist");
+vtd_iommu_unlock(s);
+return -EEXIST;
+}
+
+vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hdev->bus = bus;
+vtd_hdev->devfn = (uint8_t)devfn;
+vtd_hdev->iommu_state = s;
+vtd_hdev->dev = base_dev;
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hdev);
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hdev) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4107,6 +4176,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4230,6 +4301,9 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_addres

[PATCH rfcv2 13/18] intel_iommu: Extract out vtd_cap_init to initialize cap/ecap

2024-01-31 Thread Zhenzhong Duan
This is a prerequisite for host cap/ecap sync.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9b62441439..ffa1ad6429 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4003,30 +4003,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -4043,27 +4023,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4096,6 +4055,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1




[PATCH v2 00/10] Add a host IOMMU device abstraction

2024-04-08 Thread Zhenzhong Duan
Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
is further split into a host IOMMU device abstraction part and a vIOMMU
check part. This series implements the 1st part.

This split also makes it easier for the dirty tracking series [2] and the
virtio-iommu series [3] to depend on the 1st part.

The major change in this version is to use QOM, the class tree is as below:

                              HostIOMMUDevice
                                 | .get_host_iommu_info()
                                 |
                                 |
            .------------------------------------.
            |                 |                  |
      HIODLegacyVFIO    [HIODLegacyVDPA]    HIODIOMMUFD
            | .vdev           | [.vdev]          | .iommufd
                                                 | .devid
                                                 | [.ioas_id]
                                                 | [.attach_hwpt()]
                                                 | [.detach_hwpt()]
                                                 |
                                      .----------------------.
                                      |                      |
                               HIODIOMMUFDVFIO        [HIODIOMMUFDVDPA]
                                      | .vdev                | [.vdev]

* The classes in [] will be implemented in future.
* .ioas_id, .attach/detach_hwpt() will be implemented in nesting series.
* .vdev in different class points to different agent device,
* i.e., for VFIO it points to VFIODevice.

PATCH1-4: Introduce HostIOMMUDevice and its sub classes
PATCH5-7: Implement get_host_iommu_info() callback
PATCH8-10: Create HostIOMMUDevice instance and pass to vIOMMU

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part1_v2

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com/
[2] 
https://lore.kernel.org/qemu-devel/20240212135643.5858-1-joao.m.mart...@oracle.com/
[3] 
https://lore.kernel.org/qemu-devel/20240117080414.316890-1-eric.au...@redhat.com/

Thanks
Zhenzhong

Changelog:
v2:
- use QOM to abstract host IOMMU device and its sub-classes (Cédric)
- move host IOMMU device creation in attach_device() (Cédric)
- refine pci_device_set/unset_iommu_device doc further (Eric)
- define host IOMMU info format of different backend
- implement get_host_iommu_info() for different backend (Cédric)

v1:
- use HostIOMMUDevice handle instead of union in VFIODevice (Eric)
- change host_iommu_device_init to host_iommu_device_create
- allocate HostIOMMUDevice in host_iommu_device_create callback
  and set the VFIODevice base_hdev handle (Eric)
- refine pci_device_set/unset_iommu_device doc (Eric)
- use HostIOMMUDevice handle instead of union in VTDHostIOMMUDevice (Eric)

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (1):
  hw/pci: Introduce pci_device_set/unset_iommu_device()

Zhenzhong Duan (9):
  backends: Introduce abstract HostIOMMUDevice
  vfio: Introduce HIODLegacyVFIO device
  backends/iommufd: Introduce abstract HIODIOMMUFD device
  vfio/iommufd: Introduce HIODIOMMUFDVFIO device
  vfio: Implement get_host_iommu_info() callback
  backends/iommufd: Introduce helper function
iommufd_backend_get_device_info()
  backends/iommufd: Implement get_host_iommu_info() callback
  vfio: Create host IOMMU device instance
  vfio: Pass HostIOMMUDevice to vIOMMU

 MAINTAINERS|  2 +
 include/hw/pci/pci.h   | 40 +-
 include/hw/vfio/vfio-common.h  | 23 
 include/sysemu/host_iommu_device.h | 29 ++
 include/sysemu/iommufd.h   | 33 
 backends/host_iommu_device.c   | 19 +++
 backends/iommufd.c | 85 --
 hw/pci/pci.c   | 75 --
 hw/vfio/container.c| 40 +-
 hw/vfio/iommufd.c  | 19 ++-
 hw/vfio/pci.c  | 20 +--
 backends/Kconfig   |  5 ++
 backends/meson.build   |  1 +
 13 files changed, 364 insertions(+), 27 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

-- 
2.34.1




[PATCH v2 03/10] backends/iommufd: Introduce abstract HIODIOMMUFD device

2024-04-08 Thread Zhenzhong Duan
HIODIOMMUFD represents a host IOMMU device under iommufd backend.

Currently it includes only the public iommufd handle and device id,
which can be used to get hw IOMMU information.

When nested translation is supported in the future, vIOMMU is going
to have iommufd related operations like attaching/detaching hwpt,
so the IOMMUFDDevice interface will be further extended at that time.

VFIO and VDPA devices have different ways of attaching/detaching hwpt,
so HIODIOMMUFD is still an abstract class which will be inherited by
VFIO and VDPA devices.

Introduce a helper hiod_iommufd_init() to initialize HIODIOMMUFD
device.

Suggested-by: Cédric Le Goater 
Originally-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 22 +++
 backends/iommufd.c   | 47 ++--
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..71c53cbb45 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +34,25 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+#define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
+OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
+
+struct HIODIOMMUFD {
+/*< private >*/
+HostIOMMUDevice parent;
+void *opaque;
+
+/*< public >*/
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+};
+
+struct HIODIOMMUFDClass {
+/*< private >*/
+HostIOMMUDeviceClass parent_class;
+};
+
+void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
+   uint32_t devid);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 62a79fa6b0..ef8b3a808b 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -212,23 +212,38 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, 
uint32_t ioas_id,
 return ret;
 }
 
-static const TypeInfo iommufd_backend_info = {
-.name = TYPE_IOMMUFD_BACKEND,
-.parent = TYPE_OBJECT,
-.instance_size = sizeof(IOMMUFDBackend),
-.instance_init = iommufd_backend_init,
-.instance_finalize = iommufd_backend_finalize,
-.class_size = sizeof(IOMMUFDBackendClass),
-.class_init = iommufd_backend_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_USER_CREATABLE },
-{ }
-}
-};
+void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
+   uint32_t devid)
+{
+idev->iommufd = iommufd;
+idev->devid = devid;
+}
 
-static void register_types(void)
+static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
 {
-type_register_static(&iommufd_backend_info);
 }
 
-type_init(register_types);
+static const TypeInfo types[] = {
+{
+.name = TYPE_IOMMUFD_BACKEND,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(IOMMUFDBackend),
+.instance_init = iommufd_backend_init,
+.instance_finalize = iommufd_backend_finalize,
+.class_size = sizeof(IOMMUFDBackendClass),
+.class_init = iommufd_backend_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+}, {
+.name = TYPE_HIOD_IOMMUFD,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HIODIOMMUFD),
+.class_size = sizeof(HIODIOMMUFDClass),
+.class_init = hiod_iommufd_class_init,
+.abstract = true,
+}
+};
+
+DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v2 04/10] vfio/iommufd: Introduce HIODIOMMUFDVFIO device

2024-04-08 Thread Zhenzhong Duan
HIODIOMMUFDVFIO represents a host IOMMU device under VFIO iommufd
backend. It will be created during VFIO device attaching and passed
to vIOMMU.

It includes a link to VFIODevice so that we can do VFIO device
specific hwpt attaching/detaching.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 11 +++
 hw/vfio/iommufd.c | 11 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f30772f534..d382b12ec1 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
 #include "sysemu/host_iommu_device.h"
+#include "sysemu/iommufd.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -158,6 +159,16 @@ struct HIODLegacyVFIO {
 VFIODevice *vdev;
 };
 
+#define TYPE_HIOD_IOMMUFD_VFIO TYPE_HIOD_IOMMUFD "-vfio"
+OBJECT_DECLARE_SIMPLE_TYPE(HIODIOMMUFDVFIO, HIOD_IOMMUFD_VFIO)
+
+/* Abstraction of VFIO IOMMUFD host IOMMU device */
+struct HIODIOMMUFDVFIO {
+/*< private >*/
+HIODIOMMUFD parent;
+VFIODevice *vdev;
+};
+
 typedef struct VFIODMABuf {
 QemuDmaBuf buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 8827ffe636..115b9f8e7f 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -634,12 +634,21 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
 };
 
+static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data)
+{
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_iommufd_class_init,
-},
+}, {
+.name = TYPE_HIOD_IOMMUFD_VFIO,
+.parent = TYPE_HIOD_IOMMUFD,
+.instance_size = sizeof(HIODIOMMUFDVFIO),
+.class_init = hiod_iommufd_vfio_class_init,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v2 06/10] backends/iommufd: Introduce helper function iommufd_backend_get_device_info()

2024-04-08 Thread Zhenzhong Duan
Introduce a helper function iommufd_backend_get_device_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  4 
 backends/iommufd.c   | 23 ++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 71c53cbb45..fa1a866237 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include 
 #include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
@@ -34,6 +35,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp);
 
 #define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index ef8b3a808b..559affa9ec 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -20,7 +20,6 @@
 #include "monitor/monitor.h"
 #include "trace.h"
 #include 
-#include 
 
 static void iommufd_backend_init(Object *obj)
 {
@@ -212,6 +211,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
 return ret;
 }
 
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(be->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+*type = info.out_data_type;
+}
+
+return ret;
+}
+
 void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
uint32_t devid)
 {
-- 
2.34.1




[PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

2024-04-08 Thread Zhenzhong Duan
HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
container backend.

It includes a link to VFIODevice.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 11 +++
 hw/vfio/container.c   | 11 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..f30772f534 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -147,6 +148,16 @@ typedef struct VFIOGroup {
 bool ram_block_discard_allowed;
 } VFIOGroup;
 
+#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"
+OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
+
+/* Abstraction of VFIO legacy host IOMMU device */
+struct HIODLegacyVFIO {
+/*< private >*/
+HostIOMMUDevice parent;
+VFIODevice *vdev;
+};
+
 typedef struct VFIODMABuf {
 QemuDmaBuf buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..44018ef085 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,12 +1143,21 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
+{
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_LEGACY,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_legacy_class_init,
-},
+}, {
+.name = TYPE_HIOD_LEGACY_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HIODLegacyVFIO),
+.class_init = hiod_legacy_vfio_class_init,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v2 10/10] vfio: Pass HostIOMMUDevice to vIOMMU

2024-04-08 Thread Zhenzhong Duan
With HostIOMMUDevice passed, vIOMMU can check compatibility with host
IOMMU, call into IOMMUFD specific methods, etc.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64780d1b79..224501a86e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3111,11 +3111,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-ret = vfio_add_capabilities(vdev, errp);
+ret = pci_device_set_iommu_device(pdev, vbasedev->hiod, errp);
 if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+ret = vfio_add_capabilities(vdev, errp);
+if (ret) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3132,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3141,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
 g_free(opregion);
 if (ret) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3233,6 +3239,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3261,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3275,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1




[PATCH v2 07/10] backends/iommufd: Implement get_host_iommu_info() callback

2024-04-08 Thread Zhenzhong Duan
It calls iommufd_backend_get_device_info() to get host IOMMU
related information.

Define a common structure HIOD_IOMMUFD_INFO to describe the info
returned from the kernel. Currently only vtd is covered, but it is easy
to add arm smmu once the kernel supports it.

Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  7 +++
 backends/iommufd.c   | 17 +
 2 files changed, 24 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index fa1a866237..44ec1335b2 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -39,6 +39,13 @@ int iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
 enum iommu_hw_info_type *type,
 void *data, uint32_t len, Error **errp);
 
+typedef struct HIOD_IOMMUFD_INFO {
+enum iommu_hw_info_type type;
+union {
+struct iommu_hw_info_vtd vtd;
+} data;
+} HIOD_IOMMUFD_INFO;
+
 #define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
 
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 559affa9ec..1e9c469e65 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -240,8 +240,25 @@ void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend 
*iommufd,
 idev->devid = devid;
 }
 
+static int hiod_iommufd_get_host_iommu_info(HostIOMMUDevice *hiod,
+void *data, uint32_t len,
+Error **errp)
+{
+HIODIOMMUFD *idev = HIOD_IOMMUFD(hiod);
+HIOD_IOMMUFD_INFO *info = data;
+
+assert(sizeof(HIOD_IOMMUFD_INFO) <= len);
+
+return iommufd_backend_get_device_info(idev->iommufd, idev->devid,
+   &info->type, &info->data,
+   sizeof(info->data), errp);
+}
+
 static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
 {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hiodc->get_host_iommu_info = hiod_iommufd_get_host_iommu_info;
 }
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v2 01/10] backends: Introduce abstract HostIOMMUDevice

2024-04-08 Thread Zhenzhong Duan
Introduce HostIOMMUDevice as an abstraction of host IOMMU device.

get_host_iommu_info() is used to get host IOMMU info, different
backends can have different implementations and result format.

Introduce a macro CONFIG_HOST_IOMMU_DEVICE to define the usage
for VFIO, and VDPA in the future.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  2 ++
 include/sysemu/host_iommu_device.h | 19 +++
 backends/host_iommu_device.c   | 19 +++
 backends/Kconfig   |  5 +
 backends/meson.build   |  1 +
 5 files changed, 46 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e71183eef9..22f71cbe02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2202,6 +2202,8 @@ M: Zhenzhong Duan 
 S: Supported
 F: backends/iommufd.c
 F: include/sysemu/iommufd.h
+F: backends/host_iommu_device.c
+F: include/sysemu/host_iommu_device.h
 F: include/qemu/chardev_open.h
 F: util/chardev_open.c
 F: docs/devel/vfio-iommufd.rst
diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..22ccbe3a5d
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,19 @@
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+#include "qom/object.h"
+
+#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
+OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
+
+struct HostIOMMUDevice {
+Object parent;
+};
+
+struct HostIOMMUDeviceClass {
+ObjectClass parent_class;
+
+int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data, uint32_t len,
+   Error **errp);
+};
+#endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
new file mode 100644
index 00..6cb6007d8c
--- /dev/null
+++ b/backends/host_iommu_device.c
@@ -0,0 +1,19 @@
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+host_iommu_device,
+HOST_IOMMU_DEVICE,
+OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+}
diff --git a/backends/Kconfig b/backends/Kconfig
index 2cb23f62fa..34ab29e994 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -3,3 +3,8 @@ source tpm/Kconfig
 config IOMMUFD
 bool
 depends on VFIO
+
+config HOST_IOMMU_DEVICE
+bool
+default y
+depends on VFIO
diff --git a/backends/meson.build b/backends/meson.build
index 8b2b111497..2e975d641e 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -25,6 +25,7 @@ if have_vhost_user
 endif
 system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost.c'))
 system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
+system_ss.add(when: 'CONFIG_HOST_IOMMU_DEVICE', if_true: 
files('host_iommu_device.c'))
 if have_vhost_user_crypto
   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost-user.c'))
 endif
-- 
2.34.1




[PATCH v2 05/10] vfio: Implement get_host_iommu_info() callback

2024-04-08 Thread Zhenzhong Duan
Utilize iova_ranges to calculate host IOMMU address width and
package it in HIOD_LEGACY_INFO for vIOMMU usage.

HIOD_LEGACY_INFO will be used by both VFIO and VDPA so declare
it in host_iommu_device.h.

Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 10 ++
 hw/vfio/container.c| 24 
 2 files changed, 34 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index 22ccbe3a5d..beb8be8231 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -16,4 +16,14 @@ struct HostIOMMUDeviceClass {
 int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data, uint32_t len,
Error **errp);
 };
+
+/*
+ * Define the format of host IOMMU related info that current VFIO
+ * or VDPA can provide to vIOMMU.
+ *
+ * @aw_bits: Host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HIOD_LEGACY_INFO {
+uint8_t aw_bits;
+} HIOD_LEGACY_INFO;
 #endif
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 44018ef085..ba0ad4a41b 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,8 +1143,32 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static int hiod_legacy_vfio_get_host_iommu_info(HostIOMMUDevice *hiod,
+void *data, uint32_t len,
+Error **errp)
+{
+VFIODevice *vbasedev = HIOD_LEGACY_VFIO(hiod)->vdev;
+/* iova_ranges is a sorted list */
+GList *l = g_list_last(vbasedev->bcontainer->iova_ranges);
+HIOD_LEGACY_INFO *info = data;
+
+assert(sizeof(HIOD_LEGACY_INFO) <= len);
+
+if (l) {
+Range *range = l->data;
+info->aw_bits = find_last_bit(&range->upb, BITS_PER_LONG) + 1;
+} else {
+info->aw_bits = 0xff;
+}
+
+return 0;
+}
+
 static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
 {
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->get_host_iommu_info = hiod_legacy_vfio_get_host_iommu_info;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v2 08/10] vfio: Create host IOMMU device instance

2024-04-08 Thread Zhenzhong Duan
Create host IOMMU device instance and initialize it based on backend.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 hw/vfio/container.c   | 5 +
 hw/vfio/iommufd.c | 8 
 3 files changed, 14 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index d382b12ec1..4fbba85018 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -126,6 +126,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *hiod;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ba0ad4a41b..fc0c027501 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -915,6 +915,7 @@ static int vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 VFIODevice *vbasedev_iter;
 VFIOGroup *group;
 VFIOContainerBase *bcontainer;
+HIODLegacyVFIO *hiod_vfio;
 int ret;
 
 if (groupid < 0) {
@@ -945,6 +946,9 @@ static int vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 vbasedev->bcontainer = bcontainer;
 QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
 QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+hiod_vfio = HIOD_LEGACY_VFIO(object_new(TYPE_HIOD_LEGACY_VFIO));
+hiod_vfio->vdev = vbasedev;
+vbasedev->hiod = HOST_IOMMU_DEVICE(hiod_vfio);
 
 return ret;
 }
@@ -959,6 +963,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev)
 trace_vfio_detach_device(vbasedev->name, group->groupid);
 vfio_put_base_device(vbasedev);
 vfio_put_group(group);
+object_unref(vbasedev->hiod);
 }
 
 static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 115b9f8e7f..b6d058339b 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -308,6 +308,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice 
*vbasedev,
 VFIOIOMMUFDContainer *container;
 VFIOAddressSpace *space;
 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+HIODIOMMUFDVFIO *hiod_vfio;
 int ret, devfd;
 uint32_t ioas_id;
 Error *err = NULL;
@@ -431,6 +432,12 @@ found_container:
 QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
 QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
 
+hiod_vfio = HIOD_IOMMUFD_VFIO(object_new(TYPE_HIOD_IOMMUFD_VFIO));
+hiod_iommufd_init(HIOD_IOMMUFD(hiod_vfio), vbasedev->iommufd,
+  vbasedev->devid);
+hiod_vfio->vdev = vbasedev;
+vbasedev->hiod = HOST_IOMMU_DEVICE(hiod_vfio);
+
 trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
vbasedev->num_regions, vbasedev->flags);
 return 0;
@@ -468,6 +475,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
 iommufd_cdev_detach_container(vbasedev, container);
 iommufd_cdev_container_destroy(container);
 vfio_put_address_space(space);
+object_unref(vbasedev->hiod);
 
 iommufd_cdev_unbind_and_disconnect(vbasedev);
 close(vbasedev->fd);
-- 
2.34.1




[PATCH v2 09/10] hw/pci: Introduce pci_device_set/unset_iommu_device()

2024-04-08 Thread Zhenzhong Duan
From: Yi Liu 

This adds pci_device_set/unset_iommu_device() to set/unset
HostIOMMUDevice for a given PCI device. Caller of set
should fail if set operation fails.

Extract out pci_device_get_iommu_bus_devfn() to facilitate
implementation of pci_device_set/unset_iommu_device().

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 40 ++-
 hw/pci/pci.c | 75 ++--
 2 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..4ae7fe6f3f 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -383,10 +384,47 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * Return 0 if HostIOMMUDevice is attached, or else return a non-zero
+ * error code with errp set.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ * @errp: pass an Error out only on failure
+ *
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e7a39cb203..8ece617673 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2648,11 +2648,27 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+/*
+ * Get IOMMU root bus, aliased bus and devfn of a PCI device
+ *
+ * IOMMU root bus is needed by all call sites to call into iommu_ops.
+ * For call sites which don't need aliased BDF, passing NULL to
+ * aliased_[bus/devfn] is allowed.
+ *
+ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device.
+ *
+ * @aliased_bus: return aliased #PCIBus of the PCI device, optional.
+ *
+ * @aliased_devfn: return aliased devfn of the PCI device, optional.
+ */
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **piommu_bus,
+   PCIBus **aliased_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2693,13 +2709,66 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
 return &add

[PATCH v2 4/5] intel_iommu: Check for compatibility with legacy device

2024-04-08 Thread Zhenzhong Duan
Currently only stage-2 translation is supported which is backed by
shadow page table on host side. So we don't need exact matching of
each bit of cap/ecap between vIOMMU and host. However, we can still
ensure compatibility of host and vIOMMU's address width at least,
i.e., vIOMMU's aw-bits <= host IOMMU aw-bits, which is missed before.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a49b587c73..d2cd186df0 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3824,6 +3824,21 @@ static int vtd_check_legacy_hdev(IntelIOMMUState *s,
  HostIOMMUDevice *hiod,
  Error **errp)
 {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+HIOD_LEGACY_INFO info;
+int ret;
+
+ret = hiodc->get_host_iommu_info(hiod, &info, sizeof(info), errp);
+if (ret) {
+return ret;
+}
+
+if (s->aw_bits > info.aw_bits) {
+error_setg(errp, "aw-bits %d > host aw-bits %d",
+   s->aw_bits, info.aw_bits);
+return -EINVAL;
+}
+
 return 0;
 }
 
-- 
2.34.1




[PATCH v2 0/5] Check host IOMMU compatibility with vIOMMU

2024-04-08 Thread Zhenzhong Duan
Hi,

Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
is further split into a host IOMMU device abstraction part [2] and a vIOMMU
check part. This series implements the 2nd part.

1st part implements get_host_iommu_info() callback which vIOMMU can call to
get host IOMMU info. For legacy VFIO or VDPA device, aw_bits is provided;
for IOMMUFD backed device, IOMMUFD uAPI provides detailed cap/ecap bits from
host.

vIOMMU implements set/unset_iommu_device() callback to get HostIOMMUDevice
and call get_host_iommu_info(). So vIOMMU can do compatibility check with
the returned host IOMMU info.

This is also a prerequisite for incoming iommufd nesting series:
'intel_iommu: Enable stage-1 translation' where HostIOMMUDevice provides
more data such as iommufd/devid/ioas_id and callback attach/detach_hwpt()
for vIOMMU to create nested hwpt, attaching/detaching hwpt, etc.

The major change of this version is dropping the cap/ecap update logic based
on MST's suggestion. We can add a property for any cap/ecap bit when necessary,
just like "aw-bits". This way we don't need to worry about migration
compatibility and the code is cleaner.

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part2_v2

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com/
[2] https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00763.html

Thanks
Zhenzhong

Changelog:
v2:
- drop cap/ecap update logic (MST)
- check aw-bits from get_host_iommu_info() in legacy mode

v1:
- convert HostIOMMUDevice to sub object pointer in vtd_check_hdev

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (2):
  intel_iommu: Implement set/unset_iommu_device() callback
  intel_iommu: Add a framework to do compatibility check with host IOMMU
cap/ecap

Zhenzhong Duan (3):
  intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap
  intel_iommu: Check for compatibility with legacy device
  intel_iommu: Check for compatibility with iommufd backed device

 hw/i386/intel_iommu_internal.h |   8 ++
 include/hw/i386/intel_iommu.h  |   3 +
 hw/i386/intel_iommu.c  | 242 +++--
 3 files changed, 211 insertions(+), 42 deletions(-)

-- 
2.34.1




[PATCH v2 1/5] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap

2024-04-08 Thread Zhenzhong Duan
Extract cap/ecap initialization in vtd_cap_init() to make code
cleaner.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index cc8e59674e..519063c8f8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3934,30 +3934,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -3974,27 +3954,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4027,6 +3986,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1




[PATCH v2 2/5] intel_iommu: Implement set/unset_iommu_device() callback

2024-04-08 Thread Zhenzhong Duan
From: Yi Liu 

Implement set/unset_iommu_device() callback in Intel vIOMMU.
In set call, a new structure VTDHostIOMMUDevice which holds
a reference to HostIOMMUDevice is stored in hash table
indexed by PCI BDF.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  8 
 include/hw/i386/intel_iommu.h  |  2 +
 hw/i386/intel_iommu.c  | 76 ++
 3 files changed, 86 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..becafd03c1 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -537,4 +537,12 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+HostIOMMUDevice *dev;
+QLIST_ENTRY(VTDHostIOMMUDevice) next;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..bbc7b96add 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -292,6 +292,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 519063c8f8..4f84e2e801 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -237,6 +237,13 @@ static gboolean vtd_as_equal(gconstpointer v1, 
gconstpointer v2)
(key1->pasid == key2->pasid);
 }
 
+static gboolean vtd_as_idev_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_as_key *key1 = v1;
+const struct vtd_as_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
 /*
  * Note that we use pointer to PCIBus as the key, so hashing/shifting
  * based on the pointer value is intended. Note that we deal with
@@ -3812,6 +3819,70 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *hiod, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(hiod);
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+
+if (vtd_hdev) {
+error_setg(errp, "IOMMUFD device already exist");
+vtd_iommu_unlock(s);
+return -EEXIST;
+}
+
+vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hdev->bus = bus;
+vtd_hdev->devfn = (uint8_t)devfn;
+vtd_hdev->iommu_state = s;
+vtd_hdev->dev = hiod;
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+object_ref(hiod);
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hdev);
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hdev) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+object_unref(vtd_hdev->dev);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4116,6 +4187,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4235,6 +4308,9 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
   g_free, g_free);
+s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_as_hash,
+  vtd_as_idev_equal,
+

[PATCH v2 3/5] intel_iommu: Add a framework to do compatibility check with host IOMMU cap/ecap

2024-04-08 Thread Zhenzhong Duan
From: Yi Liu 

If the check fails, the host-side device (either a vfio or vdpa device) should
not be passed to the guest.

Implementation details for different backends will be in following patches.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4f84e2e801..a49b587c73 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "sysemu/dma.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/iommufd.h"
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
@@ -3819,6 +3820,32 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_check_legacy_hdev(IntelIOMMUState *s,
+ HostIOMMUDevice *hiod,
+ Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
+  HostIOMMUDevice *hiod,
+  Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+  Error **errp)
+{
+HostIOMMUDevice *hiod = vtd_hdev->dev;
+
+if (object_dynamic_cast(OBJECT(hiod), TYPE_HIOD_IOMMUFD)) {
+return vtd_check_iommufd_hdev(s, hiod, errp);
+}
+
+return vtd_check_legacy_hdev(s, hiod, errp);
+}
+
 static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
 HostIOMMUDevice *hiod, Error **errp)
 {
@@ -3829,6 +3856,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 .devfn = devfn,
 };
 struct vtd_as_key *new_key;
+int ret;
 
 assert(hiod);
 
@@ -3848,6 +3876,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 vtd_hdev->iommu_state = s;
 vtd_hdev->dev = hiod;
 
+ret = vtd_check_hdev(s, vtd_hdev, errp);
+if (ret) {
+g_free(vtd_hdev);
+vtd_iommu_unlock(s);
+return ret;
+}
+
 new_key = g_malloc(sizeof(*new_key));
 new_key->bus = bus;
 new_key->devfn = devfn;
-- 
2.34.1




[PATCH v2 5/5] intel_iommu: Check for compatibility with iommufd backed device

2024-04-08 Thread Zhenzhong Duan
Currently only stage-2 translation is supported which is backed by
shadow page table on host side. So we don't need exact matching of
each bit of cap/ecap between vIOMMU and host. However, we can still
ensure compatibility of host and vIOMMU's address width at least,
i.e., vIOMMU's aw-bits <= host IOMMU aw-bits, which is missed before.

When stage-1 translation is supported in future, a.k.a. scalable
modern mode, this mechanism will be further extended to check more
bits.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 23 +++
 2 files changed, 24 insertions(+)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index bbc7b96add..2bbde41e45 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  ((cap >> 16) & 0x3fULL)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d2cd186df0..d8fac9ef9f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3846,6 +3846,29 @@ static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
   HostIOMMUDevice *hiod,
   Error **errp)
 {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+struct iommu_hw_info_vtd *vtd;
+HIOD_IOMMUFD_INFO info;
+int host_aw_bits, ret;
+
+ret = hiodc->get_host_iommu_info(hiod, &info, sizeof(info), errp);
+if (ret) {
+return ret;
+}
+
+if (info.type != IOMMU_HW_INFO_TYPE_INTEL_VTD) {
+error_setg(errp, "IOMMU hardware is not compatible");
+return -EINVAL;
+}
+
+vtd = &info.data.vtd;
+host_aw_bits = VTD_MGAW_FROM_CAP(vtd->cap_reg) + 1;
+if (s->aw_bits > host_aw_bits) {
+error_setg(errp, "aw-bits %d > host aw-bits %d",
+   s->aw_bits, host_aw_bits);
+return -EINVAL;
+}
+
 return 0;
 }
 
-- 
2.34.1




[PATCH v2 3/3] qom/object_interfaces: Remove local_err in user_creatable_add_type

2024-03-17 Thread Zhenzhong Duan
In user_creatable_add_type, there is mixed usage of ERRP_GUARD and
local_err. This prevents error_abort from taking effect in those callee
functions that are passed &local_err.

Now that we already use ERRP_GUARD, remove local_err and pass errp.

Signed-off-by: Zhenzhong Duan 
---
 qom/object_interfaces.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index e17e2de46d..2067bf2230 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -84,7 +84,6 @@ Object *user_creatable_add_type(const char *type, const char 
*id,
 ERRP_GUARD();
 Object *obj;
 ObjectClass *klass;
-Error *local_err = NULL;
 
 if (id != NULL && !id_wellformed(id)) {
 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id", "an identifier");
@@ -112,16 +111,16 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 
 assert(qdict);
 obj = object_new(type);
-if (!object_set_properties_from_qdict(obj, qdict, v, &local_err)) {
+if (!object_set_properties_from_qdict(obj, qdict, v, errp)) {
 goto err;
 }
 
 if (id != NULL && !object_property_try_add_child(object_get_objects_root(),
- id, obj, &local_err)) {
+ id, obj, errp)) {
 goto err;
 }
 
-if (!user_creatable_complete(USER_CREATABLE(obj), &local_err)) {
+if (!user_creatable_complete(USER_CREATABLE(obj), errp)) {
 if (id != NULL) {
 object_property_del(object_get_objects_root(), id);
 }
@@ -129,7 +128,6 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 }
 return obj;
 err:
-error_propagate(errp, local_err);
 object_unref(obj);
 return NULL;
 }
-- 
2.34.1




[PATCH v2 1/3] qom/object_interfaces: Remove unnecessary local error check

2024-03-17 Thread Zhenzhong Duan
The original error handling code indicates "local_err is always set",
and error_propagate() can handle the case that local_err is NULL.

Use err label instead of out label for error path.

Reviewed-by: Zhao Liu 
Signed-off-by: Zhenzhong Duan 
---
 qom/object_interfaces.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index e0833c8bfe..70179877f1 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -111,14 +111,14 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 obj = object_new(type);
 object_set_properties_from_qdict(obj, qdict, v, &local_err);
 if (local_err) {
-goto out;
+goto err;
 }
 
 if (id != NULL) {
 object_property_try_add_child(object_get_objects_root(),
   id, obj, &local_err);
 if (local_err) {
-goto out;
+goto err;
 }
 }
 
@@ -126,15 +126,13 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 if (id != NULL) {
 object_property_del(object_get_objects_root(), id);
 }
-goto out;
-}
-out:
-if (local_err) {
-error_propagate(errp, local_err);
-object_unref(obj);
-return NULL;
+goto err;
 }
 return obj;
+err:
+error_propagate(errp, local_err);
+object_unref(obj);
+return NULL;
 }
 
 void user_creatable_add_qapi(ObjectOptions *options, Error **errp)
-- 
2.34.1




[PATCH v2 2/3] qom/object_interfaces: Make object_set_properties_from_qdict return bool

2024-03-17 Thread Zhenzhong Duan
Make object_set_properties_from_qdict() return bool, so that
user_creatable_add_type() could check its return value instead
of local_err pointer.

Opportunistically, do the same change to check return value of
object_property_try_add_child() instead of local_err pointer.

Suggested-by: Zhao Liu 
Signed-off-by: Zhenzhong Duan 
---
 qom/object_interfaces.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index 70179877f1..e17e2de46d 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -43,22 +43,25 @@ bool user_creatable_can_be_deleted(UserCreatable *uc)
 }
 }
 
-static void object_set_properties_from_qdict(Object *obj, const QDict *qdict,
+static bool object_set_properties_from_qdict(Object *obj, const QDict *qdict,
  Visitor *v, Error **errp)
 {
 const QDictEntry *e;
+bool ret;
 
 if (!visit_start_struct(v, NULL, NULL, 0, errp)) {
-return;
+return false;
 }
 for (e = qdict_first(qdict); e; e = qdict_next(qdict, e)) {
-if (!object_property_set(obj, e->key, v, errp)) {
+ret = object_property_set(obj, e->key, v, errp);
+if (!ret) {
 goto out;
 }
 }
-visit_check_struct(v, errp);
+ret = visit_check_struct(v, errp);
 out:
 visit_end_struct(v, NULL);
+return ret;
 }
 
 void object_set_properties_from_keyval(Object *obj, const QDict *qdict,
@@ -109,17 +112,13 @@ Object *user_creatable_add_type(const char *type, const 
char *id,
 
 assert(qdict);
 obj = object_new(type);
-object_set_properties_from_qdict(obj, qdict, v, &local_err);
-if (local_err) {
+if (!object_set_properties_from_qdict(obj, qdict, v, &local_err)) {
 goto err;
 }
 
-if (id != NULL) {
-object_property_try_add_child(object_get_objects_root(),
-  id, obj, &local_err);
-if (local_err) {
+if (id != NULL && !object_property_try_add_child(object_get_objects_root(),
+ id, obj, &local_err)) {
 goto err;
-}
 }
 
 if (!user_creatable_complete(USER_CREATABLE(obj), &local_err)) {
-- 
2.34.1




[PATCH v2 0/3] Simplify user_creatable_add_type error path

2024-03-17 Thread Zhenzhong Duan
Hi,

This is a simplification to user_creatable_add_type error path.
Removed local_err and its check in error path, check return value
instead.

Tested with make check and guest bootup.

Thanks
Zhenzhong

Changelog:
v2:
- Use err label to replace out label (Zhao Liu)
- Refine patch description (Zhao Liu)
- Make object_set_properties_from_qdict return bool (Zhao Liu)
- Check return value of object_property_try_add_child (Zhao Liu)
- Add R-B

Zhenzhong Duan (3):
  qom/object_interfaces: Remove unnecessary local error check
  qom/object_interfaces: Make object_set_properties_from_qdict return
bool
  qom/object_interfaces: Remove local_err in user_creatable_add_type

 qom/object_interfaces.c | 39 +--
 1 file changed, 17 insertions(+), 22 deletions(-)

-- 
2.34.1




[PATCH v3 0/1] Introduce Icelake-Server-v7 to enable TSX

2024-03-20 Thread Zhenzhong Duan
Hi,

This is a new effort trying to enable TSX in Icelake model.

Currently Icelake-Server-v3 and above has TSX disabled but taa-no enabled.
This is an invalid config, as taa-no implies that TSX exists and is not
vulnerable. When starting an L2 guest with both L1/L2 using Icelake-Server-v3
or above, QEMU reports the warning below:

"warning: host doesn't support requested feature: MSR(10AH).taa-no [bit 8]"


Unlike the v2 patch at
https://www.mail-archive.com/qemu-devel@nongnu.org/msg907730.html
this patch adds a new version, Icelake-Server-v7, so the compatibility of old
versions is guaranteed.

Comments welcome!

Thanks
Zhenzhong

Zhenzhong Duan (1):
  target/i386: Introduce Icelake-Server-v7 to enable TSX

 target/i386/cpu.c | 10 ++
 1 file changed, 10 insertions(+)

-- 
2.34.1




[PATCH v3 1/1] target/i386: Introduce Icelake-Server-v7 to enable TSX

2024-03-20 Thread Zhenzhong Duan
When starting an L2 guest with both L1/L2 using Icelake-Server-v3 or above,
QEMU reports the warning below:

"warning: host doesn't support requested feature: MSR(10AH).taa-no [bit 8]"

The reason is that QEMU's Icelake-Server-v3 has the TSX feature disabled but
enables the taa-no bit. It is meaningless to claim that TSX is secure when TSX
isn't supported. So L1 KVM doesn't expose taa-no to L2 if TSX is unsupported,
and starting L2 then triggers the warning.

Fix it by introducing a new version Icelake-Server-v7 which has both TSX
and taa-no features. Then the guest can use TSX securely when it sees taa-no.

This matches production Icelake, which supports TSX and isn't susceptible
to TSX Async Abort (TAA) vulnerabilities, a.k.a. taa-no.

Ideally, TSX should have been enabled together with taa-no since v3, but for
compatibility, it is better to add v7 to enable it.

Fixes: d965dc35592d ("target/i386: Add ARCH_CAPABILITIES related bits into 
Icelake-Server CPU model")
Tested-by: Xiangfei Ma 
Signed-off-by: Zhenzhong Duan 
---
 target/i386/cpu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 9a210d8d92..5f2191cd99 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3822,6 +3822,16 @@ static const X86CPUDefinition builtin_x86_defs[] = {
 { /* end of list */ }
 },
 },
+{
+.version = 7,
+.note = "TSX, taa-no",
+.props = (PropValue[]) {
+/* Restore TSX features removed by -v2 above */
+{ "hle", "on" },
+{ "rtm", "on" },
+{ /* end of list */ }
+},
+},
 { /* end of list */ }
 }
 },
-- 
2.34.1




[PATCH v3 01/19] backends: Introduce HostIOMMUDevice abstract

2024-04-28 Thread Zhenzhong Duan
Introduce HostIOMMUDevice as an abstraction of host IOMMU device.

Introduce .realize() to initialize HostIOMMUDevice further after
instance init.

Introduce a macro CONFIG_HOST_IOMMU_DEVICE to define the usage
for VFIO, and VDPA in the future.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  2 ++
 include/sysemu/host_iommu_device.h | 51 ++
 backends/host_iommu_device.c   | 30 ++
 backends/Kconfig   |  5 +++
 backends/meson.build   |  1 +
 5 files changed, 89 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 302b6fd00c..f67cd36b34 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2190,6 +2190,8 @@ M: Zhenzhong Duan 
 S: Supported
 F: backends/iommufd.c
 F: include/sysemu/iommufd.h
+F: backends/host_iommu_device.c
+F: include/sysemu/host_iommu_device.h
 F: include/qemu/chardev_open.h
 F: util/chardev_open.c
 F: docs/devel/vfio-iommufd.rst
diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..2b58a94d62
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,51 @@
+/*
+ * Host IOMMU device abstract declaration
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Authors: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+#include "qom/object.h"
+#include "qapi/error.h"
+
+#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
+OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
+
+struct HostIOMMUDevice {
+Object parent_obj;
+};
+
+/**
+ * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices.
+ *
+ * Different type of host devices (e.g., VFIO or VDPA device) or devices
+ * with different backend (e.g., VFIO legacy container or IOMMUFD backend)
+ * can have different sub-classes.
+ */
+struct HostIOMMUDeviceClass {
+ObjectClass parent_class;
+
+/**
+ * @realize: initialize host IOMMU device instance further.
+ *
+ * Mandatory callback.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @opaque: pointer to agent device of this host IOMMU device,
+ *  i.e., for VFIO, pointer to VFIODevice
+ *
+ * @errp: pass an Error out when realize fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+};
+#endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
new file mode 100644
index 00..41f2fdce20
--- /dev/null
+++ b/backends/host_iommu_device.c
@@ -0,0 +1,30 @@
+/*
+ * Host IOMMU device abstract
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Authors: Zhenzhong Duan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+host_iommu_device,
+HOST_IOMMU_DEVICE,
+OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+}
diff --git a/backends/Kconfig b/backends/Kconfig
index 2cb23f62fa..34ab29e994 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -3,3 +3,8 @@ source tpm/Kconfig
 config IOMMUFD
 bool
 depends on VFIO
+
+config HOST_IOMMU_DEVICE
+bool
+default y
+depends on VFIO
diff --git a/backends/meson.build b/backends/meson.build
index 8b2b111497..2e975d641e 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -25,6 +25,7 @@ if have_vhost_user
 endif
 system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost.c'))
 system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
+system_ss.add(when: 'CONFIG_HOST_IOMMU_DEVICE', if_true: 
files('host_iommu_device.c'))
 if have_vhost_user_crypto
   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost-user.c'))
 endif
-- 
2.34.1




[PATCH v3 02/19] vfio/container: Introduce HostIOMMUDeviceLegacyVFIO device

2024-04-28 Thread Zhenzhong Duan
HostIOMMUDeviceLegacyVFIO represents a host IOMMU device under VFIO
legacy container backend.

It includes a link to VFIODevice.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 12 
 hw/vfio/container.c   |  6 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..aa3abe0a18 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -147,6 +148,17 @@ typedef struct VFIOGroup {
 bool ram_block_discard_allowed;
 } VFIOGroup;
 
+#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE 
"-legacy-vfio"
+OBJECT_DECLARE_SIMPLE_TYPE(HostIOMMUDeviceLegacyVFIO,
+   HOST_IOMMU_DEVICE_LEGACY_VFIO)
+
+/* Abstract of host IOMMU device with VFIO legacy container backend */
+struct HostIOMMUDeviceLegacyVFIO {
+HostIOMMUDevice parent_obj;
+
+VFIODevice *vdev;
+};
+
 typedef struct VFIODMABuf {
 QemuDmaBuf buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..3b6826996a 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1148,7 +1148,11 @@ static const TypeInfo types[] = {
 .name = TYPE_VFIO_IOMMU_LEGACY,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_legacy_class_init,
-},
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HostIOMMUDeviceLegacyVFIO),
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v3 00/19] Add a host IOMMU device abstraction to check with vIOMMU

2024-04-28 Thread Zhenzhong Duan
Hi,

The most important change in this version is introducing a common
HostIOMMUDeviceCaps structure in HostIOMMUDevice and a new interface
between vIOMMU and HostIOMMUDevice.

HostIOMMUDeviceClass::realize() is introduced to initialize
HostIOMMUDeviceCaps and other fields of HostIOMMUDevice variants.

HostIOMMUDeviceClass::check_cap() is introduced to query host IOMMU
device capabilities.

After the change, part2 is only 3 patches, so merge it with part1 to be
a single prerequisite series, same for changelog. If anyone doesn't like
that, I can split again.

The class tree is as below:

  HostIOMMUDevice
 | .caps
 | .realize()
 | .check_cap()
 |
.---.
||  |
HostIOMMUDeviceLegacyVFIO  {HostIOMMUDeviceLegacyVDPA}  HostIOMMUDeviceIOMMUFD
| .vdev  | {.vdev}  | .iommufd
| .devid
| [.ioas_id]
| [.attach_hwpt()]
| [.detach_hwpt()]
|
  .--.
  |  |
   HostIOMMUDeviceIOMMUFDVFIO  {HostIOMMUDeviceIOMMUFDVDPA}
  | .vdev| {.vdev}

* The attributes in [] will be implemented in nesting series.
* The classes in {} will be implemented in future.
* .vdev in different class points to different agent device,
* i.e., for VFIO it points to VFIODevice.

PATCH1-4: Introduce HostIOMMUDevice and its sub classes
PATCH5-11: Introduce HostIOMMUDeviceCaps, implement .realize() and .check_cap() 
handler
PATCH12-16: Create HostIOMMUDevice instance and pass to vIOMMU
PATCH17-19: Implement compatibility check between host IOMMU and 
vIOMMU(intel_iommu)

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_v3

Besides the compatibility check in this series, in nesting series, this
host IOMMU device is extended for much wider usage. For anyone interested
in the nesting series, here is the link:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_rfcv2

Thanks
Zhenzhong

Changelog:
v3:
- refine declaration and doc for HostIOMMUDevice (Cédric, Philippe)
- introduce HostIOMMUDeviceCaps, .realize() and .check_cap() (Cédric)
- introduce helper range_get_last_bit() for range operation (Cédric)
- separate pci_device_get_iommu_bus_devfn() in a prereq patch (Cédric)
- replace HIOD_ abbreviation with HOST_IOMMU_DEVICE_ (Cédric)
- add header in include/sysemu/iommufd.h (Cédric)

v2:
- use QOM to abstract host IOMMU device and its sub-classes (Cédric)
- move host IOMMU device creation in attach_device() (Cédric)
- refine pci_device_set/unset_iommu_device doc further (Eric)
- define host IOMMU info format of different backend
- implement get_host_iommu_info() for different backend (Cédric)
- drop cap/ecap update logic (MST)
- check aw-bits from get_host_iommu_info() in legacy mode

v1:
- use HostIOMMUDevice handle instead of union in VFIODevice (Eric)
- change host_iommu_device_init to host_iommu_device_create
- allocate HostIOMMUDevice in host_iommu_device_create callback
  and set the VFIODevice base_hdev handle (Eric)
- refine pci_device_set/unset_iommu_device doc (Eric)
- use HostIOMMUDevice handle instead of union in VTDHostIOMMUDevice (Eric)
- convert HostIOMMUDevice to sub object pointer in vtd_check_hdev

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B

Yi Liu (2):
  hw/pci: Introduce pci_device_[set|unset]_iommu_device()
  intel_iommu: Implement [set|unset]_iommu_device() callbacks

Zhenzhong Duan (17):
  backends: Introduce HostIOMMUDevice abstract
  vfio/container: Introduce HostIOMMUDeviceLegacyVFIO device
  backends/iommufd: Introduce abstract HostIOMMUDeviceIOMMUFD device
  vfio/iommufd: Introduce HostIO

[PATCH v3 04/19] vfio/iommufd: Introduce HostIOMMUDeviceIOMMUFDVFIO device

2024-04-28 Thread Zhenzhong Duan
HostIOMMUDeviceIOMMUFDVFIO represents a host IOMMU device under VFIO
iommufd backend. It will be created during VFIO device attaching and
passed to vIOMMU.

It includes a link to VFIODevice so that we can do VFIO device
specific operations, i.e., [at/de]taching hwpt, etc.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 13 +
 hw/vfio/iommufd.c |  6 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index aa3abe0a18..0943add3bc 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
 #include "sysemu/host_iommu_device.h"
+#include "sysemu/iommufd.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -159,6 +160,18 @@ struct HostIOMMUDeviceLegacyVFIO {
 VFIODevice *vdev;
 };
 
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \
+TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio"
+OBJECT_DECLARE_SIMPLE_TYPE(HostIOMMUDeviceIOMMUFDVFIO,
+   HOST_IOMMU_DEVICE_IOMMUFD_VFIO)
+
+/* Abstraction of host IOMMU device with VFIO IOMMUFD backend */
+struct HostIOMMUDeviceIOMMUFDVFIO {
+HostIOMMUDeviceIOMMUFD parent;
+
+VFIODevice *vdev;
+};
+
 typedef struct VFIODMABuf {
 QemuDmaBuf buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 8827ffe636..997f4ac43e 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -639,7 +639,11 @@ static const TypeInfo types[] = {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_iommufd_class_init,
-},
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+.instance_size = sizeof(HostIOMMUDeviceIOMMUFDVFIO),
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v3 11/19] backends/iommufd: Implement HostIOMMUDeviceClass::check_cap() handler

2024-04-28 Thread Zhenzhong Duan
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 backends/iommufd.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index d61209788a..28faec528e 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -233,6 +233,23 @@ int iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
 return ret;
 }
 
+static int hiod_iommufd_check_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
+{
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_IOMMUFD:
+return 1;
+default:
+return host_iommu_device_check_cap_common(hiod, cap, errp);
+}
+}
+
+static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->check_cap = hiod_iommufd_check_cap;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_IOMMUFD_BACKEND,
@@ -251,6 +268,7 @@ static const TypeInfo types[] = {
 .parent = TYPE_HOST_IOMMU_DEVICE,
 .instance_size = sizeof(HostIOMMUDeviceIOMMUFD),
 .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass),
+.class_init = hiod_iommufd_class_init,
 .abstract = true,
 }
 };
-- 
2.34.1




[PATCH v3 10/19] vfio/container: Implement HostIOMMUDeviceClass::check_cap() handler

2024-04-28 Thread Zhenzhong Duan
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 863eec3943..3683487605 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1164,11 +1164,23 @@ static bool hiod_legacy_vfio_realize(HostIOMMUDevice 
*hiod, void *opaque,
 return true;
 }
 
+static int hiod_legacy_vfio_check_cap(HostIOMMUDevice *hiod, int cap,
+  Error **errp)
+{
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_IOMMUFD:
+return 0;
+default:
+return host_iommu_device_check_cap_common(hiod, cap, errp);
+}
+}
+
 static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
 {
 HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
 
 hioc->realize = hiod_legacy_vfio_realize;
+hioc->check_cap = hiod_legacy_vfio_check_cap;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1




[PATCH v3 09/19] vfio/iommufd: Implement HostIOMMUDeviceClass::realize() handler

2024-04-28 Thread Zhenzhong Duan
It calls iommufd_backend_get_device_info() to get host IOMMU
related information and translate it into HostIOMMUDeviceCaps
for query with .check_cap().

Introduce macro VTD_MGAW_FROM_CAP to get MGAW, which equals
(aw_bits - 1).

Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/vfio/iommufd.c | 44 +++
 2 files changed, 45 insertions(+)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..7d694b0813 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  ((cap >> 16) & 0x3fULL)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 997f4ac43e..6bc2dc68f6 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -25,6 +25,7 @@
 #include "qemu/cutils.h"
 #include "qemu/chardev_open.h"
 #include "pci.h"
+#include "hw/i386/intel_iommu_internal.h"
 
 static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly)
@@ -634,6 +635,48 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
 };
 
+static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
+  Error **errp)
+{
+VFIODevice *vdev = opaque;
+HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod);
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+enum iommu_hw_info_type type;
+union {
+struct iommu_hw_info_vtd vtd;
+} data;
+int ret;
+
+HOST_IOMMU_DEVICE_IOMMUFD_VFIO(hiod)->vdev = vdev;
+idev->iommufd = vdev->iommufd;
+idev->devid = vdev->devid;
+
+ret = iommufd_backend_get_device_info(idev->iommufd, idev->devid,
+  &type, &data, sizeof(data), errp);
+if (ret) {
+return false;
+}
+
+caps->type = type;
+
+switch (type) {
+case IOMMU_HW_INFO_TYPE_INTEL_VTD:
+caps->aw_bits = VTD_MGAW_FROM_CAP(data.vtd.cap_reg) + 1;
+break;
+case IOMMU_HW_INFO_TYPE_NONE:
+break;
+}
+
+return true;
+}
+
+static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hiodc->realize = hiod_iommufd_vfio_realize;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
@@ -643,6 +686,7 @@ static const TypeInfo types[] = {
 .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO,
 .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
 .instance_size = sizeof(HostIOMMUDeviceIOMMUFDVFIO),
+.class_init = hiod_iommufd_vfio_class_init,
 }
 };
 
-- 
2.34.1




[PATCH v3 16/19] vfio/pci: Pass HostIOMMUDevice to vIOMMU

2024-04-28 Thread Zhenzhong Duan
With HostIOMMUDevice passed, vIOMMU can check compatibility with host
IOMMU, call into IOMMUFD specific methods, etc.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64780d1b79..224501a86e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3111,11 +3111,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-ret = vfio_add_capabilities(vdev, errp);
+ret = pci_device_set_iommu_device(pdev, vbasedev->hiod, errp);
 if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+ret = vfio_add_capabilities(vdev, errp);
+if (ret) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3132,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3141,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
 g_free(opregion);
 if (ret) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3233,6 +3239,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3261,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3275,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1




[PATCH v3 15/19] hw/pci: Introduce pci_device_[set|unset]_iommu_device()

2024-04-28 Thread Zhenzhong Duan
From: Yi Liu 

pci_device_[set|unset]_iommu_device() call pci_device_get_iommu_bus_devfn()
to get iommu_bus->iommu_ops and call [set|unset]_iommu_device callback to
set/unset HostIOMMUDevice for a given PCI device.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 38 +-
 hw/pci/pci.c | 27 +++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..849e391813 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -383,10 +384,45 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ * @errp: pass an Error out only when return false
+ *
+ * Returns: 0 if HostIOMMUDevice is attached, or else <0 with errp set.
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 02a4bb2af6..c3293e9357 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2742,6 +2742,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 return &address_space_memory;
 }
 
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+Error **errp)
+{
+PCIBus *iommu_bus;
+
+/* set_iommu_device requires device's direct BDF instead of aliased BDF */
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) {
+return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev),
+  iommu_bus->iommu_opaque,
+  dev->devfn, hiod, errp);
+}
+return 0;
+}
+
+void pci_device_unset_iommu_device(PCIDevice *dev)
+{
+PCIBus *iommu_bus;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) {
+return iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev),
+
iommu_bus->iommu_opaque,
+dev->devfn);
+}
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 /*
-- 
2.34.1




[PATCH v3 07/19] vfio/container: Implement HostIOMMUDeviceClass::realize() handler

2024-04-28 Thread Zhenzhong Duan
Utilize range_get_last_bit() to get host IOMMU address width and
package it in HostIOMMUDeviceCaps for query with .check_cap().

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 3b6826996a..863eec3943 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,6 +1143,34 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque,
+ Error **errp)
+{
+VFIODevice *vdev = opaque;
+/* iova_ranges is a sorted list */
+GList *l = g_list_last(vdev->bcontainer->iova_ranges);
+
+/* There is no VFIO uAPI to query host platform IOMMU type */
+hiod->caps.type = IOMMU_HW_INFO_TYPE_NONE;
+HOST_IOMMU_DEVICE_IOMMUFD_VFIO(hiod)->vdev = vdev;
+
+if (l) {
+Range *range = l->data;
+hiod->caps.aw_bits = range_get_last_bit(range) + 1;
+} else {
+hiod->caps.aw_bits = 0xff;
+}
+
+return true;
+}
+
+static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
+{
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->realize = hiod_legacy_vfio_realize;
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_LEGACY,
@@ -1152,6 +1180,7 @@ static const TypeInfo types[] = {
 .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO,
 .parent = TYPE_HOST_IOMMU_DEVICE,
 .instance_size = sizeof(HostIOMMUDeviceLegacyVFIO),
+.class_init = hiod_legacy_vfio_class_init,
 }
 };
 
-- 
2.34.1




[PATCH v3 18/19] intel_iommu: Implement [set|unset]_iommu_device() callbacks

2024-04-28 Thread Zhenzhong Duan
From: Yi Liu 

Implement [set|unset]_iommu_device() callbacks in Intel vIOMMU.
In set call, a new structure VTDHostIOMMUDevice which holds
a reference to HostIOMMUDevice is stored in hash table
indexed by PCI BDF.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  8 
 include/hw/i386/intel_iommu.h  |  2 +
 hw/i386/intel_iommu.c  | 76 ++
 3 files changed, 86 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..becafd03c1 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -537,4 +537,12 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+HostIOMMUDevice *dev;
+QLIST_ENTRY(VTDHostIOMMUDevice) next;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7d694b0813..2bbde41e45 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -293,6 +293,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 519063c8f8..4f84e2e801 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -237,6 +237,13 @@ static gboolean vtd_as_equal(gconstpointer v1, 
gconstpointer v2)
(key1->pasid == key2->pasid);
 }
 
+static gboolean vtd_as_idev_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_as_key *key1 = v1;
+const struct vtd_as_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
 /*
  * Note that we use pointer to PCIBus as the key, so hashing/shifting
  * based on the pointer value is intended. Note that we deal with
@@ -3812,6 +3819,70 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *hiod, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(hiod);
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+
+if (vtd_hdev) {
+error_setg(errp, "IOMMUFD device already exist");
+vtd_iommu_unlock(s);
+return -EEXIST;
+}
+
+vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hdev->bus = bus;
+vtd_hdev->devfn = (uint8_t)devfn;
+vtd_hdev->iommu_state = s;
+vtd_hdev->dev = hiod;
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+object_ref(hiod);
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hdev);
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hdev) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+object_unref(vtd_hdev->dev);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4116,6 +4187,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4235,6 +4308,9 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
   g_free, g_free);
+s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_as_hash,
+  vtd_as_idev_equal,
+

[PATCH v3 13/19] vfio: Create host IOMMU device instance

2024-04-28 Thread Zhenzhong Duan
Create host IOMMU device instance in vfio_attach_device() and call
.realize() to initialize it further.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  1 +
 hw/vfio/common.c  | 18 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 0943add3bc..b204b93a55 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -126,6 +126,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *hiod;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8f9cbdc026..0be8b70ebd 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1497,6 +1497,8 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
 {
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
+HostIOMMUDevice *hiod;
+int ret;
 
 if (vbasedev->iommufd) {
 ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
@@ -1504,7 +1506,20 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
 
 assert(ops);
 
-return ops->attach_device(name, vbasedev, as, errp);
+ret = ops->attach_device(name, vbasedev, as, errp);
+if (ret < 0) {
+return ret;
+}
+
+hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename));
+if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
+object_unref(hiod);
+ops->detach_device(vbasedev);
+return -EINVAL;
+}
+vbasedev->hiod = hiod;
+
+return 0;
 }
 
 void vfio_detach_device(VFIODevice *vbasedev)
@@ -1512,5 +1527,6 @@ void vfio_detach_device(VFIODevice *vbasedev)
 if (!vbasedev->bcontainer) {
 return;
 }
+object_unref(vbasedev->hiod);
 vbasedev->bcontainer->ops->detach_device(vbasedev);
 }
-- 
2.34.1




[PATCH v3 19/19] intel_iommu: Check compatibility with host IOMMU capabilities

2024-04-28 Thread Zhenzhong Duan
If check fails, host device (either VFIO or VDPA device) is not
compatible with current vIOMMU config and should not be passed to
guest.

Only aw_bits is checked for now; we don't care about other capabilities
before scalable modern mode is introduced.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4f84e2e801..4a295c41cc 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3819,6 +3819,26 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+  Error **errp)
+{
+HostIOMMUDevice *hiod = vtd_hdev->dev;
+int ret;
+
+/* Common checks */
+ret = host_iommu_device_check_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS,
+  errp);
+if (ret < 0) {
+return ret;
+}
+if (s->aw_bits > ret) {
+error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret);
+return -EINVAL;
+}
+
+return 0;
+}
+
 static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
 HostIOMMUDevice *hiod, Error **errp)
 {
@@ -3829,6 +3849,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 .devfn = devfn,
 };
 struct vtd_as_key *new_key;
+int ret;
 
 assert(hiod);
 
@@ -3848,6 +3869,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 vtd_hdev->iommu_state = s;
 vtd_hdev->dev = hiod;
 
+ret = vtd_check_hdev(s, vtd_hdev, errp);
+if (ret) {
+g_free(vtd_hdev);
+vtd_iommu_unlock(s);
+return ret;
+}
+
 new_key = g_malloc(sizeof(*new_key));
 new_key->bus = bus;
 new_key->devfn = devfn;
-- 
2.34.1




[PATCH v3 05/19] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps

2024-04-28 Thread Zhenzhong Duan
HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities.
Different platform IOMMU can support different elements.

Currently there are only two elements, type and aw_bits: type hints the
host platform IOMMU type, e.g., INTEL vtd, ARM smmu, etc.; aw_bits hints
the host IOMMU address width.

Introduce .check_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX
is supported.

Introduce a HostIOMMUDevice API host_iommu_device_check_cap() which
is a wrapper of .check_cap().

Introduce a HostIOMMUDevice API host_iommu_device_check_cap_common()
to check common capabilities of different host platform IOMMUs.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 44 ++
 backends/host_iommu_device.c   | 29 
 2 files changed, 73 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index 2b58a94d62..12b6afb463 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -14,12 +14,27 @@
 
 #include "qom/object.h"
 #include "qapi/error.h"
+#include "linux/iommufd.h"
+
+/**
+ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities.
+ *
+ * @type: host platform IOMMU type.
+ *
+ * @aw_bits: host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HostIOMMUDeviceCaps {
+enum iommu_hw_info_type type;
+uint8_t aw_bits;
+} HostIOMMUDeviceCaps;
 
 #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
 OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
 
 struct HostIOMMUDevice {
 Object parent_obj;
+
+HostIOMMUDeviceCaps caps;
 };
 
 /**
@@ -47,5 +62,34 @@ struct HostIOMMUDeviceClass {
  * Returns: true on success, false on failure.
  */
 bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+/**
+ * @check_cap: check if a host IOMMU device capability is supported.
+ *
+ * Optional callback, if not implemented, hint not supporting query
+ * of @cap.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @cap: capability to check.
+ *
+ * @errp: pass an Error out when fails to query capability.
+ *
+ * Returns: <0 on failure, 0 if a @cap is unsupported, or else
+ * 1 or some positive value for some special @cap,
+ * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS.
+ */
+int (*check_cap)(HostIOMMUDevice *hiod, int cap, Error **errp);
 };
+
+/*
+ * Host IOMMU device capability list.
+ */
+#define HOST_IOMMU_DEVICE_CAP_IOMMUFD   0
+#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE1
+#define HOST_IOMMU_DEVICE_CAP_AW_BITS   2
+
+
+int host_iommu_device_check_cap(HostIOMMUDevice *hiod, int cap, Error **errp);
+int host_iommu_device_check_cap_common(HostIOMMUDevice *hiod, int cap,
+   Error **errp);
 #endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
index 41f2fdce20..b97d008cc7 100644
--- a/backends/host_iommu_device.c
+++ b/backends/host_iommu_device.c
@@ -28,3 +28,32 @@ static void host_iommu_device_init(Object *obj)
 static void host_iommu_device_finalize(Object *obj)
 {
 }
+
+/* Wrapper of HostIOMMUDeviceClass:check_cap */
+int host_iommu_device_check_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+if (!hiodc->check_cap) {
+error_setg(errp, ".check_cap() not implemented");
+return -EINVAL;
+}
+
+return hiodc->check_cap(hiod, cap, errp);
+}
+
+/* Implement check on common IOMMU capabilities */
+int host_iommu_device_check_cap_common(HostIOMMUDevice *hiod, int cap,
+   Error **errp)
+{
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE:
+return caps->type;
+case HOST_IOMMU_DEVICE_CAP_AW_BITS:
+return caps->aw_bits;
+default:
+error_setg(errp, "Not support query cap %x", cap);
+return -EINVAL;
+}
+}
-- 
2.34.1




[PATCH v3 14/19] hw/pci: Introduce helper function pci_device_get_iommu_bus_devfn()

2024-04-28 Thread Zhenzhong Duan
Extract out pci_device_get_iommu_bus_devfn() from
pci_device_iommu_address_space() to facilitate
implementation of pci_device_[set|unset]_iommu_device()
in following patch.

No functional change intended.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 hw/pci/pci.c | 48 +---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 324c1302d2..02a4bb2af6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2648,11 +2648,27 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+/*
+ * Get IOMMU root bus, aliased bus and devfn of a PCI device
+ *
+ * IOMMU root bus is needed by all call sites to call into iommu_ops.
+ * For call sites which don't need aliased BDF, passing NULL to
+ * aliased_[bus|devfn] is allowed.
+ *
+ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device.
+ *
+ * @aliased_bus: return aliased #PCIBus of the PCI device, optional.
+ *
+ * @aliased_devfn: return aliased devfn of the PCI device, optional.
+ */
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **piommu_bus,
+   PCIBus **aliased_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2693,7 +2709,33 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
-- 
2.34.1




[PATCH v3 08/19] backends/iommufd: Introduce helper function iommufd_backend_get_device_info()

2024-04-28 Thread Zhenzhong Duan
Introduce a helper function iommufd_backend_get_device_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  4 
 backends/iommufd.c   | 24 +++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 6a9fb0007a..e9593637a3 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -17,6 +17,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include 
 #include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
@@ -47,6 +48,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp);
 
 #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass,
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 19e46194a2..d61209788a 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -19,7 +19,6 @@
 #include "monitor/monitor.h"
 #include "trace.h"
 #include 
-#include 
 
 static void iommufd_backend_init(Object *obj)
 {
@@ -211,6 +210,29 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
 return ret;
 }
 
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(be->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+g_assert(type);
+*type = info.out_data_type;
+}
+
+return ret;
+}
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_IOMMUFD_BACKEND,
-- 
2.34.1




[PATCH v3 12/19] vfio: Introduce VFIOIOMMUClass::hiod_typename attribute

2024-04-28 Thread Zhenzhong Duan
Initialize attribute VFIOIOMMUClass::hiod_typename based on
VFIO backend type.

This attribute will facilitate HostIOMMUDevice creation in
vfio_attach_device().

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-container-base.h | 3 +++
 hw/vfio/container.c   | 2 ++
 hw/vfio/iommufd.c | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 3582d5f97a..c387f0d8a4 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -110,6 +110,9 @@ DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, 
TYPE_VFIO_IOMMU)
 struct VFIOIOMMUClass {
 InterfaceClass parent_class;
 
+/* Properties */
+const char *hiod_typename;
+
 /* basic feature */
 int (*setup)(VFIOContainerBase *bcontainer, Error **errp);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 3683487605..57c814fcd5 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1133,6 +1133,8 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
 
+vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO;
+
 vioc->setup = vfio_legacy_setup;
 vioc->dma_map = vfio_legacy_dma_map;
 vioc->dma_unmap = vfio_legacy_dma_unmap;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 6bc2dc68f6..1ac7dea789 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -628,6 +628,8 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 {
 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
 
+vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO;
+
 vioc->dma_map = iommufd_cdev_map;
 vioc->dma_unmap = iommufd_cdev_unmap;
 vioc->attach_device = iommufd_cdev_attach;
-- 
2.34.1




[PATCH v3 17/19] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap

2024-04-28 Thread Zhenzhong Duan
Extract cap/ecap initialization in vtd_cap_init() to make code
cleaner.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index cc8e59674e..519063c8f8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3934,30 +3934,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -3974,27 +3954,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4027,6 +3986,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1




[PATCH v3 03/19] backends/iommufd: Introduce abstract HostIOMMUDeviceIOMMUFD device

2024-04-28 Thread Zhenzhong Duan
HostIOMMUDeviceIOMMUFD represents a host IOMMU device under iommufd
backend.

Currently it contains public iommufd handle and device id which
will be passed to vIOMMU to allocate/free ioas, hwpt, etc.

When nested translation is supported in future, vIOMMU will
request iommufd related operations like attaching/detaching hwpt.
VFIO and VDPA device have different way of attaching/detaching hwpt.
So HostIOMMUDeviceIOMMUFD is still an abstract class which will be
inherited by VFIO and VDPA device sub-classes.

Opportunistically, add missed header to include/sysemu/iommufd.h.

Suggested-by: Cédric Le Goater 
Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 30 ++
 backends/iommufd.c   | 37 -
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..6a9fb0007a 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -1,9 +1,23 @@
+/*
+ * iommufd container backend declaration
+ *
+ * Copyright (C) 2024 Intel Corporation.
+ * Copyright Red Hat, Inc. 2024
+ *
+ * Authors: Yi Liu 
+ *  Eric Auger 
+ *  Zhenzhong Duan 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
 #ifndef SYSEMU_IOMMUFD_H
 #define SYSEMU_IOMMUFD_H
 
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +47,20 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
+OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass,
+HOST_IOMMU_DEVICE_IOMMUFD)
+
+/* Abstract of host IOMMU device with iommufd backend */
+struct HostIOMMUDeviceIOMMUFD {
+HostIOMMUDevice parent_obj;
+
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+};
+
+struct HostIOMMUDeviceIOMMUFDClass {
+HostIOMMUDeviceClass parent_class;
+};
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 76a0204852..19e46194a2 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -211,23 +211,26 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, 
uint32_t ioas_id,
 return ret;
 }
 
-static const TypeInfo iommufd_backend_info = {
-.name = TYPE_IOMMUFD_BACKEND,
-.parent = TYPE_OBJECT,
-.instance_size = sizeof(IOMMUFDBackend),
-.instance_init = iommufd_backend_init,
-.instance_finalize = iommufd_backend_finalize,
-.class_size = sizeof(IOMMUFDBackendClass),
-.class_init = iommufd_backend_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_USER_CREATABLE },
-{ }
+static const TypeInfo types[] = {
+{
+.name = TYPE_IOMMUFD_BACKEND,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(IOMMUFDBackend),
+.instance_init = iommufd_backend_init,
+.instance_finalize = iommufd_backend_finalize,
+.class_size = sizeof(IOMMUFDBackendClass),
+.class_init = iommufd_backend_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+}, {
+.name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HostIOMMUDeviceIOMMUFD),
+.class_size = sizeof(HostIOMMUDeviceIOMMUFDClass),
+.abstract = true,
 }
 };
 
-static void register_types(void)
-{
-type_register_static(&iommufd_backend_info);
-}
-
-type_init(register_types);
+DEFINE_TYPES(types)
-- 
2.34.1




[PATCH v3 06/19] range: Introduce range_get_last_bit()

2024-04-28 Thread Zhenzhong Duan
This helper gets the position of the highest set bit of the upper bound.

If the range is empty or upper bound is zero, -1 is returned.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/qemu/range.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/qemu/range.h b/include/qemu/range.h
index 205e1da76d..8e05bc1d9f 100644
--- a/include/qemu/range.h
+++ b/include/qemu/range.h
@@ -20,6 +20,8 @@
 #ifndef QEMU_RANGE_H
 #define QEMU_RANGE_H
 
+#include "qemu/bitops.h"
+
 /*
  * Operations on 64 bit address ranges.
  * Notes:
@@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t 
len1,
 return !(last2 < first1 || last1 < first2);
 }
 
+/* Get highest non-zero bit position of a range */
+static inline int range_get_last_bit(Range *range)
+{
+if (range_is_empty(range) || !range->upb) {
+return -1;
+}
+return find_last_bit(&range->upb, sizeof(range->upb));
+}
+
 /*
  * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap.
  * Both @a and @b must not be empty.
-- 
2.34.1




[PATCH 0/3] Cleanup VFIOIOMMUClass callback return with bool

2024-05-06 Thread Zhenzhong Duan
Hi

This is a cleanup series to change VFIOIOMMUClass callbacks to return
bool when the error is passed through errp parameter.

See discussion at 
https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg04782.html

It looks like many functions in the VFIO sub-system need the same change,
so this can be the very first series.

Test done on x86 platform:
vfio device hotplug/unplug with different backend
reboot

Thanks
Zhenzhong

Zhenzhong Duan (3):
  vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool
  vfio: Make VFIOIOMMUClass::setup() return bool
  vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool

 include/hw/vfio/vfio-common.h |  4 ++--
 include/hw/vfio/vfio-container-base.h | 18 -
 hw/vfio/ap.c  |  6 ++
 hw/vfio/ccw.c |  6 ++
 hw/vfio/common.c  |  6 +++---
 hw/vfio/container-base.c  |  8 
 hw/vfio/container.c   | 24 +++
 hw/vfio/iommufd.c | 11 +--
 hw/vfio/pci.c |  8 +++-
 hw/vfio/platform.c|  7 +++
 hw/vfio/spapr.c   | 28 +--
 11 files changed, 58 insertions(+), 68 deletions(-)

-- 
2.34.1




[PATCH 2/3] vfio: Make VFIOIOMMUClass::setup() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standard to return bool if 'Error **'
is used to pass an error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-container-base.h |  2 +-
 hw/vfio/container.c   | 10 +-
 hw/vfio/spapr.c   | 12 +---
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index c839cfd9cb..68539e3bed 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -111,7 +111,7 @@ struct VFIOIOMMUClass {
 InterfaceClass parent_class;
 
 /* basic feature */
-int (*setup)(VFIOContainerBase *bcontainer, Error **errp);
+bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ea3b145913..85a8a369dc 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -505,7 +505,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer 
*container,
 }
 }
 
-static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
+static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
 {
 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 bcontainer);
@@ -515,7 +515,7 @@ static int vfio_legacy_setup(VFIOContainerBase *bcontainer, 
Error **errp)
 ret = vfio_get_iommu_info(container, &info);
 if (ret) {
 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
-return ret;
+return false;
 }
 
 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
@@ -531,7 +531,7 @@ static int vfio_legacy_setup(VFIOContainerBase *bcontainer, 
Error **errp)
 vfio_get_info_iova_range(info, bcontainer);
 
 vfio_get_iommu_info_migration(container, info);
-return 0;
+return true;
 }
 
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
@@ -633,8 +633,8 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 
 assert(bcontainer->ops->setup);
 
-ret = bcontainer->ops->setup(bcontainer, errp);
-if (ret) {
+if (!bcontainer->ops->setup(bcontainer, errp)) {
+ret = -EINVAL;
 goto enable_discards_exit;
 }
 
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 0d949bb728..148b257c9c 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -458,8 +458,8 @@ static void vfio_spapr_container_release(VFIOContainerBase 
*bcontainer)
 }
 }
 
-static int vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
-  Error **errp)
+static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
+   Error **errp)
 {
 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 bcontainer);
@@ -480,7 +480,7 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
 if (ret) {
 error_setg_errno(errp, errno, "failed to enable container");
-return -errno;
+return false;
 }
 } else {
 scontainer->prereg_listener = vfio_prereg_listener;
@@ -488,7 +488,6 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 memory_listener_register(&scontainer->prereg_listener,
  &address_space_memory);
 if (bcontainer->error) {
-ret = -1;
 error_propagate_prepend(errp, bcontainer->error,
 "RAM memory listener initialization failed: ");
 goto listener_unregister_exit;
@@ -500,7 +499,6 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 if (ret) {
 error_setg_errno(errp, errno,
  "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
-ret = -errno;
 goto listener_unregister_exit;
 }
 
@@ -527,13 +525,13 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
   0x1000);
 }
 
-return 0;
+return true;
 
 listener_unregister_exit:
 if (v2) {
 memory_listener_unregister(&scontainer->prereg_listener);
 }
-return ret;
+return false;
 }
 
 static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
-- 
2.34.1




[PATCH 3/3] vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool

2024-05-06 Thread Zhenzhong Duan
Make VFIOIOMMUClass::add_window() and its wrapper function
vfio_container_add_section_window() return bool.

This is to follow the coding standard to return bool if 'Error **'
is used to pass an error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-container-base.h | 12 ++--
 hw/vfio/common.c  |  2 +-
 hw/vfio/container-base.c  |  8 
 hw/vfio/spapr.c   | 16 
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 68539e3bed..e96cda78c8 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -76,9 +76,9 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
 int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
  IOMMUTLBEntry *iotlb);
-int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp);
+bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp);
 void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
@@ -131,9 +131,9 @@ struct VFIOIOMMUClass {
 int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
 
 /* SPAPR specific */
-int (*add_window)(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp);
+bool (*add_window)(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp);
 void (*del_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 void (*release)(VFIOContainerBase *bcontainer);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 890d30910e..9f1f2e19f7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -585,7 +585,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 return;
 }
 
-if (vfio_container_add_section_window(bcontainer, section, &err)) {
+if (!vfio_container_add_section_window(bcontainer, section, &err)) {
 goto fail;
 }
 
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 913ae49077..98d71b3144 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -31,12 +31,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
 return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
 }
 
-int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp)
+bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp)
 {
 if (!bcontainer->ops->add_window) {
-return 0;
+return true;
 }
 
 return bcontainer->ops->add_window(bcontainer, section, errp);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 148b257c9c..47b040f1bc 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -323,7 +323,7 @@ static int vfio_spapr_create_window(VFIOContainer 
*container,
 return 0;
 }
 
-static int
+static bool
 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
 MemoryRegionSection *section,
 Error **errp)
@@ -351,13 +351,13 @@ vfio_spapr_container_add_section_window(VFIOContainerBase 
*bcontainer,
 error_setg(errp, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
iova, end);
-return -EINVAL;
+return false;
 }
-return 0;
+return true;
 }
 
 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
-return 0;
+return true;
 }
 
 /* For now intersections are not allowed, we may relax this later */
@@ -373,14 +373,14 @@ vfio_spapr_container_add_section_window(VFIOContainerBase 
*bcontainer,
 section->offset_within_address_space +
 int128_get64(section->size) - 1,
 hostwin->min_iova, hostwin->max_iova);
-return -EINVAL;
+return false;
 }
 }
 
 ret = vfio_spapr_create_window(container, section, &

[PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Zhenzhong Duan
Make VFIOIOMMUClass::attach_device() and its wrapper function
vfio_attach_device() return bool.

This is to follow the coding standard to return bool if 'Error **'
is used to pass an error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  4 ++--
 include/hw/vfio/vfio-container-base.h |  4 ++--
 hw/vfio/ap.c  |  6 ++
 hw/vfio/ccw.c |  6 ++
 hw/vfio/common.c  |  4 ++--
 hw/vfio/container.c   | 14 +++---
 hw/vfio/iommufd.c | 11 +--
 hw/vfio/pci.c |  8 +++-
 hw/vfio/platform.c|  7 +++
 9 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..a7b6fc8f46 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 struct vfio_device_info *vfio_get_device_info(int fd);
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp);
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
 
 int vfio_kvm_device_add_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 3582d5f97a..c839cfd9cb 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
 int (*dma_unmap)(const VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
  IOMMUTLBEntry *iotlb);
-int (*attach_device)(const char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp);
+bool (*attach_device)(const char *name, VFIODevice *vbasedev,
+  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
 /* migration feature */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 7c4caa5938..d50600b702 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -156,7 +156,6 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice 
*vapdev,
 static void vfio_ap_realize(DeviceState *dev, Error **errp)
 {
 ERRP_GUARD();
-int ret;
 Error *err = NULL;
 VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
 VFIODevice *vbasedev = &vapdev->vdev;
@@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-ret = vfio_attach_device(vbasedev->name, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(vbasedev->name, vbasedev,
+&address_space_memory, errp)) {
 goto error;
 }
 
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 90e4a53437..782bd4bed7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
 VFIODevice *vbasedev = &vcdev->vdev;
 Error *err = NULL;
-int ret;
 
 /* Call the class init function for subchannel. */
 if (cdc->realize) {
@@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-ret = vfio_attach_device(cdev->mdevid, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(cdev->mdevid, vbasedev,
+&address_space_memory, errp)) {
 goto out_attach_dev_err;
 }
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8f9cbdc026..890d30910e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1492,8 +1492,8 @@ retry:
 return info;
 }
 
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp)
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp)
 {
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..ea3b145913 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -908,8 +908,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error 
**errp)
  * @name and @vbasedev->name are likely to be different depending
  * on the type of the device, hence the need for passing @name
  */
-static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
-   

[PATCH v2 00/11] VFIO: misc cleanups

2024-05-06 Thread Zhenzhong Duan
Hi

This is a cleanup series that changes functions in hw/vfio/ to return bool
when the error is passed through the errp parameter, and also does some
cleanup with g_autofree.

See discussion at 
https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg04782.html

This series processes the following files:
hw/vfio/container.c
hw/vfio/iommufd.c
hw/vfio/cpr.c
backends/iommufd.c

The files above are now clean; other files in hw/vfio still need
processing.

Test done on x86 platform:
vfio device hotplug/unplug with different backend
reboot

Thanks
Zhenzhong

Changelog:
v2:
- split out g_autofree code as a patch (Cédric)
- add processing for more files

Zhenzhong Duan (11):
  vfio/pci: Use g_autofree in vfio_realize
  vfio/pci: Use g_autofree in iommufd_cdev_get_info_iova_range()
  vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool
  vfio: Make VFIOIOMMUClass::setup() return bool
  vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool
  vfio/container: Make vfio_connect_container() return bool
  vfio/container: Make vfio_set_iommu() return bool
  vfio/container: Make vfio_get_device() return bool
  vfio/iommufd: Make iommufd_cdev_*() return bool
  vfio/cpr: Make vfio_cpr_register_container() return bool
  backends/iommufd: Make iommufd_backend_*() return bool

 include/hw/vfio/vfio-common.h |   6 +-
 include/hw/vfio/vfio-container-base.h |  18 ++---
 include/sysemu/iommufd.h  |   6 +-
 backends/iommufd.c|  29 +++
 hw/vfio/ap.c  |   6 +-
 hw/vfio/ccw.c |   6 +-
 hw/vfio/common.c  |   6 +-
 hw/vfio/container-base.c  |   8 +-
 hw/vfio/container.c   |  81 +--
 hw/vfio/cpr.c |   4 +-
 hw/vfio/iommufd.c | 109 +++---
 hw/vfio/pci.c |  12 ++-
 hw/vfio/platform.c|   7 +-
 hw/vfio/spapr.c   |  28 +++
 backends/trace-events |   4 +-
 15 files changed, 147 insertions(+), 183 deletions(-)

-- 
2.34.1




[PATCH v2 03/11] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Zhenzhong Duan
Make VFIOIOMMUClass::attach_device() and its wrapper function
vfio_attach_device() return bool.

This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  4 ++--
 include/hw/vfio/vfio-container-base.h |  4 ++--
 hw/vfio/ap.c  |  6 ++
 hw/vfio/ccw.c |  6 ++
 hw/vfio/common.c  |  4 ++--
 hw/vfio/container.c   | 14 +++---
 hw/vfio/iommufd.c | 11 +--
 hw/vfio/pci.c |  5 ++---
 hw/vfio/platform.c|  7 +++
 9 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..a7b6fc8f46 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 struct vfio_device_info *vfio_get_device_info(int fd);
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp);
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
 
 int vfio_kvm_device_add_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 3582d5f97a..c839cfd9cb 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
 int (*dma_unmap)(const VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
  IOMMUTLBEntry *iotlb);
-int (*attach_device)(const char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp);
+bool (*attach_device)(const char *name, VFIODevice *vbasedev,
+  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
 /* migration feature */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 7c4caa5938..d50600b702 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -156,7 +156,6 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice 
*vapdev,
 static void vfio_ap_realize(DeviceState *dev, Error **errp)
 {
 ERRP_GUARD();
-int ret;
 Error *err = NULL;
 VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
 VFIODevice *vbasedev = &vapdev->vdev;
@@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-ret = vfio_attach_device(vbasedev->name, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(vbasedev->name, vbasedev,
+&address_space_memory, errp)) {
 goto error;
 }
 
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 90e4a53437..782bd4bed7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
 VFIODevice *vbasedev = &vcdev->vdev;
 Error *err = NULL;
-int ret;
 
 /* Call the class init function for subchannel. */
 if (cdc->realize) {
@@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-ret = vfio_attach_device(cdev->mdevid, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(cdev->mdevid, vbasedev,
+&address_space_memory, errp)) {
 goto out_attach_dev_err;
 }
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8f9cbdc026..890d30910e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1492,8 +1492,8 @@ retry:
 return info;
 }
 
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp)
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp)
 {
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..ea3b145913 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -908,8 +908,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error 
**errp)
  * @name and @vbasedev->name are likely to be different depending
  * on the type of the device, hence the need for passing @name
  */
-static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
-   

[PATCH v2 04/11] vfio: Make VFIOIOMMUClass::setup() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-container-base.h |  2 +-
 hw/vfio/container.c   | 10 +-
 hw/vfio/spapr.c   | 12 +---
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index c839cfd9cb..68539e3bed 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -111,7 +111,7 @@ struct VFIOIOMMUClass {
 InterfaceClass parent_class;
 
 /* basic feature */
-int (*setup)(VFIOContainerBase *bcontainer, Error **errp);
+bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ea3b145913..85a8a369dc 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -505,7 +505,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer 
*container,
 }
 }
 
-static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
+static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
 {
 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 bcontainer);
@@ -515,7 +515,7 @@ static int vfio_legacy_setup(VFIOContainerBase *bcontainer, 
Error **errp)
 ret = vfio_get_iommu_info(container, &info);
 if (ret) {
 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
-return ret;
+return false;
 }
 
 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
@@ -531,7 +531,7 @@ static int vfio_legacy_setup(VFIOContainerBase *bcontainer, 
Error **errp)
 vfio_get_info_iova_range(info, bcontainer);
 
 vfio_get_iommu_info_migration(container, info);
-return 0;
+return true;
 }
 
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
@@ -633,8 +633,8 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 
 assert(bcontainer->ops->setup);
 
-ret = bcontainer->ops->setup(bcontainer, errp);
-if (ret) {
+if (!bcontainer->ops->setup(bcontainer, errp)) {
+ret = -EINVAL;
 goto enable_discards_exit;
 }
 
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 0d949bb728..148b257c9c 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -458,8 +458,8 @@ static void vfio_spapr_container_release(VFIOContainerBase 
*bcontainer)
 }
 }
 
-static int vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
-  Error **errp)
+static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
+   Error **errp)
 {
 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 bcontainer);
@@ -480,7 +480,7 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
 if (ret) {
 error_setg_errno(errp, errno, "failed to enable container");
-return -errno;
+return false;
 }
 } else {
 scontainer->prereg_listener = vfio_prereg_listener;
@@ -488,7 +488,6 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 memory_listener_register(&scontainer->prereg_listener,
  &address_space_memory);
 if (bcontainer->error) {
-ret = -1;
 error_propagate_prepend(errp, bcontainer->error,
 "RAM memory listener initialization failed: ");
 goto listener_unregister_exit;
@@ -500,7 +499,6 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 if (ret) {
 error_setg_errno(errp, errno,
  "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
-ret = -errno;
 goto listener_unregister_exit;
 }
 
@@ -527,13 +525,13 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
   0x1000);
 }
 
-return 0;
+return true;
 
 listener_unregister_exit:
 if (v2) {
 memory_listener_unregister(&scontainer->prereg_listener);
 }
-return ret;
+return false;
 }
 
 static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
-- 
2.34.1




[PATCH v2 02/11] vfio/pci: Use g_autofree in iommufd_cdev_get_info_iova_range()

2024-05-06 Thread Zhenzhong Duan
The local pointer 'info' is freed before returning from
iommufd_cdev_get_info_iova_range().

Use 'g_autofree' to avoid the g_free() calls.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 8827ffe636..c644127972 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -258,7 +258,7 @@ static int 
iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container,
 uint32_t ioas_id, Error **errp)
 {
 VFIOContainerBase *bcontainer = &container->bcontainer;
-struct iommu_ioas_iova_ranges *info;
+g_autofree struct iommu_ioas_iova_ranges *info = NULL;
 struct iommu_iova_range *iova_ranges;
 int ret, sz, fd = container->be->fd;
 
@@ -291,12 +291,10 @@ static int 
iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container,
 }
 bcontainer->pgsizes = info->out_iova_alignment;
 
-g_free(info);
 return 0;
 
 error:
 ret = -errno;
-g_free(info);
 error_setg_errno(errp, errno, "Cannot get IOVA ranges");
 return ret;
 }
-- 
2.34.1




[PATCH v2 01/11] vfio/pci: Use g_autofree in vfio_realize

2024-05-06 Thread Zhenzhong Duan
Local pointer name is allocated before vfio_attach_device() call
and freed after the call.

Same for tmp when calling realpath().

Use 'g_autofree' to avoid the g_free() calls.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64780d1b79..576b21e2bb 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2946,12 +2946,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 ERRP_GUARD();
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
 VFIODevice *vbasedev = &vdev->vbasedev;
-char *tmp, *subsys;
+char *subsys;
 Error *err = NULL;
 int i, ret;
 bool is_mdev;
 char uuid[UUID_STR_LEN];
-char *name;
+g_autofree char *name = NULL;
+g_autofree char *tmp = NULL;
 
 if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
 if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -2982,7 +2983,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  */
 tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
 subsys = realpath(tmp, NULL);
-g_free(tmp);
 is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
 free(subsys);
 
@@ -3003,7 +3003,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 ret = vfio_attach_device(name, vbasedev,
  pci_device_iommu_address_space(pdev), errp);
-g_free(name);
 if (ret) {
 goto error;
 }
-- 
2.34.1




[PATCH v2 10/11] vfio/cpr: Make vfio_cpr_register_container() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 2 +-
 hw/vfio/container.c   | 3 +--
 hw/vfio/cpr.c | 4 ++--
 hw/vfio/iommufd.c | 3 +--
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index a7b6fc8f46..e4c60374fa 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -205,7 +205,7 @@ void vfio_detach_device(VFIODevice *vbasedev);
 int vfio_kvm_device_add_fd(int fd, Error **errp);
 int vfio_kvm_device_del_fd(int fd, Error **errp);
 
-int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
+bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
 void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer);
 
 extern const MemoryRegionOps vfio_region_ops;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index b02583ea16..86266f3b83 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -616,8 +616,7 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 goto free_container_exit;
 }
 
-ret = vfio_cpr_register_container(bcontainer, errp);
-if (ret) {
+if (!vfio_cpr_register_container(bcontainer, errp)) {
 goto free_container_exit;
 }
 
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 392c2dd95d..87e51fcee1 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -25,12 +25,12 @@ static int vfio_cpr_reboot_notifier(NotifierWithReturn 
*notifier,
 return 0;
 }
 
-int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)
+bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)
 {
 migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
 vfio_cpr_reboot_notifier,
 MIG_MODE_CPR_REBOOT);
-return 0;
+return true;
 }
 
 void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 84c86b970e..6a446b16dc 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -396,8 +396,7 @@ found_container:
 goto err_listener_register;
 }
 
-ret = vfio_cpr_register_container(bcontainer, errp);
-if (ret) {
+if (!vfio_cpr_register_container(bcontainer, errp)) {
 goto err_listener_register;
 }
 
-- 
2.34.1




[PATCH v2 06/11] vfio/container: Make vfio_connect_container() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 18 +++---
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 85a8a369dc..0a7edfcc43 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -534,8 +534,8 @@ static bool vfio_legacy_setup(VFIOContainerBase 
*bcontainer, Error **errp)
 return true;
 }
 
-static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
-  Error **errp)
+static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+   Error **errp)
 {
 VFIOContainer *container;
 VFIOContainerBase *bcontainer;
@@ -587,19 +587,18 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 error_report("vfio: error disconnecting group %d from"
  " container", group->groupid);
 }
-return ret;
+return false;
 }
 group->container = container;
 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
 vfio_kvm_device_add_group(group);
-return 0;
+return true;
 }
 }
 
 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
 if (fd < 0) {
 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
-ret = -errno;
 goto put_space_exit;
 }
 
@@ -607,7 +606,6 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 if (ret != VFIO_API_VERSION) {
 error_setg(errp, "supported vfio version: %d, "
"reported version: %d", VFIO_API_VERSION, ret);
-ret = -EINVAL;
 goto close_fd_exit;
 }
 
@@ -634,7 +632,6 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 assert(bcontainer->ops->setup);
 
 if (!bcontainer->ops->setup(bcontainer, errp)) {
-ret = -EINVAL;
 goto enable_discards_exit;
 }
 
@@ -650,7 +647,6 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 memory_listener_register(&bcontainer->listener, bcontainer->space->as);
 
 if (bcontainer->error) {
-ret = -1;
 error_propagate_prepend(errp, bcontainer->error,
 "memory listener initialization failed: ");
 goto listener_release_exit;
@@ -658,7 +654,7 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 
 bcontainer->initialized = true;
 
-return 0;
+return true;
 listener_release_exit:
 QLIST_REMOVE(group, container_next);
 QLIST_REMOVE(bcontainer, next);
@@ -683,7 +679,7 @@ close_fd_exit:
 put_space_exit:
 vfio_put_address_space(space);
 
-return ret;
+return false;
 }
 
 static void vfio_disconnect_container(VFIOGroup *group)
@@ -770,7 +766,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace 
*as, Error **errp)
 group->groupid = groupid;
 QLIST_INIT(&group->device_list);
 
-if (vfio_connect_container(group, as, errp)) {
+if (!vfio_connect_container(group, as, errp)) {
 error_prepend(errp, "failed to setup container for group %d: ",
   groupid);
 goto close_fd_exit;
-- 
2.34.1




[PATCH v2 07/11] vfio/container: Make vfio_set_iommu() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 0a7edfcc43..5fb4bee082 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -391,21 +391,20 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int 
iommu_type, Error **errp)
 return VFIO_IOMMU_CLASS(klass);
 }
 
-static int vfio_set_iommu(VFIOContainer *container, int group_fd,
-  VFIOAddressSpace *space, Error **errp)
+static bool vfio_set_iommu(VFIOContainer *container, int group_fd,
+   VFIOAddressSpace *space, Error **errp)
 {
-int iommu_type, ret;
+int iommu_type;
 const VFIOIOMMUClass *vioc;
 
 iommu_type = vfio_get_iommu_type(container, errp);
 if (iommu_type < 0) {
-return iommu_type;
+return false;
 }
 
-ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
-if (ret) {
+if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
 error_setg_errno(errp, errno, "Failed to set group container");
-return -errno;
+return false;
 }
 
 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
@@ -420,7 +419,7 @@ static int vfio_set_iommu(VFIOContainer *container, int 
group_fd,
 continue;
 }
 error_setg_errno(errp, errno, "Failed to set iommu for container");
-return -errno;
+return false;
 }
 
 container->iommu_type = iommu_type;
@@ -428,11 +427,11 @@ static int vfio_set_iommu(VFIOContainer *container, int 
group_fd,
 vioc = vfio_get_iommu_class(iommu_type, errp);
 if (!vioc) {
 error_setg(errp, "No available IOMMU models");
-return -EINVAL;
+return false;
 }
 
 vfio_container_init(&container->bcontainer, space, vioc);
-return 0;
+return true;
 }
 
 static int vfio_get_iommu_info(VFIOContainer *container,
@@ -613,8 +612,7 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 container->fd = fd;
 bcontainer = &container->bcontainer;
 
-ret = vfio_set_iommu(container, group->fd, space, errp);
-if (ret) {
+if (!vfio_set_iommu(container, group->fd, space, errp)) {
 goto free_container_exit;
 }
 
-- 
2.34.1




[PATCH v2 09/11] vfio/iommufd: Make iommufd_cdev_*() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

The changed functions include:

iommufd_cdev_kvm_device_add
iommufd_cdev_connect_and_bind
iommufd_cdev_attach_ioas_hwpt
iommufd_cdev_detach_ioas_hwpt
iommufd_cdev_attach_container
iommufd_cdev_get_info_iova_range

After the change, all functions in hw/vfio/iommufd.c follow the
standard.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 88 +--
 1 file changed, 39 insertions(+), 49 deletions(-)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 4c6992fca1..84c86b970e 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -49,9 +49,9 @@ static int iommufd_cdev_unmap(const VFIOContainerBase 
*bcontainer,
  container->ioas_id, iova, size);
 }
 
-static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp)
+static bool iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp)
 {
-return vfio_kvm_device_add_fd(vbasedev->fd, errp);
+return !vfio_kvm_device_add_fd(vbasedev->fd, errp);
 }
 
 static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev)
@@ -63,18 +63,16 @@ static void iommufd_cdev_kvm_device_del(VFIODevice 
*vbasedev)
 }
 }
 
-static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
+static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
 {
 IOMMUFDBackend *iommufd = vbasedev->iommufd;
 struct vfio_device_bind_iommufd bind = {
 .argsz = sizeof(bind),
 .flags = 0,
 };
-int ret;
 
-ret = iommufd_backend_connect(iommufd, errp);
-if (ret) {
-return ret;
+if (iommufd_backend_connect(iommufd, errp)) {
+return false;
 }
 
 /*
@@ -82,15 +80,13 @@ static int iommufd_cdev_connect_and_bind(VFIODevice 
*vbasedev, Error **errp)
  * in KVM. Especially for some emulated devices, it requires
  * to have kvm information in the device open.
  */
-ret = iommufd_cdev_kvm_device_add(vbasedev, errp);
-if (ret) {
+if (!iommufd_cdev_kvm_device_add(vbasedev, errp)) {
 goto err_kvm_device_add;
 }
 
 /* Bind device to iommufd */
 bind.iommufd = iommufd->fd;
-ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
-if (ret) {
+if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) {
 error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d",
  vbasedev->fd, bind.iommufd);
 goto err_bind;
@@ -99,12 +95,12 @@ static int iommufd_cdev_connect_and_bind(VFIODevice 
*vbasedev, Error **errp)
 vbasedev->devid = bind.out_devid;
 trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
 vbasedev->fd, vbasedev->devid);
-return ret;
+return true;
 err_bind:
 iommufd_cdev_kvm_device_del(vbasedev);
 err_kvm_device_add:
 iommufd_backend_disconnect(iommufd);
-return ret;
+return false;
 }
 
 static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev)
@@ -176,10 +172,10 @@ out:
 return ret;
 }
 
-static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
+static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
  Error **errp)
 {
-int ret, iommufd = vbasedev->iommufd->fd;
+int iommufd = vbasedev->iommufd->fd;
 struct vfio_device_attach_iommufd_pt attach_data = {
 .argsz = sizeof(attach_data),
 .flags = 0,
@@ -187,38 +183,38 @@ static int iommufd_cdev_attach_ioas_hwpt(VFIODevice 
*vbasedev, uint32_t id,
 };
 
 /* Attach device to an IOAS or hwpt within iommufd */
-ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data);
-if (ret) {
+if (ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data)) {
 error_setg_errno(errp, errno,
  "[iommufd=%d] error attach %s (%d) to id=%d",
  iommufd, vbasedev->name, vbasedev->fd, id);
-} else {
-trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name,
-vbasedev->fd, id);
+return false;
 }
-return ret;
+
+trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name,
+vbasedev->fd, id);
+return true;
 }
 
-static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
+static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
 {
-int ret, iommufd = vbasedev->iommufd->fd;
+int iommufd = vbasedev->iommufd->fd;
 struct vfio_device_detach_iommufd_pt detach_data = {
 .argsz = sizeof(detach_data),
 .flags = 0,
 };
 
-ret = ioctl(vbas

[PATCH v2 05/11] vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool

2024-05-06 Thread Zhenzhong Duan
Make VFIOIOMMUClass::add_window() and its wrapper function
vfio_container_add_section_window() return bool.

This is to follow the coding standard of returning bool if 'Error **'
is used to pass errors.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-container-base.h | 12 ++--
 hw/vfio/common.c  |  2 +-
 hw/vfio/container-base.c  |  8 
 hw/vfio/spapr.c   | 16 
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 68539e3bed..e96cda78c8 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -76,9 +76,9 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
 int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
  IOMMUTLBEntry *iotlb);
-int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp);
+bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp);
 void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
@@ -131,9 +131,9 @@ struct VFIOIOMMUClass {
 int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
 
 /* SPAPR specific */
-int (*add_window)(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp);
+bool (*add_window)(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp);
 void (*del_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 void (*release)(VFIOContainerBase *bcontainer);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 890d30910e..9f1f2e19f7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -585,7 +585,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 return;
 }
 
-if (vfio_container_add_section_window(bcontainer, section, &err)) {
+if (!vfio_container_add_section_window(bcontainer, section, &err)) {
 goto fail;
 }
 
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 913ae49077..98d71b3144 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -31,12 +31,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
 return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
 }
 
-int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp)
+bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp)
 {
 if (!bcontainer->ops->add_window) {
-return 0;
+return true;
 }
 
 return bcontainer->ops->add_window(bcontainer, section, errp);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 148b257c9c..47b040f1bc 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -323,7 +323,7 @@ static int vfio_spapr_create_window(VFIOContainer 
*container,
 return 0;
 }
 
-static int
+static bool
 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
 MemoryRegionSection *section,
 Error **errp)
@@ -351,13 +351,13 @@ vfio_spapr_container_add_section_window(VFIOContainerBase 
*bcontainer,
 error_setg(errp, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
iova, end);
-return -EINVAL;
+return false;
 }
-return 0;
+return true;
 }
 
 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
-return 0;
+return true;
 }
 
 /* For now intersections are not allowed, we may relax this later */
@@ -373,14 +373,14 @@ vfio_spapr_container_add_section_window(VFIOContainerBase 
*bcontainer,
 section->offset_within_address_space +
 int128_get64(section->size) - 1,
 hostwin->min_iova, hostwin->max_iova);
-return -EINVAL;
+return false;
 }
 }
 
 ret = vfio_spapr_crea

  1   2   3   4   5   6   7   8   9   10   >