Introduce a new optional PCIIOMMUOps callback, get_viommu_cap(), which allows retrieving the capabilities exposed by a vIOMMU. The first planned capability is VIOMMU_CAP_HW_NESTED, which advertises support for the HW nested stage translation scheme. pci_device_get_viommu_cap() is a wrapper that can be called on a PCI device potentially protected by a vIOMMU.
get_viommu_cap() is designed to return a 64bit bitmap of purely emulated capabilities which are only determined by the user's configuration, with no host capabilities involved. Reasons are: 1. there can be more than one host IOMMU with different capabilities 2. there can also be more than one vIOMMU with different user configuration, e.g., arm smmuv3. 3. This is migration friendly, the return value is consistent between source and target. 4. It's too late for VFIO to call get_viommu_cap() after set_iommu_device() because we need get_viommu_cap() to determine whether to create a nested parent hwpt at the attach stage, meanwhile hiod realize needs iommufd, devid and hwpt_id which are ready after attach_device(). See below sequence: attach_device() get_viommu_cap() create hwpt ... create hiod set_iommu_device(hiod) Suggested-by: Yi Liu <yi.l....@intel.com> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com> --- MAINTAINERS | 1 + include/hw/iommu.h | 17 +++++++++++++++++ include/hw/pci/pci.h | 25 +++++++++++++++++++++++++ hw/pci/pci.c | 11 +++++++++++ 4 files changed, 54 insertions(+) create mode 100644 include/hw/iommu.h diff --git a/MAINTAINERS b/MAINTAINERS index 37879ab64e..840cb1e604 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2304,6 +2304,7 @@ F: include/system/iommufd.h F: backends/host_iommu_device.c F: include/system/host_iommu_device.h F: include/qemu/chardev_open.h +F: include/hw/iommu.h F: util/chardev_open.c F: docs/devel/vfio-iommufd.rst diff --git a/include/hw/iommu.h b/include/hw/iommu.h new file mode 100644 index 0000000000..021db50db5 --- /dev/null +++ b/include/hw/iommu.h @@ -0,0 +1,17 @@ +/* + * General vIOMMU capabilities, flags, etc + * + * Copyright (C) 2025 Intel Corporation. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_IOMMU_H +#define HW_IOMMU_H + +enum { + /* hardware nested stage-1 page table support */ + VIOMMU_CAP_HW_NESTED = BIT_ULL(0), +}; + +#endif /* HW_IOMMU_H */ diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 6b7d3ac8a3..d89aefc030 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -462,6 +462,21 @@ typedef struct PCIIOMMUOps { * @devfn: device and function number of the PCI device. */ void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); + /** + * @get_viommu_cap: get vIOMMU capabilities + * + * Optional callback, if not implemented, then vIOMMU doesn't + * support exposing capabilities to other subsystems, e.g., VFIO. + * vIOMMU can choose which capabilities to expose. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * Returns: 64bit bitmap with each bit representing a capability emulated by + * VIOMMU_CAP_* in include/hw/iommu.h, these capabilities are theoretical + * which are only determined by user's configuration and independent of the + * actual host capabilities they may depend on. + */ + uint64_t (*get_viommu_cap)(void *opaque); /** * @get_iotlb_info: get properties required to initialize a device IOTLB. * @@ -642,6 +657,16 @@ bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, Error **errp); void pci_device_unset_iommu_device(PCIDevice *dev); +/** + * pci_device_get_viommu_cap: get vIOMMU capabilities. + * + * Returns a 64bit bitmap with each bit representing a vIOMMU exposed + * capability, 0 if vIOMMU doesn't support exposing capabilities. + * + * @dev: PCI device pointer. + */ +uint64_t pci_device_get_viommu_cap(PCIDevice *dev); + /** * pci_iommu_get_iotlb_info: get properties required to initialize a * device IOTLB. 
diff --git a/hw/pci/pci.c b/hw/pci/pci.c index c70b5ceeba..df1fb615a8 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2992,6 +2992,17 @@ void pci_device_unset_iommu_device(PCIDevice *dev) } } +uint64_t pci_device_get_viommu_cap(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->get_viommu_cap) { + return iommu_bus->iommu_ops->get_viommu_cap(iommu_bus->iommu_opaque); + } + return 0; +} + int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req, bool exec_req, hwaddr addr, bool lpig, uint16_t prgi, bool is_read, bool is_write) -- 2.47.1