Hi Eric, >-----Original Message----- >From: Eric Auger <eric.au...@redhat.com> >Sent: Wednesday, November 15, 2023 8:53 PM >Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object > >Hi Zhenzhong, > >On 11/14/23 11:09, Zhenzhong Duan wrote: >> From: Eric Auger <eric.au...@redhat.com> >> >> Introduce an iommufd object which allows the interaction >> with the host /dev/iommu device. >> >> The /dev/iommu can have been already pre-opened outside of qemu, >> in which case the fd can be passed directly along with the >> iommufd object: >> >> This allows the iommufd object to be shared accross several >> subsystems (VFIO, VDPA, ...). For example, libvirt would open >> the /dev/iommu once. >> >> If no fd is passed along with the iommufd object, the /dev/iommu >> is opened by the qemu code. >> >> Suggested-by: Alex Williamson <alex.william...@redhat.com> >> Signed-off-by: Eric Auger <eric.au...@redhat.com> >> Signed-off-by: Yi Liu <yi.l....@intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com> >> --- >> v6: remove redundant call, alloc_hwpt, get/put_ioas >> >> MAINTAINERS | 7 ++ >> qapi/qom.json | 19 ++++ >> include/sysemu/iommufd.h | 44 ++++++++ >> backends/iommufd.c | 228 +++++++++++++++++++++++++++++++++++++++ >> backends/Kconfig | 4 + >> backends/meson.build | 1 + >> backends/trace-events | 10 ++ >> qemu-options.hx | 12 +++ >> 8 files changed, 325 insertions(+) >> create mode 100644 include/sysemu/iommufd.h >> create mode 100644 backends/iommufd.c >> >> diff --git a/MAINTAINERS b/MAINTAINERS >> index ff1238bb98..a4891f7bda 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c >> F: docs/system/s390x/vfio-ap.rst >> L: qemu-s3...@nongnu.org >> >> +iommufd >> +M: Yi Liu <yi.l....@intel.com> >> +M: Eric Auger <eric.au...@redhat.com> >Zhenzhong, don't you want to be added here? >> +S: Supported >> +F: backends/iommufd.c >> +F: include/sysemu/iommufd.h >> + >> vhost >> M: Michael S. Tsirkin <m...@redhat.com> >> S: Supported >> diff --git a/qapi/qom.json b/qapi/qom.json >> index c53ef978ff..1fd8555a75 100644 >> --- a/qapi/qom.json >> +++ b/qapi/qom.json >> @@ -794,6 +794,23 @@ >> { 'struct': 'VfioUserServerProperties', >> 'data': { 'socket': 'SocketAddress', 'device': 'str' } } >> >> +## >> +# @IOMMUFDProperties: >> +# >> +# Properties for iommufd objects. >> +# >> +# @fd: file descriptor name previously passed via 'getfd' command, > >"previously passed via 'getfd' command", I wonder if this applies here or >whether >it is copy/paste of >RemoteObjectProperties.fd doc?
Maybe copied😊 I thinks this applies here because I use monitor_fd_param to get fd. Or I miss anything? > >> +# which represents a pre-opened /dev/iommu. This allows the >> +# iommufd object to be shared accross several subsystems >> +# (VFIO, VDPA, ...), and the file descriptor to be shared >> +# with other process, e.g. DPDK. (default: QEMU opens >> +# /dev/iommu by itself) >> +# >> +# Since: 8.2 >> +## >> +{ 'struct': 'IOMMUFDProperties', >> + 'data': { '*fd': 'str' } } >> + >> ## >> # @RngProperties: >> # >> @@ -934,6 +951,7 @@ >> 'input-barrier', >> { 'name': 'input-linux', >> 'if': 'CONFIG_LINUX' }, >> + 'iommufd', >> 'iothread', >> 'main-loop', >> { 'name': 'memory-backend-epc', >> @@ -1003,6 +1021,7 @@ >> 'input-barrier': 'InputBarrierProperties', >> 'input-linux': { 'type': 'InputLinuxProperties', >> 'if': 'CONFIG_LINUX' }, >> + 'iommufd': 'IOMMUFDProperties', >> 'iothread': 'IothreadProperties', >> 'main-loop': 'MainLoopProperties', >> 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', >> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h >> new file mode 100644 >> index 0000000000..9b3a86f57d >> --- /dev/null >> +++ b/include/sysemu/iommufd.h >> @@ -0,0 +1,44 @@ >> +#ifndef SYSEMU_IOMMUFD_H >> +#define SYSEMU_IOMMUFD_H >> + >> +#include "qom/object.h" >> +#include "qemu/thread.h" >> +#include "exec/hwaddr.h" >> +#include "exec/cpu-common.h" >> + >> +#define TYPE_IOMMUFD_BACKEND "iommufd" >> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, >> + IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND(obj) \ >> + OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \ >> + OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj), >TYPE_IOMMUFD_BACKEND) >> +#define IOMMUFD_BACKEND_CLASS(klass) \ >> + OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass), >TYPE_IOMMUFD_BACKEND) >> +struct IOMMUFDBackendClass { >> + ObjectClass parent_class; >> +}; >> + >> +struct IOMMUFDBackend { >> + Object parent; >> + >> + /*< protected >*/ >> + int fd; /* /dev/iommu file descriptor */ >> + bool owned; /* is the /dev/iommu opened internally */ >> + QemuMutex lock; >> + uint32_t users; >> + >> + /*< public >*/ >> +}; >> + >> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); >> +void iommufd_backend_disconnect(IOMMUFDBackend *be); >> + >> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >> + Error **errp); >> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly); >> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> + hwaddr iova, ram_addr_t size); >> +#endif >> diff --git a/backends/iommufd.c b/backends/iommufd.c >> new file mode 100644 >> index 0000000000..ea3e2a8f85 >> --- /dev/null >> +++ b/backends/iommufd.c >> @@ -0,0 +1,228 @@ >> +/* >> + * iommufd container backend >> + * >> + * Copyright (C) 2023 Intel Corporation. >> + * Copyright Red Hat, Inc. 2023 >> + * >> + * Authors: Yi Liu <yi.l....@intel.com> >> + * Eric Auger <eric.au...@redhat.com> >> + * >> + * SPDX-License-Identifier: GPL-2.0-or-later >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "sysemu/iommufd.h" >> +#include "qapi/error.h" >> +#include "qapi/qmp/qerror.h" >> +#include "qemu/module.h" >> +#include "qom/object_interfaces.h" >> +#include "qemu/error-report.h" >> +#include "monitor/monitor.h" >> +#include "trace.h" >> +#include <sys/ioctl.h> >> +#include <linux/iommufd.h> >> + >> +static void iommufd_backend_init(Object *obj) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + >> + be->fd = -1; >> + be->users = 0; >> + be->owned = true; >> + qemu_mutex_init(&be->lock); >> +} >> + >> +static void iommufd_backend_finalize(Object *obj) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + >> + if (be->owned) { >> + close(be->fd); >> + be->fd = -1; >> + } >> +} >> + >> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error >> **errp) >> +{ >> + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); >> + int fd = -1; >> + >> + fd = monitor_fd_param(monitor_cur(), str, errp); >> + if (fd == -1) { >> + error_prepend(errp, "Could not parse remote object fd %s:", str); >> + return; >> + } >> + qemu_mutex_lock(&be->lock); >> + be->fd = fd; >> + be->owned = false; >> + qemu_mutex_unlock(&be->lock); >> + trace_iommu_backend_set_fd(be->fd); >> +} >> + >> +static void iommufd_backend_class_init(ObjectClass *oc, void *data) >> +{ >> + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); >> +} >> + >> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) >> +{ >> + int fd, ret = 0; >> + >> + qemu_mutex_lock(&be->lock); >> + if (be->users == UINT32_MAX) { >> + error_setg(errp, "too many connections"); >> + ret = -E2BIG; >> + goto out; >> + } >> + if (be->owned && !be->users) { >> + fd = qemu_open_old("/dev/iommu", O_RDWR); >> + if (fd < 0) { >> + error_setg_errno(errp, errno, "/dev/iommu opening failed"); >> + ret = fd; >> + goto out; >> + } >> + be->fd = fd; >> + } >> + be->users++; >> +out: >> + trace_iommufd_backend_connect(be->fd, be->owned, >> + be->users, ret); >> + qemu_mutex_unlock(&be->lock); >> + return ret; >> +} >> + >> +void iommufd_backend_disconnect(IOMMUFDBackend *be) >> +{ >> + qemu_mutex_lock(&be->lock); >> + if (!be->users) { >> + goto out; >> + } >> + be->users--; >> + if (!be->users && be->owned) { >> + close(be->fd); >> + be->fd = -1; >> + } >> +out: >> + trace_iommufd_backend_disconnect(be->fd, be->users); >> + qemu_mutex_unlock(&be->lock); >> +} >> + >> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, >> + Error **errp) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_alloc alloc_data = { >> + .size = sizeof(alloc_data), >> + .flags = 0, >> + }; >> + >> + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); >> + if (ret) { >> + error_setg_errno(errp, errno, "Failed to allocate ioas"); >> + return ret; >> + } >> + >> + *ioas_id = alloc_data.out_ioas_id; >> + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); >> + >> + return ret; >> +} >> + >> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_destroy des = { >> + .size = sizeof(des), >> + .id = id, >> + }; >> + >> + ret = ioctl(fd, IOMMU_DESTROY, &des); >> + trace_iommufd_backend_free_id(fd, id, ret); >> + if (ret) { >> + error_report("Failed to free id: %u %m", id); >> + } >> +} >> + >> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, >hwaddr iova, >> + ram_addr_t size, void *vaddr, bool readonly) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_map map = { >> + .size = sizeof(map), >> + .flags = IOMMU_IOAS_MAP_READABLE | >> + IOMMU_IOAS_MAP_FIXED_IOVA, >> + .ioas_id = ioas_id, >> + .__reserved = 0, >> + .user_va = (uintptr_t)vaddr, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + if (!readonly) { >> + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; >> + } >> + >> + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); >> + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, >> + vaddr, readonly, ret); >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_MAP failed: %m"); >> + } >> + return ret; >> +} >> + >> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, >> + hwaddr iova, ram_addr_t size) >> +{ >> + int ret, fd = be->fd; >> + struct iommu_ioas_unmap unmap = { >> + .size = sizeof(unmap), >> + .ioas_id = ioas_id, >> + .iova = iova, >> + .length = size, >> + }; >> + >> + ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap); >> + /* >> + * IOMMUFD takes mapping as some kind of object, unmapping >> + * nonexistent mapping is treated as deleting a nonexistent >> + * object and return ENOENT. This is different from legacy >> + * backend which allows it. vIOMMU may trigger a lot of >> + * redundant unmapping, to avoid flush the log, treat them >> + * as succeess for IOMMUFD just like legacy backend. >> + */ >> + if (ret && errno == ENOENT) { >> + trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, >ret); >> + ret = 0; >> + } else { >> + trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret); >> + } >> + >> + if (ret) { >> + ret = -errno; >> + error_report("IOMMU_IOAS_UNMAP failed: %m"); >> + } >> + return ret; >> +} >> + >> +static const TypeInfo iommufd_backend_info = { >> + .name = TYPE_IOMMUFD_BACKEND, >> + .parent = TYPE_OBJECT, >> + .instance_size = sizeof(IOMMUFDBackend), >> + .instance_init = iommufd_backend_init, >> + .instance_finalize = iommufd_backend_finalize, >> + .class_size = sizeof(IOMMUFDBackendClass), >> + .class_init = iommufd_backend_class_init, >> + .interfaces = (InterfaceInfo[]) { >> + { TYPE_USER_CREATABLE }, >> + { } >> + } >> +}; >> + >> +static void register_types(void) >> +{ >> + type_register_static(&iommufd_backend_info); >> +} >> + >> +type_init(register_types); >> diff --git a/backends/Kconfig b/backends/Kconfig >> index f35abc1609..2cb23f62fa 100644 >> --- a/backends/Kconfig >> +++ b/backends/Kconfig >> @@ -1 +1,5 @@ >> source tpm/Kconfig >> + >> +config IOMMUFD >> + bool >> + depends on VFIO >I don't know the state of vDPA/iommufd integration but this extra might >be added in short term. Thanks for reminder. But I think it make more sense that series relax the check itself? E.g. depends on VFIO || VHOST_VDPA Thanks Zhenzhong