Hi Zhenzhong,

On 11/16/23 05:04, Duan, Zhenzhong wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Eric Auger <eric.au...@redhat.com>
>> Sent: Wednesday, November 15, 2023 8:53 PM
>> Subject: Re: [PATCH v6 01/21] backends/iommufd: Introduce the iommufd object
>>
>> Hi Zhenzhong,
>>
>> On 11/14/23 11:09, Zhenzhong Duan wrote:
>>> From: Eric Auger <eric.au...@redhat.com>
>>>
>>> Introduce an iommufd object which allows the interaction
>>> with the host /dev/iommu device.
>>>
>>> The /dev/iommu can have been already pre-opened outside of qemu,
>>> in which case the fd can be passed directly along with the
>>> iommufd object:
>>>
>>> This allows the iommufd object to be shared accross several
>>> subsystems (VFIO, VDPA, ...). For example, libvirt would open
>>> the /dev/iommu once.
>>>
>>> If no fd is passed along with the iommufd object, the /dev/iommu
>>> is opened by the qemu code.
>>>
>>> Suggested-by: Alex Williamson <alex.william...@redhat.com>
>>> Signed-off-by: Eric Auger <eric.au...@redhat.com>
>>> Signed-off-by: Yi Liu <yi.l....@intel.com>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
>>> ---
>>> v6: remove redundant call, alloc_hwpt, get/put_ioas
>>>
>>>  MAINTAINERS              |   7 ++
>>>  qapi/qom.json            |  19 ++++
>>>  include/sysemu/iommufd.h |  44 ++++++++
>>>  backends/iommufd.c       | 228 +++++++++++++++++++++++++++++++++++++++
>>>  backends/Kconfig         |   4 +
>>>  backends/meson.build     |   1 +
>>>  backends/trace-events    |  10 ++
>>>  qemu-options.hx          |  12 +++
>>>  8 files changed, 325 insertions(+)
>>>  create mode 100644 include/sysemu/iommufd.h
>>>  create mode 100644 backends/iommufd.c
>>>
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index ff1238bb98..a4891f7bda 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -2166,6 +2166,13 @@ F: hw/vfio/ap.c
>>>  F: docs/system/s390x/vfio-ap.rst
>>>  L: qemu-s3...@nongnu.org
>>>
>>> +iommufd
>>> +M: Yi Liu <yi.l....@intel.com>
>>> +M: Eric Auger <eric.au...@redhat.com>
>> Zhenzhong, don't you want to be added here?
>>> +S: Supported
>>> +F: backends/iommufd.c
>>> +F: include/sysemu/iommufd.h
>>> +
>>>  vhost
>>>  M: Michael S. Tsirkin <m...@redhat.com>
>>>  S: Supported
>>> diff --git a/qapi/qom.json b/qapi/qom.json
>>> index c53ef978ff..1fd8555a75 100644
>>> --- a/qapi/qom.json
>>> +++ b/qapi/qom.json
>>> @@ -794,6 +794,23 @@
>>>  { 'struct': 'VfioUserServerProperties',
>>>    'data': { 'socket': 'SocketAddress', 'device': 'str' } }
>>>
>>> +##
>>> +# @IOMMUFDProperties:
>>> +#
>>> +# Properties for iommufd objects.
>>> +#
>>> +# @fd: file descriptor name previously passed via 'getfd' command,
>> "previously passed via 'getfd' command", I wonder if this applies here or 
>> whether
>> it is copy/paste of
>> RemoteObjectProperties.fd doc?
> Maybe copied😊 I thinks this applies here because I use monitor_fd_param to 
> get fd.
> Or I miss anything?
This is a bit cryptic to me and I don't really understand what it means.
It does not mean it is not correct but I am curious about explanations
if anybody has some ;-)
>
>>> +#     which represents a pre-opened /dev/iommu.  This allows the
>>> +#     iommufd object to be shared accross several subsystems
>>> +#     (VFIO, VDPA, ...), and the file descriptor to be shared
>>> +#     with other process, e.g. DPDK.  (default: QEMU opens
>>> +#     /dev/iommu by itself)
>>> +#
>>> +# Since: 8.2
>>> +##
>>> +{ 'struct': 'IOMMUFDProperties',
>>> +  'data': { '*fd': 'str' } }
>>> +
>>>  ##
>>>  # @RngProperties:
>>>  #
>>> @@ -934,6 +951,7 @@
>>>      'input-barrier',
>>>      { 'name': 'input-linux',
>>>        'if': 'CONFIG_LINUX' },
>>> +    'iommufd',
>>>      'iothread',
>>>      'main-loop',
>>>      { 'name': 'memory-backend-epc',
>>> @@ -1003,6 +1021,7 @@
>>>        'input-barrier':              'InputBarrierProperties',
>>>        'input-linux':                { 'type': 'InputLinuxProperties',
>>>                                        'if': 'CONFIG_LINUX' },
>>> +      'iommufd':                    'IOMMUFDProperties',
>>>        'iothread':                   'IothreadProperties',
>>>        'main-loop':                  'MainLoopProperties',
>>>        'memory-backend-epc':         { 'type': 'MemoryBackendEpcProperties',
>>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>>> new file mode 100644
>>> index 0000000000..9b3a86f57d
>>> --- /dev/null
>>> +++ b/include/sysemu/iommufd.h
>>> @@ -0,0 +1,44 @@
>>> +#ifndef SYSEMU_IOMMUFD_H
>>> +#define SYSEMU_IOMMUFD_H
>>> +
>>> +#include "qom/object.h"
>>> +#include "qemu/thread.h"
>>> +#include "exec/hwaddr.h"
>>> +#include "exec/cpu-common.h"
>>> +
>>> +#define TYPE_IOMMUFD_BACKEND "iommufd"
>>> +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
>>> +                    IOMMUFD_BACKEND)
>>> +#define IOMMUFD_BACKEND(obj) \
>>> +    OBJECT_CHECK(IOMMUFDBackend, (obj), TYPE_IOMMUFD_BACKEND)
>>> +#define IOMMUFD_BACKEND_GET_CLASS(obj) \
>>> +    OBJECT_GET_CLASS(IOMMUFDBackendClass, (obj),
>> TYPE_IOMMUFD_BACKEND)
>>> +#define IOMMUFD_BACKEND_CLASS(klass) \
>>> +    OBJECT_CLASS_CHECK(IOMMUFDBackendClass, (klass),
>> TYPE_IOMMUFD_BACKEND)
>>> +struct IOMMUFDBackendClass {
>>> +    ObjectClass parent_class;
>>> +};
>>> +
>>> +struct IOMMUFDBackend {
>>> +    Object parent;
>>> +
>>> +    /*< protected >*/
>>> +    int fd;            /* /dev/iommu file descriptor */
>>> +    bool owned;        /* is the /dev/iommu opened internally */
>>> +    QemuMutex lock;
>>> +    uint32_t users;
>>> +
>>> +    /*< public >*/
>>> +};
>>> +
>>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
>>> +void iommufd_backend_disconnect(IOMMUFDBackend *be);
>>> +
>>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>>> +                               Error **errp);
>>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> hwaddr iova,
>>> +                            ram_addr_t size, void *vaddr, bool readonly);
>>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>> +                              hwaddr iova, ram_addr_t size);
>>> +#endif
>>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>>> new file mode 100644
>>> index 0000000000..ea3e2a8f85
>>> --- /dev/null
>>> +++ b/backends/iommufd.c
>>> @@ -0,0 +1,228 @@
>>> +/*
>>> + * iommufd container backend
>>> + *
>>> + * Copyright (C) 2023 Intel Corporation.
>>> + * Copyright Red Hat, Inc. 2023
>>> + *
>>> + * Authors: Yi Liu <yi.l....@intel.com>
>>> + *          Eric Auger <eric.au...@redhat.com>
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "sysemu/iommufd.h"
>>> +#include "qapi/error.h"
>>> +#include "qapi/qmp/qerror.h"
>>> +#include "qemu/module.h"
>>> +#include "qom/object_interfaces.h"
>>> +#include "qemu/error-report.h"
>>> +#include "monitor/monitor.h"
>>> +#include "trace.h"
>>> +#include <sys/ioctl.h>
>>> +#include <linux/iommufd.h>
>>> +
>>> +static void iommufd_backend_init(Object *obj)
>>> +{
>>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>>> +
>>> +    be->fd = -1;
>>> +    be->users = 0;
>>> +    be->owned = true;
>>> +    qemu_mutex_init(&be->lock);
>>> +}
>>> +
>>> +static void iommufd_backend_finalize(Object *obj)
>>> +{
>>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>>> +
>>> +    if (be->owned) {
>>> +        close(be->fd);
>>> +        be->fd = -1;
>>> +    }
>>> +}
>>> +
>>> +static void iommufd_backend_set_fd(Object *obj, const char *str, Error 
>>> **errp)
>>> +{
>>> +    IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
>>> +    int fd = -1;
>>> +
>>> +    fd = monitor_fd_param(monitor_cur(), str, errp);
>>> +    if (fd == -1) {
>>> +        error_prepend(errp, "Could not parse remote object fd %s:", str);
>>> +        return;
>>> +    }
>>> +    qemu_mutex_lock(&be->lock);
>>> +    be->fd = fd;
>>> +    be->owned = false;
>>> +    qemu_mutex_unlock(&be->lock);
>>> +    trace_iommu_backend_set_fd(be->fd);
>>> +}
>>> +
>>> +static void iommufd_backend_class_init(ObjectClass *oc, void *data)
>>> +{
>>> +    object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
>>> +}
>>> +
>>> +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
>>> +{
>>> +    int fd, ret = 0;
>>> +
>>> +    qemu_mutex_lock(&be->lock);
>>> +    if (be->users == UINT32_MAX) {
>>> +        error_setg(errp, "too many connections");
>>> +        ret = -E2BIG;
>>> +        goto out;
>>> +    }
>>> +    if (be->owned && !be->users) {
>>> +        fd = qemu_open_old("/dev/iommu", O_RDWR);
>>> +        if (fd < 0) {
>>> +            error_setg_errno(errp, errno, "/dev/iommu opening failed");
>>> +            ret = fd;
>>> +            goto out;
>>> +        }
>>> +        be->fd = fd;
>>> +    }
>>> +    be->users++;
>>> +out:
>>> +    trace_iommufd_backend_connect(be->fd, be->owned,
>>> +                                  be->users, ret);
>>> +    qemu_mutex_unlock(&be->lock);
>>> +    return ret;
>>> +}
>>> +
>>> +void iommufd_backend_disconnect(IOMMUFDBackend *be)
>>> +{
>>> +    qemu_mutex_lock(&be->lock);
>>> +    if (!be->users) {
>>> +        goto out;
>>> +    }
>>> +    be->users--;
>>> +    if (!be->users && be->owned) {
>>> +        close(be->fd);
>>> +        be->fd = -1;
>>> +    }
>>> +out:
>>> +    trace_iommufd_backend_disconnect(be->fd, be->users);
>>> +    qemu_mutex_unlock(&be->lock);
>>> +}
>>> +
>>> +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
>>> +                               Error **errp)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_alloc alloc_data  = {
>>> +        .size = sizeof(alloc_data),
>>> +        .flags = 0,
>>> +    };
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
>>> +    if (ret) {
>>> +        error_setg_errno(errp, errno, "Failed to allocate ioas");
>>> +        return ret;
>>> +    }
>>> +
>>> +    *ioas_id = alloc_data.out_ioas_id;
>>> +    trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_destroy des = {
>>> +        .size = sizeof(des),
>>> +        .id = id,
>>> +    };
>>> +
>>> +    ret = ioctl(fd, IOMMU_DESTROY, &des);
>>> +    trace_iommufd_backend_free_id(fd, id, ret);
>>> +    if (ret) {
>>> +        error_report("Failed to free id: %u %m", id);
>>> +    }
>>> +}
>>> +
>>> +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>> hwaddr iova,
>>> +                            ram_addr_t size, void *vaddr, bool readonly)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_map map = {
>>> +        .size = sizeof(map),
>>> +        .flags = IOMMU_IOAS_MAP_READABLE |
>>> +                 IOMMU_IOAS_MAP_FIXED_IOVA,
>>> +        .ioas_id = ioas_id,
>>> +        .__reserved = 0,
>>> +        .user_va = (uintptr_t)vaddr,
>>> +        .iova = iova,
>>> +        .length = size,
>>> +    };
>>> +
>>> +    if (!readonly) {
>>> +        map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
>>> +    }
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
>>> +    trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
>>> +                                  vaddr, readonly, ret);
>>> +    if (ret) {
>>> +        ret = -errno;
>>> +        error_report("IOMMU_IOAS_MAP failed: %m");
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>> +                              hwaddr iova, ram_addr_t size)
>>> +{
>>> +    int ret, fd = be->fd;
>>> +    struct iommu_ioas_unmap unmap = {
>>> +        .size = sizeof(unmap),
>>> +        .ioas_id = ioas_id,
>>> +        .iova = iova,
>>> +        .length = size,
>>> +    };
>>> +
>>> +    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
>>> +    /*
>>> +     * IOMMUFD takes mapping as some kind of object, unmapping
>>> +     * nonexistent mapping is treated as deleting a nonexistent
>>> +     * object and return ENOENT. This is different from legacy
>>> +     * backend which allows it. vIOMMU may trigger a lot of
>>> +     * redundant unmapping, to avoid flush the log, treat them
>>> +     * as succeess for IOMMUFD just like legacy backend.
>>> +     */
>>> +    if (ret && errno == ENOENT) {
>>> +        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size,
>> ret);
>>> +        ret = 0;
>>> +    } else {
>>> +        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
>>> +    }
>>> +
>>> +    if (ret) {
>>> +        ret = -errno;
>>> +        error_report("IOMMU_IOAS_UNMAP failed: %m");
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const TypeInfo iommufd_backend_info = {
>>> +    .name = TYPE_IOMMUFD_BACKEND,
>>> +    .parent = TYPE_OBJECT,
>>> +    .instance_size = sizeof(IOMMUFDBackend),
>>> +    .instance_init = iommufd_backend_init,
>>> +    .instance_finalize = iommufd_backend_finalize,
>>> +    .class_size = sizeof(IOMMUFDBackendClass),
>>> +    .class_init = iommufd_backend_class_init,
>>> +    .interfaces = (InterfaceInfo[]) {
>>> +        { TYPE_USER_CREATABLE },
>>> +        { }
>>> +    }
>>> +};
>>> +
>>> +static void register_types(void)
>>> +{
>>> +    type_register_static(&iommufd_backend_info);
>>> +}
>>> +
>>> +type_init(register_types);
>>> diff --git a/backends/Kconfig b/backends/Kconfig
>>> index f35abc1609..2cb23f62fa 100644
>>> --- a/backends/Kconfig
>>> +++ b/backends/Kconfig
>>> @@ -1 +1,5 @@
>>>  source tpm/Kconfig
>>> +
>>> +config IOMMUFD
>>> +    bool
>>> +    depends on VFIO
>> I don't know the state of vDPA/iommufd integration but this extra might
>> be added in short term.
> Thanks for reminder. But I think it make more sense that series relax the 
> check
> itself?
> E.g. depends on VFIO || VHOST_VDPA
yeah we can leave it as it is for now

Eric
>
> Thanks
> Zhenzhong


Reply via email to