On Thu, Jan 29, 2015 at 08:27:20PM +1100, Alexey Kardashevskiy wrote:
> This makes use of the new "memory registering" feature. The idea is
> to provide the guest ability to notify the host kernel about pages which
> are going to be used for DMA. Having this information, the host kernel
> can pin them all, do locked pages accounting and not spent time on
> doing that in real time with possible failures which cannot be handled
> nicely in some cases.
> 
> This adds a memory listener which notifies a VFIO container about memory
> which needs to be pinned/unpinned. VFIO MMIO regions are skipped.
> 
> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
> ---
>  hw/vfio/common.c              | 99 
> ++++++++++++++++++++++++++++++++++++++++++-
>  include/hw/vfio/vfio-common.h |  3 +-
>  2 files changed, 100 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index cf483ff..c08f9ab 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -485,6 +485,99 @@ void vfio_listener_release(VFIOContainer *container)
>      memory_listener_unregister(&container->iommu_data.type1.listener);
>  }
>  
> +static int vfio_mem_register(VFIOContainer *container,
> +                             void *vaddr, ram_addr_t size)
> +{
> +    struct vfio_iommu_type1_register_memory reg = {
> +        .argsz = sizeof(reg),
> +        .vaddr = (__u64)(uintptr_t)vaddr,
> +        .size = size,
> +    };
> +    long ret = ioctl(container->fd, VFIO_IOMMU_REGISTER_MEMORY, &reg);
> +
> +    DPRINTF("(%u) %s %u: va=%lx size=%lx, ret=%ld\n", getpid(),
> +            __func__, __LINE__, reg.vaddr, reg.size, ret);
> +
> +    return ret;
> +}
> +
> +static int vfio_mem_unregister(VFIOContainer *container,
> +                               void *vaddr, ram_addr_t size)
> +{
> +    struct vfio_iommu_type1_unregister_memory reg = {
> +        .argsz = sizeof(reg),
> +        .vaddr = (__u64)(uintptr_t)vaddr,
> +        .size = size,
> +    };
> +    long ret = ioctl(container->fd, VFIO_IOMMU_UNREGISTER_MEMORY, &reg);
> +
> +    DPRINTF("(%u) %s %u: va=%lx size=%lx, ret=%ld\n", getpid(),
> +            __func__, __LINE__, reg.vaddr, reg.size, ret);
> +
> +    return ret;
> +}
> +
> +static void vfio_ram_region_add(MemoryListener *listener,
> +                                MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.type1.ramlistener);
> +    hwaddr end;
> +    Int128 llend;
> +    void *vaddr;
> +
> +    if (!memory_region_is_ram(section->mr) ||
> +        memory_region_is_skip_dump(section->mr)) {
> +        return;
> +    }
> +
> +    llend = int128_make64(section->offset_within_address_space);
> +    llend = int128_add(llend, section->size);

Does this need an add TARGET_PAGE_SIZE-1 in order to make it round up
to a page boundary, rather than truncate to a page boundary?

> +    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
> +
> +    memory_region_ref(section->mr);
> +
> +    end = int128_get64(llend);
> +    vaddr = memory_region_get_ram_ptr(section->mr) +
> +        section->offset_within_region;
> +    vfio_mem_register(container, vaddr, end);
> +}
> +
> +static void vfio_ram_region_del(MemoryListener *listener,
> +                                MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            iommu_data.type1.ramlistener);
> +    hwaddr iova, end;
> +    void *vaddr;
> +
> +    if (!memory_region_is_ram(section->mr) ||
> +        memory_region_is_skip_dump(section->mr)) {
> +        return;
> +    }
> +
> +
> +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +    end = (section->offset_within_address_space + 
> int128_get64(section->size)) &
> +        TARGET_PAGE_MASK;
> +    vaddr = memory_region_get_ram_ptr(section->mr) +
> +        section->offset_within_region +
> +        (iova - section->offset_within_address_space);

It's not clear to me why region_add and region_del have a different
set of address calculations.

> +    vfio_mem_unregister(container, vaddr, end - iova);
> +}
> +
> +const MemoryListener vfio_ram_listener = {
> +    .region_add = vfio_ram_region_add,
> +    .region_del = vfio_ram_region_del,
> +};
> +
> +static void vfio_spapr_listener_release(VFIOContainer *container)
> +{
> +    memory_listener_unregister(&container->iommu_data.type1.listener);
> +    memory_listener_unregister(&container->iommu_data.type1.ramlistener);

Accessing fields within type1 from a function whose name says it's
spapr specific seems very wrong.

> +}
> +
>  int vfio_mmap_region(Object *obj, VFIORegion *region,
>                       MemoryRegion *mem, MemoryRegion *submem,
>                       void **map, size_t size, off_t offset,
> @@ -705,6 +798,10 @@ static int vfio_connect_container(VFIOGroup *group, 
> AddressSpace *as)
>              goto free_container_exit;
>          }
>  
> +        container->iommu_data.type1.ramlistener = vfio_ram_listener;
> +        memory_listener_register(&container->iommu_data.type1.ramlistener,
> +                                 &address_space_memory);

Why two separate listeners, rather than doing both jobs from a single listener?

>          /*
>           * The host kernel code implementing VFIO_IOMMU_DISABLE is called
>           * when container fd is closed so we do not call it explicitly
> @@ -718,7 +815,7 @@ static int vfio_connect_container(VFIOGroup *group, 
> AddressSpace *as)
>          }
>  
>          container->iommu_data.type1.listener = vfio_memory_listener;
> -        container->iommu_data.release = vfio_listener_release;
> +        container->iommu_data.release = vfio_spapr_listener_release;
>  
>          memory_listener_register(&container->iommu_data.type1.listener,
>                                   container->space->as);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 1d5bfe8..3f91216 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -66,6 +66,7 @@ struct VFIOGroup;
>  
>  typedef struct VFIOType1 {
>      MemoryListener listener;
> +    MemoryListener ramlistener;
>      int error;
>      bool initialized;
>  } VFIOType1;
> @@ -144,7 +145,7 @@ int vfio_get_device(VFIOGroup *group, const char *name,
>                      VFIODevice *vbasedev);
>  
>  extern const MemoryRegionOps vfio_region_ops;
> -extern const MemoryListener vfio_memory_listener;
> +extern const MemoryListener vfio_memory_listener, vfio_ram_listener;
>  extern QLIST_HEAD(vfio_group_head, VFIOGroup) vfio_group_list;
>  extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces;
>  

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: pgpg5qNTs_4KR.pgp
Description: PGP signature

Reply via email to