On Wed, Aug 21, 2024 at 2:56 PM Jonah Palmer <jonah.pal...@oracle.com> wrote:
>
> Implement a GPA->IOVA tree and an IOVA->SVQ HVA tree to handle the
> mapping, unmapping, and translation of guest memory and host-only
> memory, respectively.
>
> By splitting the full IOVA->HVA tree (containing both guest and
> host-only memory mappings) into a GPA->IOVA tree (containing only guest
> memory mappings) and an IOVA->SVQ HVA tree (containing only host-only
> memory mappings), we can avoid translating to the wrong IOVA when the
> guest has overlapping memory regions where different GPAs lead to the
> same HVA.
>
> In other words, when the guest has overlapping memory regions,
> translating an HVA to an IOVA may return an incorrect IOVA when
> searching the full IOVA->HVA tree, because one HVA range can be
> contained within (overlap) another HVA range in the tree.
>
> To avoid this issue, we create a GPA->IOVA tree and use it to translate
> a GPA to an IOVA, ensuring that the IOVA we receive is the correct one
> (instead of relying on an HVA->IOVA translation).
>
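To make the failure mode concrete, a contrived sketch (my own, not from
the patch): iova_tree_find_iova() walks the tree in IOVA order and
returns the first mapping whose HVA range overlaps the needle, so a
buffer that was mapped through one region can match another region that
spans the same HVAs:

#include "qemu/osdep.h"
#include "qemu/iova-tree.h"

/* Contrived demo of the HVA->IOVA ambiguity; not part of the patch. */
static void overlap_demo(void)
{
    IOVATree *tree = iova_tree_new();

    /* Two guest regions whose HVA ranges overlap (sizes are inclusive) */
    DMAMap outer = {
        .iova = 0x1000,
        .translated_addr = 0x7f0000000000,
        .size = 0xffff,
        .perm = IOMMU_RW,
    };
    DMAMap inner = {
        .iova = 0x20000,
        .translated_addr = 0x7f0000004000,   /* inside outer's HVA range */
        .size = 0x3fff,
        .perm = IOMMU_RW,
    };

    /* Both insertions succeed: the comparator only checks IOVA ranges */
    iova_tree_insert(tree, &outer);
    iova_tree_insert(tree, &inner);

    /* A buffer that belongs to inner also falls inside outer's HVA
     * range; the in-order walk hits outer first, so the caller gets
     * outer's IOVA even though the buffer was mapped through inner. */
    DMAMap needle = {
        .translated_addr = 0x7f0000004000,
        .size = 0xfff,
    };
    const DMAMap *map = iova_tree_find_iova(tree, &needle);
    g_assert(map->iova == outer.iova);   /* not inner.iova */

    iova_tree_destroy(tree);
}
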
> As a byproduct of creating a GPA->IOVA tree, the full IOVA->HVA tree
> now becomes a partial IOVA->SVQ HVA tree. That is, since all guest
> memory mappings move to the GPA->IOVA tree, host-only memory mappings
> are the only mappings left in the IOVA->HVA tree.
>
> Furthermore, as an additional byproduct of splitting guest and
> host-only memory mappings into separate trees, special attention needs
> to be paid to vhost_svq_translate_addr() when translating memory
> buffers from an iovec. These buffers can be backed by either guest
> memory or host-only memory, which means we need to determine what is
> backing each buffer before deciding which tree to use to translate it.
>
> In this patch we determine a buffer's backer by first checking whether
> a RAM block can be inferred from the buffer's HVA. That is, we call
> qemu_ram_block_from_host(), and if a valid RAM block is returned, we
> know the buffer's HVA is backed by guest memory. We then derive the GPA
> from it and translate the GPA to an IOVA using the GPA->IOVA tree.
>
> If no RAM block is returned, the buffer's HVA is likely backed by
> host-only memory. In this case, we simply translate the HVA to an IOVA
> using the partial IOVA->SVQ HVA tree.
>
> However, this method is sub-optimal, especially for memory buffers
> backed by host-only memory, because it must iterate over some (if not
> all) RAMBlock structures and then search either the GPA->IOVA tree or
> the IOVA->SVQ HVA tree. Optimizations to improve performance in this
> area should be revisited at some point.
>
> Signed-off-by: Jonah Palmer <jonah.pal...@oracle.com>
> ---
>  hw/virtio/vhost-iova-tree.c        | 53 +++++++++++++++++++++++++++++-
>  hw/virtio/vhost-iova-tree.h        |  5 ++-
>  hw/virtio/vhost-shadow-virtqueue.c | 48 +++++++++++++++++++++++----
>  hw/virtio/vhost-vdpa.c             | 18 +++++-----
>  include/qemu/iova-tree.h           | 22 +++++++++++++
>  util/iova-tree.c                   | 46 ++++++++++++++++++++++++++
>  6 files changed, 173 insertions(+), 19 deletions(-)
>
> diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
> index 32c03db2f5..5a3f6b5cd9 100644
> --- a/hw/virtio/vhost-iova-tree.c
> +++ b/hw/virtio/vhost-iova-tree.c
> @@ -26,15 +26,19 @@ struct VhostIOVATree {
>      /* Last addressable iova address in the device */
>      uint64_t iova_last;
>
> -    /* IOVA address to qemu memory maps. */
> +    /* IOVA address to qemu SVQ memory maps. */
>      IOVATree *iova_taddr_map;
>
>      /* IOVA tree (IOVA allocator) */
>      IOVATree *iova_map;
> +
> +    /* GPA->IOVA tree */
> +    IOVATree *gpa_map;
>  };
>
>  /**
>   * Create a new VhostIOVATree with a new set of IOVATree's:
> + * - GPA->IOVA tree (gpa_map)
>   * - IOVA allocator (iova_map)
>   * - IOVA->HVA tree (iova_taddr_map)
>   *
> @@ -50,6 +54,7 @@ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
>
>      tree->iova_taddr_map = iova_tree_new();
>      tree->iova_map = iova_tree_new();
> +    tree->gpa_map = gpa_tree_new();
>      return tree;
>  }
>
> @@ -136,3 +141,49 @@ int vhost_iova_tree_insert(VhostIOVATree *iova_tree, DMAMap *map)
>
>      return iova_tree_insert(iova_tree->iova_taddr_map, map);
>  }
> +
> +/**
> + * Insert a new GPA->IOVA mapping to the GPA->IOVA tree
> + *
> + * @iova_tree: The VhostIOVATree
> + * @map: The GPA->IOVA mapping
> + *
> + * Returns:
> + * - IOVA_OK if the map fits in the container
> + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
> + * - IOVA_ERR_OVERLAP if the GPA range overlaps with an existing range
> + */
> +int vhost_gpa_tree_insert(VhostIOVATree *iova_tree, DMAMap *map)
> +{
> +    if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
> +        return IOVA_ERR_INVALID;
> +    }
> +
> +    return gpa_tree_insert(iova_tree->gpa_map, map);
> +}
> +
> +/**
> + * Find the IOVA address stored from a guest memory address (GPA)
> + *
> + * @tree: The VhostIOVATree
> + * @map: The map with the guest memory address
> + *
> + * Return the stored mapping, or NULL if not found.
> + */
> +const DMAMap *vhost_gpa_tree_find_iova(const VhostIOVATree *tree,
> +                                       const DMAMap *map)
> +{
> +    return iova_tree_find_iova(tree->gpa_map, map);
> +}
> +
> +/**
> + * Remove existing mappings from the GPA->IOVA tree and IOVA tree
> + *
> + * @iova_tree: The VhostIOVATree
> + * @map: The map to remove
> + */
> +void vhost_gpa_tree_remove(VhostIOVATree *iova_tree, DMAMap map)
> +{
> +    iova_tree_remove(iova_tree->gpa_map, map);
> +    iova_tree_remove(iova_tree->iova_map, map);
> +}
> diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
> index 8bf7b64786..c22941db4f 100644
> --- a/hw/virtio/vhost-iova-tree.h
> +++ b/hw/virtio/vhost-iova-tree.h
> @@ -24,5 +24,8 @@ const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
>  int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
>  void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map);
>  int vhost_iova_tree_insert(VhostIOVATree *iova_tree, DMAMap *map);
> -
> +int vhost_gpa_tree_insert(VhostIOVATree *iova_tree, DMAMap *map);
> +const DMAMap *vhost_gpa_tree_find_iova(const VhostIOVATree *iova_tree,
> +                                       const DMAMap *map);
> +void vhost_gpa_tree_remove(VhostIOVATree *iova_tree, DMAMap map);
>  #endif
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index fc5f408f77..12eabddaa6 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -16,6 +16,7 @@
>  #include "qemu/log.h"
>  #include "qemu/memalign.h"
>  #include "linux-headers/linux/vhost.h"
> +#include "exec/ramblock.h"
>
>  /**
>   * Validate the transport device features that both guests can use with the SVQ
> @@ -88,14 +89,45 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>      }
>
>      for (size_t i = 0; i < num; ++i) {
> -        DMAMap needle = {
> -            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
> -            .size = iovec[i].iov_len,
> -        };
> -        Int128 needle_last, map_last;
> -        size_t off;
> +        RAMBlock *rb;
> +        hwaddr gpa;
> +        ram_addr_t offset;
> +        const DMAMap *map;
> +        DMAMap needle;
> +
> +        /*
> +         * Determine if this HVA is backed by guest memory by attempting to
> +         * infer a RAM block from it. If a valid RAM block is returned, the
> +         * HVA is backed by guest memory and we can derive the GPA from it.
> +         * Then search the GPA->IOVA tree for the corresponding IOVA.
> +         *
> +         * If no RAM block is returned, the HVA is likely backed by host-only
> +         * memory. Use the HVA to search the IOVA->HVA tree for the
> +         * corresponding IOVA.
> +         *
> +         * TODO: This additional second lookup is sub-optimal when the HVA
> +         *       is backed by host-only memory. Find optimizations for this
> +         *       (e.g. using an HVA->IOVA tree).
> +         */
> +        rb = qemu_ram_block_from_host(iovec[i].iov_base, false, &offset);
> +        if (rb) {
> +            gpa = rb->offset + offset;
> +
> +            /* Search the GPA->IOVA tree */
> +            needle = (DMAMap) {
> +                .translated_addr = gpa,
> +                .size = iovec[i].iov_len,
> +            };
> +            map = vhost_gpa_tree_find_iova(svq->iova_tree, &needle);
> +        } else {
> +            /* Search the IOVA->HVA tree */
> +            needle = (DMAMap) {
> +                .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
> +                .size = iovec[i].iov_len,
> +            };
> +            map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
> +        }

I think having this complex conditional here is a problem for future
users of SVQ.
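
Something like a single lookup entry point owned by VhostIOVATree could
keep SVQ tree-agnostic. A rough sketch (untested, and the name is mine)
that would live in hw/virtio/vhost-iova-tree.c, where the tree members
are visible:

const DMAMap *vhost_iova_tree_find_iova_any(const VhostIOVATree *tree,
                                            const void *base, size_t len)
{
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host((void *)base, false, &offset);
    DMAMap needle = {
        .size = len - 1,    /* DMAMap sizes are inclusive */
    };

    if (rb) {
        /* Guest-backed HVA: derive the GPA and search the GPA->IOVA tree */
        needle.translated_addr = rb->offset + offset;
        return iova_tree_find_iova(tree->gpa_map, &needle);
    }

    /* Host-only HVA: search the partial IOVA->SVQ HVA tree */
    needle.translated_addr = (hwaddr)(uintptr_t)base;
    return iova_tree_find_iova(tree->iova_taddr_map, &needle);
}

That way vhost_svq_translate_addr() stays a single call per iovec entry
and future SVQ users never see the two trees.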

>
> -        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
>          /*
>           * Map cannot be NULL since iova map contains all guest space and
>           * qemu already has a physical address mapped
> @@ -106,6 +138,8 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>                            needle.translated_addr);
>              return false;
>          }
> +        Int128 needle_last, map_last;
> +        size_t off;
>
>          off = needle.translated_addr - map->translated_addr;
>          addrs[i] = map->iova + off;
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 6702459065..0da0a117dc 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -373,9 +373,9 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>
>          iova = mem_region.iova;
>
> -        /* Add mapping to the IOVA->HVA tree */
> -        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
> -        r = vhost_iova_tree_insert(s->iova_tree, &mem_region);
> +        /* Add mapping to the GPA->IOVA tree */
> +        mem_region.translated_addr = section->offset_within_address_space;
> +        r = vhost_gpa_tree_insert(s->iova_tree, &mem_region);
>          if (unlikely(r != IOVA_OK)) {
>              error_report("Can't add listener region mapping (%d)", r);
>              goto fail_map;
> @@ -394,7 +394,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>
>  fail_map:
>      if (s->shadow_data) {
> -        vhost_iova_tree_remove(s->iova_tree, mem_region);
> +        vhost_gpa_tree_remove(s->iova_tree, mem_region);
>      }
>
>  fail:
> @@ -448,21 +448,19 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
>
>      if (s->shadow_data) {
>          const DMAMap *result;
> -        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> -            section->offset_within_region +
> -            (iova - section->offset_within_address_space);
>          DMAMap mem_region = {
> -            .translated_addr = (hwaddr)(uintptr_t)vaddr,
> +            .translated_addr = section->offset_within_address_space,
>              .size = int128_get64(llsize) - 1,
>          };
>
> -        result = vhost_iova_tree_find_iova(s->iova_tree, &mem_region);
> +        /* Search the GPA->IOVA tree */
> +        result = vhost_gpa_tree_find_iova(s->iova_tree, &mem_region);
>          if (!result) {
>              /* The memory listener map wasn't mapped */
>              return;
>          }
>          iova = result->iova;
> -        vhost_iova_tree_remove(s->iova_tree, *result);
> +        vhost_gpa_tree_remove(s->iova_tree, *result);
>      }
>      vhost_vdpa_iotlb_batch_begin_once(s);
>      /*
> diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
> index 2a10a7052e..57cfc63d33 100644
> --- a/include/qemu/iova-tree.h
> +++ b/include/qemu/iova-tree.h
> @@ -40,6 +40,15 @@ typedef struct DMAMap {
>  } QEMU_PACKED DMAMap;
>  typedef gboolean (*iova_tree_iterator)(DMAMap *map);
>
> +/**
> + * gpa_tree_new:
> + *
> + * Create a new GPA->IOVA tree.
> + *
> + * Returns: the tree pointer on success, or NULL otherwise.
> + */
> +IOVATree *gpa_tree_new(void);
> +
>  /**
>   * iova_tree_new:
>   *
> @@ -49,6 +58,19 @@ typedef gboolean (*iova_tree_iterator)(DMAMap *map);
>   */
>  IOVATree *iova_tree_new(void);
>
> +/**
> + * gpa_tree_insert:
> + *
> + * @tree: The GPA->IOVA tree we're inserting the mapping to
> + * @map: The GPA->IOVA mapping to insert
> + *
> + * Insert a GPA range into the GPA->IOVA tree. If the range overlaps
> + * with an existing one, IOVA_ERR_OVERLAP will be returned.
> + *
> + * Return: 0 if success, or < 0 if error.
> + */
> +int gpa_tree_insert(IOVATree *tree, const DMAMap *map);
> +

I'd keep this GPA tree in VhostIOVATree, as other IOVATree users like
the Intel IOMMU do not use it.
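
For example (a rough idea, untested), VhostIOVATree could own the GPA
tree as a plain GTree with a private comparator, with gpa_map declared
as a GTree * in struct VhostIOVATree, leaving util/iova-tree.c
untouched:

/* In hw/virtio/vhost-iova-tree.c -- rough idea, untested */
static gint vhost_gpa_tree_cmp(gconstpointer a, gconstpointer b,
                               gpointer data)
{
    const DMAMap *m1 = a, *m2 = b;

    if (m1->translated_addr > m2->translated_addr + m2->size) {
        return 1;
    }
    if (m1->translated_addr + m1->size < m2->translated_addr) {
        return -1;
    }
    return 0;    /* overlapping GPA ranges compare as equal */
}

/* ... and in vhost_iova_tree_new(): */
tree->gpa_map = g_tree_new_full(vhost_gpa_tree_cmp, NULL, g_free, NULL);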

>  /**
>   * iova_tree_insert:
>   *
> diff --git a/util/iova-tree.c b/util/iova-tree.c
> index 536789797e..e3f50fbf5c 100644
> --- a/util/iova-tree.c
> +++ b/util/iova-tree.c
> @@ -71,6 +71,22 @@ static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
>      return 0;
>  }
>
> +static int gpa_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
> +{
> +    const DMAMap *m1 = a, *m2 = b;
> +
> +    if (m1->translated_addr > m2->translated_addr + m2->size) {
> +        return 1;
> +    }
> +
> +    if (m1->translated_addr + m1->size < m2->translated_addr) {
> +        return -1;
> +    }
> +
> +    /* Overlapped */
> +    return 0;
> +}
> +
>  IOVATree *iova_tree_new(void)
>  {
>      IOVATree *iova_tree = g_new0(IOVATree, 1);
> @@ -81,6 +97,15 @@ IOVATree *iova_tree_new(void)
>      return iova_tree;
>  }
>
> +IOVATree *gpa_tree_new(void)
> +{
> +    IOVATree *gpa_tree = g_new0(IOVATree, 1);
> +
> +    gpa_tree->tree = g_tree_new_full(gpa_tree_compare, NULL, g_free, NULL);
> +
> +    return gpa_tree;
> +}
> +
>  const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
>  {
>      return g_tree_lookup(tree->tree, map);
> @@ -128,6 +153,27 @@ static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range)
>      g_tree_insert(gtree, range, range);
>  }
>
> +int gpa_tree_insert(IOVATree *tree, const DMAMap *map)
> +{
> +    DMAMap *new;
> +
> +    if (map->translated_addr + map->size < map->translated_addr ||
> +        map->perm == IOMMU_NONE) {
> +        return IOVA_ERR_INVALID;
> +    }
> +
> +    /* We don't allow inserting ranges that overlap with existing ones */
> +    if (iova_tree_find(tree, map)) {
> +        return IOVA_ERR_OVERLAP;
> +    }
> +
> +    new = g_new0(DMAMap, 1);
> +    memcpy(new, map, sizeof(*new));
> +    iova_tree_insert_internal(tree->tree, new);
> +
> +    return IOVA_OK;
> +}
> +
>  int iova_tree_insert(IOVATree *tree, const DMAMap *map)
>  {
>      DMAMap *new;
> --
> 2.43.5
>

