On Wed, Aug 21, 2024 at 2:56 PM Jonah Palmer <jonah.pal...@oracle.com> wrote:
>
> Implements a GPA->IOVA tree and an IOVA->SVQ HVA tree for handling
> mapping, unmapping, and translation of guest and host-only memory,
> respectively.
>
> By splitting up a full IOVA->HVA tree (containing both guest and
> host-only memory mappings) into a GPA->IOVA tree (containing only guest
> memory mappings) and an IOVA->SVQ HVA tree (containing host-only memory
> mappings), we can avoid translating to the wrong IOVA when the guest has
> overlapping memory regions where different GPAs lead to the same HVA.
>
> In other words, if the guest has overlapping memory regions, translating
> an HVA to an IOVA may result in receiving an incorrect IOVA when
> searching the full IOVA->HVA tree. This would be due to one HVA range
> being contained in (overlapping with) another HVA range in the
> IOVA->HVA tree.
>
> To avoid this issue, creating a GPA->IOVA tree and using it to translate
> a GPA to an IOVA ensures that the IOVA we receive is the correct one
> (instead of relying on an HVA->IOVA translation).
>
> As a byproduct of creating a GPA->IOVA tree, the full IOVA->HVA tree now
> becomes a partial IOVA->SVQ HVA tree. That is, since we're moving all
> guest memory mappings to the GPA->IOVA tree, the host-only memory
> mappings are now the only mappings being put into the IOVA->HVA tree.
>
> Furthermore, as an additional byproduct of splitting up guest and
> host-only memory mappings into separate trees, special attention needs
> to be paid to vhost_svq_translate_addr() when translating memory buffers
> from iovec. The memory buffers from iovec can be backed by guest memory
> or host-only memory, which means that we need to figure out what is
> backing these buffers and then decide which tree to use when translating
> them.
>
> In this patch we determine the backer of a buffer by first checking
> if a RAM block can be inferred from the buffer's HVA. That is, we use
> qemu_ram_block_from_host() and, if a valid RAM block is returned, we
> know the buffer's HVA is backed by guest memory. Then we derive the GPA
> from it and translate the GPA to an IOVA using the GPA->IOVA tree.
>
> If no RAM block is returned, the buffer's HVA is likely backed by
> host-only memory. In this case, we can simply translate the HVA to an
> IOVA using the partial IOVA->SVQ HVA tree.
>
> However, this method is sub-optimal, especially for memory buffers
> backed by host-only memory, since it may need to iterate over some (if
> not all) RAMBlock structures and then search either the GPA->IOVA tree
> or the IOVA->SVQ HVA tree. Optimizations to improve performance in this
> area should be revisited at some point.
>
> Signed-off-by: Jonah Palmer <jonah.pal...@oracle.com>
> ---
>  hw/virtio/vhost-iova-tree.c        | 53 +++++++++++++++++++++++++++++-
>  hw/virtio/vhost-iova-tree.h        |  5 ++-
>  hw/virtio/vhost-shadow-virtqueue.c | 48 +++++++++++++++++++++++----
>  hw/virtio/vhost-vdpa.c             | 18 +++++-----
>  include/qemu/iova-tree.h           | 22 +++++++++++++
>  util/iova-tree.c                   | 46 ++++++++++++++++++++++++++
>  6 files changed, 173 insertions(+), 19 deletions(-)
>
> diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
> index 32c03db2f5..5a3f6b5cd9 100644
> --- a/hw/virtio/vhost-iova-tree.c
> +++ b/hw/virtio/vhost-iova-tree.c
> @@ -26,15 +26,19 @@ struct VhostIOVATree {
>      /* Last addressable iova address in the device */
>      uint64_t iova_last;
>
> -    /* IOVA address to qemu memory maps. */
> +    /* IOVA address to qemu SVQ memory maps. */
>      IOVATree *iova_taddr_map;
>
>      /* IOVA tree (IOVA allocator) */
>      IOVATree *iova_map;
> +
> +    /* GPA->IOVA tree */
> +    IOVATree *gpa_map;
>  };
>
>  /**
>   * Create a new VhostIOVATree with a new set of IOVATree's:
> + * - GPA->IOVA tree (gpa_map)
>   * - IOVA allocator (iova_map)
>   * - IOVA->HVA tree (iova_taddr_map)
>   *
> @@ -50,6 +54,7 @@ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
>
>      tree->iova_taddr_map = iova_tree_new();
>      tree->iova_map = iova_tree_new();
> +    tree->gpa_map = gpa_tree_new();
>      return tree;
>  }
>
> @@ -136,3 +141,49 @@ int vhost_iova_tree_insert(VhostIOVATree *iova_tree, DMAMap *map)
>
>      return iova_tree_insert(iova_tree->iova_taddr_map, map);
>  }
> +
> +/**
> + * Insert a new GPA->IOVA mapping to the GPA->IOVA tree
> + *
> + * @iova_tree: The VhostIOVATree
> + * @map: The GPA->IOVA mapping
> + *
> + * Returns:
> + * - IOVA_OK if the map fits in the container
> + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
> + * - IOVA_ERR_OVERLAP if the GPA range overlaps with an existing range
> + */
> +int vhost_gpa_tree_insert(VhostIOVATree *iova_tree, DMAMap *map)
> +{
> +    if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
> +        return IOVA_ERR_INVALID;
> +    }
> +
> +    return gpa_tree_insert(iova_tree->gpa_map, map);
> +}
> +
> +/**
> + * Find the IOVA address stored from a guest memory address (GPA)
> + *
> + * @tree: The VhostIOVATree
> + * @map: The map with the guest memory address
> + *
> + * Return the stored mapping, or NULL if not found.
> + */
> +const DMAMap *vhost_gpa_tree_find_iova(const VhostIOVATree *tree,
> +                                       const DMAMap *map)
> +{
> +    return iova_tree_find_iova(tree->gpa_map, map);
> +}
> +
> +/**
> + * Remove existing mappings from the GPA->IOVA tree and IOVA tree
> + *
> + * @iova_tree: The VhostIOVATree
> + * @map: The map to remove
> + */
> +void vhost_gpa_tree_remove(VhostIOVATree *iova_tree, DMAMap map)
> +{
> +    iova_tree_remove(iova_tree->gpa_map, map);
> +    iova_tree_remove(iova_tree->iova_map, map);
> +}
> diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
> index 8bf7b64786..c22941db4f 100644
> --- a/hw/virtio/vhost-iova-tree.h
> +++ b/hw/virtio/vhost-iova-tree.h
> @@ -24,5 +24,8 @@ const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
>  int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
>  void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map);
>  int vhost_iova_tree_insert(VhostIOVATree *iova_tree, DMAMap *map);
> -
> +int vhost_gpa_tree_insert(VhostIOVATree *iova_tree, DMAMap *map);
> +const DMAMap *vhost_gpa_tree_find_iova(const VhostIOVATree *iova_tree,
> +                                       const DMAMap *map);
> +void vhost_gpa_tree_remove(VhostIOVATree *iova_tree, DMAMap map);
>  #endif
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index fc5f408f77..12eabddaa6 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -16,6 +16,7 @@
>  #include "qemu/log.h"
>  #include "qemu/memalign.h"
>  #include "linux-headers/linux/vhost.h"
> +#include "exec/ramblock.h"
>
>  /**
>   * Validate the transport device features that both guests can use with the SVQ
> @@ -88,14 +89,45 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>      }
>
>      for (size_t i = 0; i < num; ++i) {
> -        DMAMap needle = {
> -            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
> -            .size = iovec[i].iov_len,
> -        };
> -        Int128 needle_last, map_last;
> -        size_t off;
> +        RAMBlock *rb;
> +        hwaddr gpa;
> +        ram_addr_t offset;
> +        const DMAMap *map;
> +        DMAMap needle;
> +
> +        /*
> +         * Determine if this HVA is backed by guest memory by attempting to
> +         * infer a RAM block from it. If a valid RAM block is returned, the
> +         * VA is backed by guest memory and we can derive the GPA from it.
> +         * Then search the GPA->IOVA tree for the corresponding IOVA.
> +         *
> +         * If the RAM block is invalid, the HVA is likely backed by host-only
> +         * memory. Use the HVA to search the IOVA->HVA tree for the
> +         * corresponding IOVA.
> +         *
> +         * TODO: This additional second lookup is sub-optimal when the HVA
> +         *       is backed by host-only memory. Find optimizations for this
> +         *       (e.g. using an HVA->IOVA tree).
> +         */
> +        rb = qemu_ram_block_from_host(iovec[i].iov_base, false, &offset);
> +        if (rb) {
> +            gpa = rb->offset + offset;
> +
> +            /* Search the GPA->IOVA tree */
> +            needle = (DMAMap) {
> +                .translated_addr = gpa,
> +                .size = iovec[i].iov_len,
> +            };
> +            map = vhost_gpa_tree_find_iova(svq->iova_tree, &needle);
> +        } else {
> +            /* Search the IOVA->HVA tree */
> +            needle = (DMAMap) {
> +                .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
> +                .size = iovec[i].iov_len,
> +            };
> +            map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
> +        }
I think that having this complex conditional here is a problem for
future users of SVQ; see the sketch at the end of this mail for one way
to hide it behind VhostIOVATree.

>
> -        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
>          /*
>           * Map cannot be NULL since iova map contains all guest space and
>           * qemu already has a physical address mapped
> @@ -106,6 +138,8 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>                            needle.translated_addr);
>              return false;
>          }
> +        Int128 needle_last, map_last;
> +        size_t off;
>
>          off = needle.translated_addr - map->translated_addr;
>          addrs[i] = map->iova + off;
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 6702459065..0da0a117dc 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -373,9 +373,9 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>
>          iova = mem_region.iova;
>
> -        /* Add mapping to the IOVA->HVA tree */
> -        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
> -        r = vhost_iova_tree_insert(s->iova_tree, &mem_region);
> +        /* Add mapping to the GPA->IOVA tree */
> +        mem_region.translated_addr = section->offset_within_address_space;
> +        r = vhost_gpa_tree_insert(s->iova_tree, &mem_region);
>          if (unlikely(r != IOVA_OK)) {
>              error_report("Can't add listener region mapping (%d)", r);
>              goto fail_map;
> @@ -394,7 +394,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
>
>  fail_map:
>      if (s->shadow_data) {
> -        vhost_iova_tree_remove(s->iova_tree, mem_region);
> +        vhost_gpa_tree_remove(s->iova_tree, mem_region);
>      }
>
>  fail:
> @@ -448,21 +448,19 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
>
>      if (s->shadow_data) {
>          const DMAMap *result;
> -        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> -            section->offset_within_region +
> -            (iova - section->offset_within_address_space);
>          DMAMap mem_region = {
> -            .translated_addr = (hwaddr)(uintptr_t)vaddr,
> +            .translated_addr = section->offset_within_address_space,
>              .size = int128_get64(llsize) - 1,
>          };
>
> -        result = vhost_iova_tree_find_iova(s->iova_tree, &mem_region);
> +        /* Search the GPA->IOVA tree */
> +        result = vhost_gpa_tree_find_iova(s->iova_tree, &mem_region);
>          if (!result) {
>              /* The memory listener map wasn't mapped */
>              return;
>          }
>          iova = result->iova;
> -        vhost_iova_tree_remove(s->iova_tree, *result);
> +        vhost_gpa_tree_remove(s->iova_tree, *result);
>      }
>      vhost_vdpa_iotlb_batch_begin_once(s);
>      /*
> diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
> index 2a10a7052e..57cfc63d33 100644
> --- a/include/qemu/iova-tree.h
> +++ b/include/qemu/iova-tree.h
> @@ -40,6 +40,15 @@ typedef struct DMAMap {
>  } QEMU_PACKED DMAMap;
>  typedef gboolean (*iova_tree_iterator)(DMAMap *map);
>
> +/**
> + * gpa_tree_new:
> + *
> + * Create a new GPA->IOVA tree.
> + *
> + * Returns: the tree pointer on success, or NULL otherwise.
> + */
> +IOVATree *gpa_tree_new(void);
> +
>  /**
>   * iova_tree_new:
>   *
> @@ -49,6 +58,19 @@ typedef gboolean (*iova_tree_iterator)(DMAMap *map);
>   */
>  IOVATree *iova_tree_new(void);
>
> +/**
> + * gpa_tree_insert:
> + *
> + * @tree: The GPA->IOVA tree we're inserting the mapping to
> + * @map: The GPA->IOVA mapping to insert
> + *
> + * Insert a GPA range to the GPA->IOVA tree. If there are overlapped
> + * ranges, IOVA_ERR_OVERLAP will be returned.
> + *
> + * Return: 0 if success, or < 0 if error.
> + */
> +int gpa_tree_insert(IOVATree *tree, const DMAMap *map);
> +

I'd keep this GPA tree in VhostIOVATree, as other IOVATree users like
the intel-iommu do not use it.
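Something along these lines, for example. This is a rough, untested
sketch just to illustrate: vhost_gpa_tree_cmp() and vhost_gpa_tree_new()
are made-up names, and since IOVATree's internals are private to
util/iova-tree.c, VhostIOVATree would hold a plain GTree member instead
of a new IOVATree flavor:

/* In hw/virtio/vhost-iova-tree.c, so util/iova-tree.c stays untouched */
#include "qemu/osdep.h"
#include "qemu/iova-tree.h"

/* Two keys compare equal iff their GPA ranges overlap */
static gint vhost_gpa_tree_cmp(gconstpointer a, gconstpointer b,
                               gpointer data)
{
    const DMAMap *m1 = a, *m2 = b;

    if (m1->translated_addr > m2->translated_addr + m2->size) {
        return 1;
    }
    if (m1->translated_addr + m1->size < m2->translated_addr) {
        return -1;
    }
    return 0; /* overlapped */
}

/* Called from vhost_iova_tree_new() to fill VhostIOVATree::gpa_map */
static GTree *vhost_gpa_tree_new(void)
{
    /* Keys own the DMAMap allocations, so free them on destroy */
    return g_tree_new_full(vhost_gpa_tree_cmp, NULL, g_free, NULL);
}

That way the GPA->IOVA tree stays an implementation detail of vhost and
include/qemu/iova-tree.h does not grow vhost-specific entry points.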
>  /**
>   * iova_tree_insert:
>   *
> diff --git a/util/iova-tree.c b/util/iova-tree.c
> index 536789797e..e3f50fbf5c 100644
> --- a/util/iova-tree.c
> +++ b/util/iova-tree.c
> @@ -71,6 +71,22 @@ static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
>      return 0;
>  }
>
> +static int gpa_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
> +{
> +    const DMAMap *m1 = a, *m2 = b;
> +
> +    if (m1->translated_addr > m2->translated_addr + m2->size) {
> +        return 1;
> +    }
> +
> +    if (m1->translated_addr + m1->size < m2->translated_addr) {
> +        return -1;
> +    }
> +
> +    /* Overlapped */
> +    return 0;
> +}
> +
>  IOVATree *iova_tree_new(void)
>  {
>      IOVATree *iova_tree = g_new0(IOVATree, 1);
> @@ -81,6 +97,15 @@ IOVATree *iova_tree_new(void)
>      return iova_tree;
>  }
>
> +IOVATree *gpa_tree_new(void)
> +{
> +    IOVATree *gpa_tree = g_new0(IOVATree, 1);
> +
> +    gpa_tree->tree = g_tree_new_full(gpa_tree_compare, NULL, g_free, NULL);
> +
> +    return gpa_tree;
> +}
> +
>  const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
>  {
>      return g_tree_lookup(tree->tree, map);
> @@ -128,6 +153,27 @@ static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range)
>      g_tree_insert(gtree, range, range);
>  }
>
> +int gpa_tree_insert(IOVATree *tree, const DMAMap *map)
> +{
> +    DMAMap *new;
> +
> +    if (map->translated_addr + map->size < map->translated_addr ||
> +        map->perm == IOMMU_NONE) {
> +        return IOVA_ERR_INVALID;
> +    }
> +
> +    /* We don't allow inserting ranges that overlap with existing ones */
> +    if (iova_tree_find(tree, map)) {
> +        return IOVA_ERR_OVERLAP;
> +    }
> +
> +    new = g_new0(DMAMap, 1);
> +    memcpy(new, map, sizeof(*new));
> +    iova_tree_insert_internal(tree->tree, new);
> +
> +    return IOVA_OK;
> +}
> +
>  int iova_tree_insert(IOVATree *tree, const DMAMap *map)
>  {
>      DMAMap *new;
> --
> 2.43.5
>
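And here is the sketch I mentioned above for the conditional in
vhost_svq_translate_addr(): a single VhostIOVATree entry point that
picks the right tree by itself, so SVQ does not need to know what kind
of memory backs a given address. Again untested, and
vhost_iova_tree_find_translated() is a made-up name; it assumes the
gpa_map / iova_taddr_map members as this patch defines them, built in
hw/virtio/vhost-iova-tree.c with "exec/ramblock.h" included for
RAMBlock:

/*
 * Find the IOVA for a translated address, no matter if it is backed by
 * guest memory (GPA) or host-only memory (HVA).
 */
const DMAMap *vhost_iova_tree_find_translated(const VhostIOVATree *tree,
                                              const DMAMap *map)
{
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(
        (void *)(uintptr_t)map->translated_addr, false, &offset);

    if (rb) {
        /* Guest-backed HVA: derive the GPA and use the GPA->IOVA tree */
        DMAMap gpa_map = *map;

        gpa_map.translated_addr = rb->offset + offset;
        return iova_tree_find_iova(tree->gpa_map, &gpa_map);
    }

    /* Host-only memory: fall back to the IOVA->SVQ HVA tree */
    return iova_tree_find_iova(tree->iova_taddr_map, map);
}

vhost_svq_translate_addr() would then reduce to the one call, whatever
we end up naming it:

    map = vhost_iova_tree_find_translated(svq->iova_tree, &needle);

and future SVQ users get the GPA vs. HVA distinction for free.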