Hi Mike,

Thanks for  looking into this issue.

On 2/23/23 05:35, Mike Pattrick wrote:
The arguments passed to madvise should be aligned to the alignment of
the backing memory. Now we keep track of each regions alignment and use
then when setting coredump preferences. To facilitate this, a new member
was added to rte_vhost_mem_region. A new function was added to easily
translate memory address back to region alignment. Unneeded calls to
madvise were reduced, as the cache removal case should already be
covered by the cache insertion case. The previously inline function
mem_set_dump was removed from a header file and made not inline.

Fixes: 338ad77c9ed3 ("vhost: exclude VM hugepages from coredumps")

Signed-off-by: Mike Pattrick <m...@redhat.com>
---
Since v1:
  - Corrected a cast for 32bit compiles
---
  lib/vhost/iotlb.c      |  9 +++---
  lib/vhost/rte_vhost.h  |  1 +
  lib/vhost/vhost.h      | 12 ++------
  lib/vhost/vhost_user.c | 63 +++++++++++++++++++++++++++++++++++-------
  4 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/lib/vhost/iotlb.c b/lib/vhost/iotlb.c
index a0b8fd7302..5293507b63 100644
--- a/lib/vhost/iotlb.c
+++ b/lib/vhost/iotlb.c
@@ -149,7 +149,6 @@ vhost_user_iotlb_cache_remove_all(struct vhost_virtqueue 
*vq)
        rte_rwlock_write_lock(&vq->iotlb_lock);
RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
-               mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);

Hmm, it should have been called with enable=false here since we are
removing the entry from the IOTLB cache. It should be kept in order to
"DONTDUMP" pages evicted from the cache.

                TAILQ_REMOVE(&vq->iotlb_list, node, next);
                vhost_user_iotlb_pool_put(vq, node);
        }
@@ -171,7 +170,6 @@ vhost_user_iotlb_cache_random_evict(struct vhost_virtqueue 
*vq)
RTE_TAILQ_FOREACH_SAFE(node, &vq->iotlb_list, next, temp_node) {
                if (!entry_idx) {
-                       mem_set_dump((void *)(uintptr_t)node->uaddr, 
node->size, true);

Same here.

                        TAILQ_REMOVE(&vq->iotlb_list, node, next);
                        vhost_user_iotlb_pool_put(vq, node);
                        vq->iotlb_cache_nr--;
@@ -224,14 +222,16 @@ vhost_user_iotlb_cache_insert(struct virtio_net *dev, 
struct vhost_virtqueue *vq
                        vhost_user_iotlb_pool_put(vq, new_node);
                        goto unlock;
                } else if (node->iova > new_node->iova) {
-                       mem_set_dump((void *)(uintptr_t)node->uaddr, 
node->size, true);
+                       mem_set_dump((void *)(uintptr_t)new_node->uaddr, 
new_node->size, true,
+                               hua_to_alignment(dev->mem, (void 
*)(uintptr_t)node->uaddr));
                        TAILQ_INSERT_BEFORE(node, new_node, next);
                        vq->iotlb_cache_nr++;
                        goto unlock;
                }
        }
- mem_set_dump((void *)(uintptr_t)node->uaddr, node->size, true);
+       mem_set_dump((void *)(uintptr_t)new_node->uaddr, new_node->size, true,
+               hua_to_alignment(dev->mem, (void *)(uintptr_t)new_node->uaddr));
        TAILQ_INSERT_TAIL(&vq->iotlb_list, new_node, next);
        vq->iotlb_cache_nr++;
@@ -259,7 +259,6 @@ vhost_user_iotlb_cache_remove(struct vhost_virtqueue *vq,
                        break;
if (iova < node->iova + node->size) {
-                       mem_set_dump((void *)(uintptr_t)node->uaddr, 
node->size, true);
                        TAILQ_REMOVE(&vq->iotlb_list, node, next);
                        vhost_user_iotlb_pool_put(vq, node);
                        vq->iotlb_cache_nr--;
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index a395843fe9..c5c97ea67e 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -136,6 +136,7 @@ struct rte_vhost_mem_region {
        void     *mmap_addr;
        uint64_t mmap_size;
        int fd;
+       uint64_t alignment;

This is not possible to do this as it breaks the ABI.
You have to store the information somewhere else, or simply call
get_blk_size() in hua_to_alignment() since the fd is not closed.

  };
/**
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 5750f0c005..a2467ba509 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -1009,14 +1009,6 @@ mbuf_is_consumed(struct rte_mbuf *m)
        return true;
  }
-static __rte_always_inline void
-mem_set_dump(__rte_unused void *ptr, __rte_unused size_t size, __rte_unused 
bool enable)
-{
-#ifdef MADV_DONTDUMP
-       if (madvise(ptr, size, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) {
-               rte_log(RTE_LOG_INFO, vhost_config_log_level,
-                       "VHOST_CONFIG: could not set coredump preference 
(%s).\n", strerror(errno));
-       }
-#endif
-}
+uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
+void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
  #endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index d702d082dd..6d09597fbe 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -737,6 +737,40 @@ log_addr_to_gpa(struct virtio_net *dev, struct 
vhost_virtqueue *vq)
        return log_gpa;
  }
+uint64_t
+hua_to_alignment(struct rte_vhost_memory *mem, void *ptr)
+{
+       struct rte_vhost_mem_region *r;
+       uint32_t i;
+       uintptr_t hua = (uintptr_t)ptr;
+
+       for (i = 0; i < mem->nregions; i++) {
+               r = &mem->regions[i];
+               if (hua >= r->host_user_addr &&
+                       hua < r->host_user_addr + r->size) {
+                       return r->alignment;
+               }
+       }
+
+       /* If region isn't found, don't align at all */
+       return 1;
+}
+
+void
+mem_set_dump(void *ptr, size_t size, bool enable, uint64_t pagesz)
+{
+#ifdef MADV_DONTDUMP
+       void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz);
+       uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz);
+       size_t len = end - (uintptr_t)start;
+
+       if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) {
+               rte_log(RTE_LOG_INFO, vhost_config_log_level,
+                       "VHOST_CONFIG: could not set coredump preference 
(%s).\n", strerror(errno));
+       }
+#endif
+}
+
  static void
  translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue 
**pvq)
  {
@@ -767,6 +801,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                        return;
                }
+ mem_set_dump(vq->desc_packed, len, true,
+                       hua_to_alignment(dev->mem, vq->desc_packed));
                numa_realloc(&dev, &vq);
                *pdev = dev;
                *pvq = vq;
@@ -782,6 +818,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                        return;
                }
+ mem_set_dump(vq->driver_event, len, true,
+                       hua_to_alignment(dev->mem, vq->driver_event));
                len = sizeof(struct vring_packed_desc_event);
                vq->device_event = (struct vring_packed_desc_event *)
                                        (uintptr_t)ring_addr_to_vva(dev,
@@ -793,9 +831,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                        return;
                }
- mem_set_dump(vq->desc_packed, len, true);
-               mem_set_dump(vq->driver_event, len, true);
-               mem_set_dump(vq->device_event, len, true);
+               mem_set_dump(vq->device_event, len, true,
+                       hua_to_alignment(dev->mem, vq->device_event));
                vq->access_ok = true;
                return;
        }
@@ -812,6 +849,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                return;
        }
+ mem_set_dump(vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc));
        numa_realloc(&dev, &vq);
        *pdev = dev;
        *pvq = vq;
@@ -827,6 +865,7 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                return;
        }
+ mem_set_dump(vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail));
        len = sizeof(struct vring_used) +
                sizeof(struct vring_used_elem) * vq->size;
        if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
@@ -839,6 +878,8 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                return;
        }
+ mem_set_dump(vq->used, len, true, hua_to_alignment(dev->mem, vq->used));
+
        if (vq->last_used_idx != vq->used->idx) {
                VHOST_LOG_CONFIG(dev->ifname, WARNING,
                        "last_used_idx (%u) and vq->used->idx (%u) 
mismatches;\n",
@@ -849,9 +890,6 @@ translate_ring_addresses(struct virtio_net **pdev, struct 
vhost_virtqueue **pvq)
                        "some packets maybe resent for Tx and dropped for 
Rx\n");
        }
- mem_set_dump(vq->desc, len, true);
-       mem_set_dump(vq->avail, len, true);
-       mem_set_dump(vq->used, len, true);
        vq->access_ok = true;
VHOST_LOG_CONFIG(dev->ifname, DEBUG, "mapped address desc: %p\n", vq->desc);
@@ -1230,7 +1268,8 @@ vhost_user_mmap_region(struct virtio_net *dev,
        region->mmap_addr = mmap_addr;
        region->mmap_size = mmap_size;
        region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
-       mem_set_dump(mmap_addr, mmap_size, false);
+       region->alignment = alignment;
+       mem_set_dump(mmap_addr, mmap_size, false, alignment);
if (dev->async_copy) {
                if (add_guest_pages(dev, region, alignment) < 0) {
@@ -1535,7 +1574,6 @@ inflight_mem_alloc(struct virtio_net *dev, const char 
*name, size_t size, int *f
                return NULL;
        }
- mem_set_dump(ptr, size, false);
        *fd = mfd;
        return ptr;
  }
@@ -1566,6 +1604,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
        uint64_t pervq_inflight_size, mmap_size;
        uint16_t num_queues, queue_size;
        struct virtio_net *dev = *pdev;
+       uint64_t alignment;
        int fd, i, j;
        int numa_node = SOCKET_ID_ANY;
        void *addr;
@@ -1628,6 +1667,8 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
                dev->inflight_info->fd = -1;
        }
+ alignment = get_blk_size(fd);
+       mem_set_dump(addr, mmap_size, false, alignment);
        dev->inflight_info->addr = addr;
        dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = 
mmap_size;
        dev->inflight_info->fd = ctx->fds[0] = fd;
@@ -1744,10 +1785,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev,
                dev->inflight_info->fd = -1;
        }
- mem_set_dump(addr, mmap_size, false);
        dev->inflight_info->fd = fd;
        dev->inflight_info->addr = addr;
        dev->inflight_info->size = mmap_size;
+       mem_set_dump(addr, mmap_size, false, get_blk_size(fd));
for (i = 0; i < num_queues; i++) {
                vq = dev->virtqueue[i];
@@ -2242,6 +2283,7 @@ vhost_user_set_log_base(struct virtio_net **pdev,
        struct virtio_net *dev = *pdev;
        int fd = ctx->fds[0];
        uint64_t size, off;
+       uint64_t alignment;
        void *addr;
        uint32_t i;
@@ -2280,6 +2322,7 @@ vhost_user_set_log_base(struct virtio_net **pdev,
         * fail when offset is not page size aligned.
         */
        addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       alignment = get_blk_size(fd);
        close(fd);
        if (addr == MAP_FAILED) {
                VHOST_LOG_CONFIG(dev->ifname, ERR, "mmap log base failed!\n");
@@ -2296,7 +2339,7 @@ vhost_user_set_log_base(struct virtio_net **pdev,
        dev->log_addr = (uint64_t)(uintptr_t)addr;
        dev->log_base = dev->log_addr + off;
        dev->log_size = size;
-       mem_set_dump(addr, size, false);
+       mem_set_dump(addr, size + off, false, alignment);
for (i = 0; i < dev->nr_vring; i++) {
                struct vhost_virtqueue *vq = dev->virtqueue[i];

Reply via email to