The branch main has been updated by bnovkov: URL: https://cgit.FreeBSD.org/src/commit/?id=08c7dd2fbe4fb7ae5cd6943afef04bd4cb350c1f
commit 08c7dd2fbe4fb7ae5cd6943afef04bd4cb350c1f Author: Bojan Novković <bnov...@freebsd.org> AuthorDate: 2024-09-08 15:57:55 +0000 Commit: Bojan Novković <bnov...@freebsd.org> CommitDate: 2025-07-27 16:31:48 +0000 libvmmapi: Add support for setting up and configuring guest NUMA domains This patch reworks libvmmapi to provide support for emulating NUMA domains in guests. More specifically, it reworks 'vm_setup_memory' to set up system memory segments for each guest NUMA domain. An emulated NUMA domain is described by a 'struct vm_mem_domain' in vmmapi.h. Aside from its size in bytes, each domain can be configured to use a specific domainset(9) policy and domain mask. A new function, 'vm_setup_memory_domains', takes an array of struct vm_mem_domain and the array's size. It then proceeds to set up a memory segment for each specified domain using the existing memory mapping scheme. 'vm_setup_memory' keeps its existing signature; callers that pass no domain info through it get a single domain, so the memory setup falls back to the original, non-NUMA behaviour. Differential Revision: https://reviews.freebsd.org/D44566 Reviewed by: markj --- lib/libvmmapi/internal.h | 11 ++- lib/libvmmapi/vmmapi.c | 181 ++++++++++++++++++++++++++++++++++------------- lib/libvmmapi/vmmapi.h | 12 +++- 3 files changed, 145 insertions(+), 59 deletions(-) diff --git a/lib/libvmmapi/internal.h b/lib/libvmmapi/internal.h index aa7b1d8e6a93..4afe1cab3460 100644 --- a/lib/libvmmapi/internal.h +++ b/lib/libvmmapi/internal.h @@ -8,12 +8,7 @@ #define __VMMAPI_INTERNAL_H__ #include <sys/types.h> - -enum { - VM_MEMSEG_LOW, - VM_MEMSEG_HIGH, - VM_MEMSEG_COUNT, -}; +#include <dev/vmm/vmm_mem.h> struct vmctx { int fd; /* device file descriptor */ @@ -21,7 +16,9 @@ struct vmctx { struct { vm_paddr_t base; vm_size_t size; - } memsegs[VM_MEMSEG_COUNT]; + } memsegs[VM_MAX_MEMSEGS]; + size_t lowmem_size; + size_t highmem_size; int memflags; char *baseaddr; char *name; diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index a1a5d56ff8a2..77f0f8f5c581 100644 --- a/lib/libvmmapi/vmmapi.c +++ 
b/lib/libvmmapi/vmmapi.c @@ -28,13 +28,14 @@ #include <sys/param.h> #include <sys/capsicum.h> +#include <sys/cpuset.h> +#include <sys/domainset.h> #include <sys/sysctl.h> #include <sys/ioctl.h> #include <sys/mman.h> #include <sys/linker.h> #include <sys/module.h> #include <sys/_iovec.h> -#include <sys/cpuset.h> #include <capsicum_helpers.h> #include <err.h> @@ -322,8 +323,8 @@ vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, { *guest_baseaddr = ctx->baseaddr; - *lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size; - *highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size; + *lowmem_size = ctx->lowmem_size; + *highmem_size = ctx->highmem_size; return (0); } @@ -379,7 +380,8 @@ cmpseg(size_t len, const char *str, size_t len2, const char *str2) } static int -vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) +vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name, + int ds_policy, domainset_t *ds_mask, size_t ds_size) { struct vm_memseg memseg; size_t n; @@ -407,6 +409,13 @@ vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; + if (ds_mask == NULL) { + memseg.ds_policy = DOMAINSET_POLICY_INVALID; + } else { + memseg.ds_policy = ds_policy; + memseg.ds_mask = ds_mask; + memseg.ds_mask_size = ds_size; + } if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { @@ -442,13 +451,14 @@ vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, } static int -setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) +map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, + size_t segoff, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ - error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); + error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, 
PROT_ALL); if (error) return (error); @@ -464,65 +474,136 @@ setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) return (0); } +/* + * Allocates and maps virtual machine memory segments according + * to the NUMA topology specified by the 'doms' array. + * + * The domains are laid out sequentially in the guest's physical address space. + * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and + * left unmapped. + */ int -vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) +vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms, + struct vm_mem_domain *doms, int ndoms) { - size_t objsize, len; - vm_paddr_t gpa; + size_t low_len, len, totalsize; + struct vm_mem_domain *dom; + struct vm_memseg memseg; char *baseaddr, *ptr; - int error; + int error, i, segid; + vm_paddr_t gpa; + /* Sanity checks. */ assert(vms == VM_MMAP_ALL); - - /* - * If 'memsize' cannot fit entirely in the 'lowmem' segment then create - * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder. - */ - if (memsize > VM_LOWMEM_LIMIT) { - ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT; - ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT; - objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size; - } else { - ctx->memsegs[VM_MEMSEG_LOW].size = memsize; - ctx->memsegs[VM_MEMSEG_HIGH].size = 0; - objsize = memsize; + if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) { + errno = EINVAL; + return (-1); } - error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); - if (error) - return (error); + /* Calculate total memory size. */ + totalsize = 0; + for (i = 0; i < ndoms; i++) + totalsize += doms[i].size; + + if (totalsize > VM_LOWMEM_LIMIT) + totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. 
*/ - len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; + len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); - baseaddr = ptr + VM_MMAP_GUARD_SIZE; - if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) { - gpa = VM_HIGHMEM_BASE; - len = ctx->memsegs[VM_MEMSEG_HIGH].size; - error = setup_memory_segment(ctx, gpa, len, baseaddr); - if (error) - return (error); - } - if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) { - gpa = 0; - len = ctx->memsegs[VM_MEMSEG_LOW].size; - error = setup_memory_segment(ctx, gpa, len, baseaddr); - if (error) - return (error); - } + /* + * Allocate and map memory segments for the virtual machine. + */ + gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE; + ctx->lowmem_size = 0; + ctx->highmem_size = 0; + for (i = 0; i < ndoms; i++) { + segid = VM_SYSMEM + i; + dom = &doms[i]; + + /* + * Check if the memory segment already exists. + * If 'ndoms' is greater than one, refuse to proceed if the + * memseg already exists. If only one domain was requested, use + * the existing segment to preserve the behaviour of the previous + * implementation. + * + * Splitting existing memory segments is tedious and + * error-prone, which is why we don't support NUMA + * domains for bhyveload(8)-loaded VMs. + */ + error = vm_get_memseg(ctx, segid, &len, memseg.name, + sizeof(memseg.name)); + if (error == 0 && len != 0) { + if (ndoms != 1) { + errno = EEXIST; + return (-1); + } else + doms[0].size = len; + } else { + error = vm_alloc_memseg(ctx, segid, dom->size, NULL, + dom->ds_policy, dom->ds_mask, dom->ds_size); + if (error) + return (error); + } + /* + * If a domain is split by VM_LOWMEM_LIMIT then break + * its segment mapping into two parts, one below VM_LOWMEM_LIMIT + * and one above VM_HIGHMEM_BASE. 
+ */ + if (gpa <= VM_LOWMEM_LIMIT && + gpa + dom->size > VM_LOWMEM_LIMIT) { + low_len = VM_LOWMEM_LIMIT - gpa; + error = map_memory_segment(ctx, segid, gpa, low_len, 0, + baseaddr); + if (error) + return (error); + ctx->lowmem_size = VM_LOWMEM_LIMIT; + /* Map the remainder. */ + gpa = VM_HIGHMEM_BASE; + len = dom->size - low_len; + error = map_memory_segment(ctx, segid, gpa, len, + low_len, baseaddr); + if (error) + return (error); + } else { + len = dom->size; + error = map_memory_segment(ctx, segid, gpa, len, 0, + baseaddr); + if (error) + return (error); + } + if (gpa <= VM_LOWMEM_LIMIT) + ctx->lowmem_size += len; + else + ctx->highmem_size += len; + gpa += len; + } ctx->baseaddr = baseaddr; return (0); } +int +vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) +{ + struct vm_mem_domain dom0; + + memset(&dom0, 0, sizeof(dom0)); + dom0.ds_policy = DOMAINSET_POLICY_INVALID; + dom0.size = memsize; + + return (vm_setup_memory_domains(ctx, vms, &dom0, 1)); +} + /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. 
@@ -535,13 +616,13 @@ vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { vm_size_t lowsize, highsize; - lowsize = ctx->memsegs[VM_MEMSEG_LOW].size; + lowsize = ctx->lowmem_size; if (lowsize > 0) { if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize) return (ctx->baseaddr + gaddr); } - highsize = ctx->memsegs[VM_MEMSEG_HIGH].size; + highsize = ctx->highmem_size; if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) { if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize && gaddr + len <= VM_HIGHMEM_BASE + highsize) @@ -559,12 +640,12 @@ vm_rev_map_gpa(struct vmctx *ctx, void *addr) offaddr = (char *)addr - ctx->baseaddr; - lowsize = ctx->memsegs[VM_MEMSEG_LOW].size; + lowsize = ctx->lowmem_size; if (lowsize > 0) if (offaddr <= lowsize) return (offaddr); - highsize = ctx->memsegs[VM_MEMSEG_HIGH].size; + highsize = ctx->highmem_size; if (highsize > 0) if (offaddr >= VM_HIGHMEM_BASE && offaddr < VM_HIGHMEM_BASE + highsize) @@ -583,8 +664,7 @@ vm_get_name(struct vmctx *ctx) size_t vm_get_lowmem_size(struct vmctx *ctx) { - - return (ctx->memsegs[VM_MEMSEG_LOW].size); + return (ctx->lowmem_size); } vm_paddr_t @@ -597,8 +677,7 @@ vm_get_highmem_base(struct vmctx *ctx __unused) size_t vm_get_highmem_size(struct vmctx *ctx) { - - return (ctx->memsegs[VM_MEMSEG_HIGH].size); + return (ctx->highmem_size); } void * @@ -616,7 +695,7 @@ vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) goto done; } - error = vm_alloc_memseg(ctx, segid, len, name); + error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0); if (error) goto done; diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 2072c0105e37..b637c45d1eff 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -64,6 +64,14 @@ enum vm_mmap_style { #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ +/* Memory size and allocation policy for a single NUMA domain. 
*/ +struct vm_mem_domain { + size_t size; + int ds_policy; + domainset_t *ds_mask; + size_t ds_size; +}; + __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. @@ -115,7 +123,9 @@ struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid); void vm_vcpu_close(struct vcpu *vcpu); int vcpu_id(struct vcpu *vcpu); int vm_parse_memsize(const char *optarg, size_t *memsize); -int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); +int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); +int vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style s, + struct vm_mem_domain *doms, int ndoms); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr);