The branch main has been updated by bnovkov:

URL: https://cgit.FreeBSD.org/src/commit/?id=a4197ea477771d525c2970d0c42acab727e43f16

commit a4197ea477771d525c2970d0c42acab727e43f16
Author:     Bojan Novković <bnov...@freebsd.org>
AuthorDate: 2024-09-08 16:04:33 +0000
Commit:     Bojan Novković <bnov...@freebsd.org>
CommitDate: 2025-07-27 16:31:48 +0000

    vmm: Add support for guest NUMA emulation

    This change adds the necessary kernelspace bits required for
    supporting NUMA domains in bhyve VMs.

    The layout of system memory segments and how they're created has
    been reworked. Each guest NUMA domain will now have its own memory
    segment. Furthermore, this change allows users to tweak the domain's
    backing vm_object domainset(9) policy.

    Reviewed by:    markj
    Differential Revision:  https://reviews.freebsd.org/D44565
---
 lib/libvmmapi/Makefile      |   2 +-
 lib/libvmmapi/vmmapi.h      |  14 +----
 sys/amd64/include/vmm_dev.h |   7 ++-
 sys/arm64/include/vmm_dev.h |   5 ++
 sys/dev/vmm/vmm_dev.c       | 129 ++++++++++++++++++++++++++++++++++++++------
 sys/dev/vmm/vmm_mem.c       |  15 +++++-
 sys/dev/vmm/vmm_mem.h       |  27 ++++++++--
 sys/riscv/include/vmm_dev.h |   5 ++
 8 files changed, 168 insertions(+), 36 deletions(-)

diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile
index 1866c8fa5e7c..6dd0deeaa9c0 100644
--- a/lib/libvmmapi/Makefile
+++ b/lib/libvmmapi/Makefile
@@ -1,6 +1,6 @@
 PACKAGE=lib${LIB}
 LIB= vmmapi
-SHLIB_MAJOR= 6
+SHLIB_MAJOR= 7
 SRCS= vmmapi.c
 INCS= vmmapi.h
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 440064ad13cb..2072c0105e37 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -40,7 +40,7 @@
  * API version for out-of-tree consumers like grub-bhyve for making compile
  * time decisions.
*/ -#define VMMAPI_VERSION 0200 /* 2 digit major followed by 2 digit minor */ +#define VMMAPI_VERSION 0300 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vcpu; @@ -64,18 +64,6 @@ enum vm_mmap_style { #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ -/* - * Identifiers for memory segments: - * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. - * - the remaining identifiers can be used to create devmem segments. - */ -enum { - VM_SYSMEM, - VM_BOOTROM, - VM_FRAMEBUFFER, - VM_PCIROM, -}; - __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 1f86538ce5f3..441330fd57b8 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -29,6 +29,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +#include <sys/domainset.h> + #include <machine/vmm.h> #include <machine/vmm_snapshot.h> @@ -52,7 +54,10 @@ struct vm_munmap { struct vm_memseg { int segid; size_t len; - char name[VM_MAX_SUFFIXLEN + 1]; + char name[VM_MAX_SUFFIXLEN + 1]; + domainset_t *ds_mask; + size_t ds_mask_size; + int ds_policy; }; struct vm_register { diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h index 938bea47c7f8..219f1116c728 100644 --- a/sys/arm64/include/vmm_dev.h +++ b/sys/arm64/include/vmm_dev.h @@ -27,6 +27,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +#include <sys/domainset.h> + #include <machine/vmm.h> struct vm_memmap { @@ -49,6 +51,9 @@ struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; + domainset_t *ds_mask; + size_t ds_mask_size; + int ds_policy; }; struct vm_register { diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 2e2ae0a162d8..9f2b009d02ec 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -30,7 +30,8 @@ #include <dev/vmm/vmm_mem.h> #include <dev/vmm/vmm_stat.h> -#if 
defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#ifdef COMPAT_FREEBSD12 struct vm_memseg_12 { int segid; size_t len; @@ -42,7 +43,22 @@ _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI"); _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12) #define VM_GET_MEMSEG_12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12) -#endif +#endif /* COMPAT_FREEBSD12 */ +#ifdef COMPAT_FREEBSD14 +struct vm_memseg_14 { + int segid; + size_t len; + char name[VM_MAX_SUFFIXLEN + 1]; +}; +_Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16), + "COMPAT_FREEBSD14 ABI"); + +#define VM_ALLOC_MEMSEG_14 \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14) +#define VM_GET_MEMSEG_14 \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14) +#endif /* COMPAT_FREEBSD14 */ +#endif /* __amd64__ */ struct devmem_softc { int segid; @@ -257,7 +273,8 @@ get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) } static int -alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) +alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len, + struct domainset *domainset) { char *name; int error; @@ -278,8 +295,7 @@ alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) if (error) goto done; } - - error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); + error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset); if (error) goto done; @@ -295,6 +311,20 @@ done: return (error); } +#if defined(__amd64__) && \ + (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12)) +/* + * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts. 
+ */ +static void +adjust_segid(struct vm_memseg *mseg) +{ + if (mseg->segid != VM_SYSMEM) { + mseg->segid += (VM_BOOTROM - 1); + } +} +#endif + static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) @@ -353,10 +383,16 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STAT_DESC, 0), -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#ifdef COMPAT_FREEBSD12 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif +#ifdef COMPAT_FREEBSD14 + VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14, + VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), +#endif +#endif /* __amd64__ */ VMMDEV_IOCTL(VM_ALLOC_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MMAP_MEMSEG, @@ -366,9 +402,14 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_REINIT, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#if defined(COMPAT_FREEBSD12) VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif +#ifdef COMPAT_FREEBSD14 + VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS), +#endif +#endif /* __amd64__ */ VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS), @@ -388,6 +429,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vmmdev_softc *sc; struct vcpu *vcpu; const struct vmmdev_ioctl *ioctl; + struct vm_memseg *mseg; int error, vcpuid; sc = vmmdev_lookup2(cdev); @@ -499,20 +541,77 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; } -#if defined(__amd64__) && defined(COMPAT_FREEBSD12) +#ifdef __amd64__ +#ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_12: - error = alloc_memseg(sc, (struct vm_memseg *)data, - 
sizeof(((struct vm_memseg_12 *)0)->name)); + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = alloc_memseg(sc, mseg, + sizeof(((struct vm_memseg_12 *)0)->name), NULL); break; case VM_GET_MEMSEG_12: - error = get_memseg(sc, (struct vm_memseg *)data, + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name)); break; -#endif - case VM_ALLOC_MEMSEG: - error = alloc_memseg(sc, (struct vm_memseg *)data, - sizeof(((struct vm_memseg *)0)->name)); +#endif /* COMPAT_FREEBSD12 */ +#ifdef COMPAT_FREEBSD14 + case VM_ALLOC_MEMSEG_14: + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = alloc_memseg(sc, mseg, + sizeof(((struct vm_memseg_14 *)0)->name), NULL); + break; + case VM_GET_MEMSEG_14: + mseg = (struct vm_memseg *)data; + + adjust_segid(mseg); + error = get_memseg(sc, mseg, + sizeof(((struct vm_memseg_14 *)0)->name)); + break; +#endif /* COMPAT_FREEBSD14 */ +#endif /* __amd64__ */ + case VM_ALLOC_MEMSEG: { + domainset_t *mask; + struct domainset *domainset, domain; + + domainset = NULL; + mseg = (struct vm_memseg *)data; + if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) { + if (mseg->ds_mask_size < sizeof(domainset_t) || + mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) { + error = ERANGE; + break; + } + memset(&domain, 0, sizeof(domain)); + mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK); + error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size); + if (error) { + free(mask, M_VMMDEV); + break; + } + error = domainset_populate(&domain, mask, mseg->ds_policy, + mseg->ds_mask_size); + if (error) { + free(mask, M_VMMDEV); + break; + } + domainset = domainset_create(&domain); + if (domainset == NULL) { + error = EINVAL; + free(mask, M_VMMDEV); + break; + } + free(mask, M_VMMDEV); + } + error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset); + break; + } case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, 
sizeof(((struct vm_memseg *)0)->name)); diff --git a/sys/dev/vmm/vmm_mem.c b/sys/dev/vmm/vmm_mem.c index c61ae2d44b96..be59e37de33d 100644 --- a/sys/dev/vmm/vmm_mem.c +++ b/sys/dev/vmm/vmm_mem.c @@ -7,6 +7,7 @@ #include <sys/types.h> #include <sys/lock.h> +#include <sys/malloc.h> #include <sys/sx.h> #include <sys/systm.h> @@ -156,10 +157,11 @@ vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) } int -vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem, + struct domainset *obj_domainset) { - struct vm_mem *mem; struct vm_mem_seg *seg; + struct vm_mem *mem; vm_object_t obj; mem = vm_mem(vm); @@ -179,13 +181,22 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) return (EINVAL); } + /* + * When given an impossible policy, signal an + * error to the user. + */ + if (obj_domainset != NULL && domainset_empty_vm(obj_domainset)) + return (EINVAL); obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; + if (obj_domainset != NULL) + seg->object->domain.dr_policy = obj_domainset; seg->sysmem = sysmem; + return (0); } diff --git a/sys/dev/vmm/vmm_mem.h b/sys/dev/vmm/vmm_mem.h index a4be4c1c57aa..856470cf2590 100644 --- a/sys/dev/vmm/vmm_mem.h +++ b/sys/dev/vmm/vmm_mem.h @@ -8,6 +8,27 @@ #ifndef _DEV_VMM_MEM_H_ #define _DEV_VMM_MEM_H_ +/* Maximum number of NUMA domains in a guest. */ +#define VM_MAXMEMDOM 8 +#define VM_MAXSYSMEM VM_MAXMEMDOM + +/* + * Identifiers for memory segments. + * Each guest NUMA domain is represented by a single system + * memory segment from [VM_SYSMEM, VM_MAXSYSMEM). + * The remaining identifiers can be used to create devmem segments. 
+ */ +enum { + VM_SYSMEM = 0, + VM_BOOTROM = VM_MAXSYSMEM, + VM_FRAMEBUFFER, + VM_PCIROM, + VM_MEMSEG_END +}; + +#define VM_MAX_MEMSEGS VM_MEMSEG_END +#define VM_MAX_MEMMAPS (VM_MAX_MEMSEGS * 2) + #ifdef _KERNEL #include <sys/types.h> @@ -31,9 +52,6 @@ struct vm_mem_map { int flags; }; -#define VM_MAX_MEMSEGS 4 -#define VM_MAX_MEMMAPS 8 - struct vm_mem { struct vm_mem_map mem_maps[VM_MAX_MEMMAPS]; struct vm_mem_seg mem_segs[VM_MAX_MEMSEGS]; @@ -55,7 +73,8 @@ void vm_assert_memseg_xlocked(struct vm *vm); int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); -int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem, + struct domainset *obj_domainset); void vm_free_memseg(struct vm *vm, int ident); /* diff --git a/sys/riscv/include/vmm_dev.h b/sys/riscv/include/vmm_dev.h index 856ff0778b95..4d30d5a1c35b 100644 --- a/sys/riscv/include/vmm_dev.h +++ b/sys/riscv/include/vmm_dev.h @@ -34,6 +34,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +#include <sys/domainset.h> + #include <machine/vmm.h> struct vm_memmap { @@ -56,6 +58,9 @@ struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; + domainset_t *ds_mask; + size_t ds_mask_size; + int ds_policy; }; struct vm_register {