The branch main has been updated by bnovkov:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=a4197ea477771d525c2970d0c42acab727e43f16

commit a4197ea477771d525c2970d0c42acab727e43f16
Author:     Bojan Novković <bnov...@freebsd.org>
AuthorDate: 2024-09-08 16:04:33 +0000
Commit:     Bojan Novković <bnov...@freebsd.org>
CommitDate: 2025-07-27 16:31:48 +0000

    vmm: Add support for guest NUMA emulation
    
    This change adds the necessary kernelspace bits required for
    supporting NUMA domains in bhyve VMs.
    
    The layout of system memory segments and how they're created has
    been reworked. Each guest NUMA domain will now have its own memory
    segment. Furthermore, this change allows users to tweak the domain's
    backing vm_object domainset(9) policy.
    
    Reviewed by:    markj
    Differential Revision:  https://reviews.freebsd.org/D44565
---
 lib/libvmmapi/Makefile      |   2 +-
 lib/libvmmapi/vmmapi.h      |  14 +----
 sys/amd64/include/vmm_dev.h |   7 ++-
 sys/arm64/include/vmm_dev.h |   5 ++
 sys/dev/vmm/vmm_dev.c       | 129 ++++++++++++++++++++++++++++++++++++++------
 sys/dev/vmm/vmm_mem.c       |  15 +++++-
 sys/dev/vmm/vmm_mem.h       |  27 ++++++++--
 sys/riscv/include/vmm_dev.h |   5 ++
 8 files changed, 168 insertions(+), 36 deletions(-)

diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile
index 1866c8fa5e7c..6dd0deeaa9c0 100644
--- a/lib/libvmmapi/Makefile
+++ b/lib/libvmmapi/Makefile
@@ -1,6 +1,6 @@
 PACKAGE=lib${LIB}
 LIB=   vmmapi
-SHLIB_MAJOR=   6
+SHLIB_MAJOR=   7
 SRCS=  vmmapi.c
 INCS=  vmmapi.h
 
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 440064ad13cb..2072c0105e37 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -40,7 +40,7 @@
  * API version for out-of-tree consumers like grub-bhyve for making compile
  * time decisions.
  */
-#define        VMMAPI_VERSION  0200    /* 2 digit major followed by 2 digit minor */
+#define        VMMAPI_VERSION  0300    /* 2 digit major followed by 2 digit minor */
 
 struct iovec;
 struct vcpu;
@@ -64,18 +64,6 @@ enum vm_mmap_style {
 #define        VM_MEM_F_INCORE 0x01    /* include guest memory in core file */
 #define        VM_MEM_F_WIRED  0x02    /* guest memory is wired */
 
-/*
- * Identifiers for memory segments:
- * - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
- * - the remaining identifiers can be used to create devmem segments.
- */
-enum {
-       VM_SYSMEM,
-       VM_BOOTROM,
-       VM_FRAMEBUFFER,
-       VM_PCIROM,
-};
-
 __BEGIN_DECLS
 /*
  * Get the length and name of the memory segment identified by 'segid'.
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 1f86538ce5f3..441330fd57b8 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -29,6 +29,8 @@
 #ifndef        _VMM_DEV_H_
 #define        _VMM_DEV_H_
 
+#include <sys/domainset.h>
+
 #include <machine/vmm.h>
 #include <machine/vmm_snapshot.h>
 
@@ -52,7 +54,10 @@ struct vm_munmap {
 struct vm_memseg {
        int             segid;
        size_t          len;
-       char            name[VM_MAX_SUFFIXLEN + 1];
+       char            name[VM_MAX_SUFFIXLEN + 1];
+       domainset_t     *ds_mask;
+       size_t          ds_mask_size;
+       int             ds_policy;
 };
 
 struct vm_register {
diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h
index 938bea47c7f8..219f1116c728 100644
--- a/sys/arm64/include/vmm_dev.h
+++ b/sys/arm64/include/vmm_dev.h
@@ -27,6 +27,8 @@
 #ifndef        _VMM_DEV_H_
 #define        _VMM_DEV_H_
 
+#include <sys/domainset.h>
+
 #include <machine/vmm.h>
 
 struct vm_memmap {
@@ -49,6 +51,9 @@ struct vm_memseg {
        int             segid;
        size_t          len;
        char            name[VM_MAX_SUFFIXLEN + 1];
+       domainset_t     *ds_mask;
+       size_t          ds_mask_size;
+       int             ds_policy;
 };
 
 struct vm_register {
diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c
index 2e2ae0a162d8..9f2b009d02ec 100644
--- a/sys/dev/vmm/vmm_dev.c
+++ b/sys/dev/vmm/vmm_dev.c
@@ -30,7 +30,8 @@
 #include <dev/vmm/vmm_mem.h>
 #include <dev/vmm/vmm_stat.h>
 
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#ifdef COMPAT_FREEBSD12
 struct vm_memseg_12 {
        int             segid;
        size_t          len;
@@ -42,7 +43,22 @@ _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI");
        _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12)
 #define        VM_GET_MEMSEG_12        \
        _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12)
-#endif
+#endif /* COMPAT_FREEBSD12 */
+#ifdef COMPAT_FREEBSD14
+struct vm_memseg_14 {
+       int             segid;
+       size_t          len;
+       char            name[VM_MAX_SUFFIXLEN + 1];
+};
+_Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16),
+    "COMPAT_FREEBSD14 ABI");
+
+#define        VM_ALLOC_MEMSEG_14      \
+       _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14)
+#define        VM_GET_MEMSEG_14        \
+       _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14)
+#endif /* COMPAT_FREEBSD14 */
+#endif /* __amd64__ */
 
 struct devmem_softc {
        int     segid;
@@ -257,7 +273,8 @@ get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
 }
 
 static int
-alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
+alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len,
+    struct domainset *domainset)
 {
        char *name;
        int error;
@@ -278,8 +295,7 @@ alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
                if (error)
                        goto done;
        }
-
-       error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
+       error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset);
        if (error)
                goto done;
 
@@ -295,6 +311,20 @@ done:
        return (error);
 }
 
+#if defined(__amd64__) && \
+    (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12))
+/*
+ * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts.
+ */
+static void
+adjust_segid(struct vm_memseg *mseg)
+{
+       if (mseg->segid != VM_SYSMEM) {
+               mseg->segid += (VM_BOOTROM - 1);
+       }
+}
+#endif
+
 static int
 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
     uint64_t *regval)
@@ -353,10 +383,16 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = {
        VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU),
        VMMDEV_IOCTL(VM_STAT_DESC, 0),
 
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#ifdef COMPAT_FREEBSD12
        VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12,
            VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
 #endif
+#ifdef COMPAT_FREEBSD14
+       VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14,
+           VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
+#endif
+#endif /* __amd64__ */
        VMMDEV_IOCTL(VM_ALLOC_MEMSEG,
            VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
        VMMDEV_IOCTL(VM_MMAP_MEMSEG,
@@ -366,9 +402,14 @@ static const struct vmmdev_ioctl vmmdev_ioctls[] = {
        VMMDEV_IOCTL(VM_REINIT,
            VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS),
 
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#if defined(COMPAT_FREEBSD12)
        VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS),
 #endif
+#ifdef COMPAT_FREEBSD14
+       VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS),
+#endif
+#endif /* __amd64__ */
        VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS),
        VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS),
 
@@ -388,6 +429,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
        struct vmmdev_softc *sc;
        struct vcpu *vcpu;
        const struct vmmdev_ioctl *ioctl;
+       struct vm_memseg *mseg;
        int error, vcpuid;
 
        sc = vmmdev_lookup2(cdev);
@@ -499,20 +541,77 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
                error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
                break;
        }
-#if defined(__amd64__) && defined(COMPAT_FREEBSD12)
+#ifdef __amd64__
+#ifdef COMPAT_FREEBSD12
        case VM_ALLOC_MEMSEG_12:
-               error = alloc_memseg(sc, (struct vm_memseg *)data,
-                   sizeof(((struct vm_memseg_12 *)0)->name));
+               mseg = (struct vm_memseg *)data;
+
+               adjust_segid(mseg);
+               error = alloc_memseg(sc, mseg,
+                   sizeof(((struct vm_memseg_12 *)0)->name), NULL);
                break;
        case VM_GET_MEMSEG_12:
-               error = get_memseg(sc, (struct vm_memseg *)data,
+               mseg = (struct vm_memseg *)data;
+
+               adjust_segid(mseg);
+               error = get_memseg(sc, mseg,
                    sizeof(((struct vm_memseg_12 *)0)->name));
                break;
-#endif
-       case VM_ALLOC_MEMSEG:
-               error = alloc_memseg(sc, (struct vm_memseg *)data,
-                   sizeof(((struct vm_memseg *)0)->name));
+#endif /* COMPAT_FREEBSD12 */
+#ifdef COMPAT_FREEBSD14
+       case VM_ALLOC_MEMSEG_14:
+               mseg = (struct vm_memseg *)data;
+
+               adjust_segid(mseg);
+               error = alloc_memseg(sc, mseg,
+                   sizeof(((struct vm_memseg_14 *)0)->name), NULL);
+               break;
+       case VM_GET_MEMSEG_14:
+               mseg = (struct vm_memseg *)data;
+
+               adjust_segid(mseg);
+               error = get_memseg(sc, mseg,
+                   sizeof(((struct vm_memseg_14 *)0)->name));
+               break;
+#endif /* COMPAT_FREEBSD14 */
+#endif /* __amd64__ */
+       case VM_ALLOC_MEMSEG: {
+               domainset_t *mask;
+               struct domainset *domainset, domain;
+
+               domainset = NULL;
+               mseg = (struct vm_memseg *)data;
+               if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) {
+                       if (mseg->ds_mask_size < sizeof(domainset_t) ||
+                           mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) {
+                               error = ERANGE;
+                               break;
+                       }
+                       memset(&domain, 0, sizeof(domain));
+                       mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK);
+                       error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size);
+                       if (error) {
+                               free(mask, M_VMMDEV);
+                               break;
+                       }
+                       error = domainset_populate(&domain, mask, mseg->ds_policy,
+                           mseg->ds_mask_size);
+                       if (error) {
+                               free(mask, M_VMMDEV);
+                               break;
+                       }
+                       domainset = domainset_create(&domain);
+                       if (domainset == NULL) {
+                               error = EINVAL;
+                               free(mask, M_VMMDEV);
+                               break;
+                       }
+                       free(mask, M_VMMDEV);
+               }
+               error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset);
+
                break;
+       }
        case VM_GET_MEMSEG:
                error = get_memseg(sc, (struct vm_memseg *)data,
                    sizeof(((struct vm_memseg *)0)->name));
diff --git a/sys/dev/vmm/vmm_mem.c b/sys/dev/vmm/vmm_mem.c
index c61ae2d44b96..be59e37de33d 100644
--- a/sys/dev/vmm/vmm_mem.c
+++ b/sys/dev/vmm/vmm_mem.c
@@ -7,6 +7,7 @@
 
 #include <sys/types.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/sx.h>
 #include <sys/systm.h>
 
@@ -156,10 +157,11 @@ vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
 }
 
 int
-vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
+vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem,
+    struct domainset *obj_domainset)
 {
-       struct vm_mem *mem;
        struct vm_mem_seg *seg;
+       struct vm_mem *mem;
        vm_object_t obj;
 
        mem = vm_mem(vm);
@@ -179,13 +181,22 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
                        return (EINVAL);
        }
 
+       /*
+        * When given an impossible policy, signal an
+        * error to the user.
+        */
+       if (obj_domainset != NULL && domainset_empty_vm(obj_domainset))
+               return (EINVAL);
        obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT);
        if (obj == NULL)
                return (ENOMEM);
 
        seg->len = len;
        seg->object = obj;
+       if (obj_domainset != NULL)
+               seg->object->domain.dr_policy = obj_domainset;
        seg->sysmem = sysmem;
+
        return (0);
 }
 
diff --git a/sys/dev/vmm/vmm_mem.h b/sys/dev/vmm/vmm_mem.h
index a4be4c1c57aa..856470cf2590 100644
--- a/sys/dev/vmm/vmm_mem.h
+++ b/sys/dev/vmm/vmm_mem.h
@@ -8,6 +8,27 @@
 #ifndef _DEV_VMM_MEM_H_
 #define        _DEV_VMM_MEM_H_
 
+/* Maximum number of NUMA domains in a guest. */
+#define VM_MAXMEMDOM 8
+#define VM_MAXSYSMEM VM_MAXMEMDOM
+
+/*
+ * Identifiers for memory segments.
+ * Each guest NUMA domain is represented by a single system
+ * memory segment from [VM_SYSMEM, VM_MAXSYSMEM).
+ * The remaining identifiers can be used to create devmem segments.
+ */
+enum {
+        VM_SYSMEM = 0,
+        VM_BOOTROM = VM_MAXSYSMEM,
+        VM_FRAMEBUFFER,
+        VM_PCIROM,
+        VM_MEMSEG_END
+};
+
+#define        VM_MAX_MEMSEGS  VM_MEMSEG_END
+#define        VM_MAX_MEMMAPS  (VM_MAX_MEMSEGS * 2)
+
 #ifdef _KERNEL
 
 #include <sys/types.h>
@@ -31,9 +52,6 @@ struct vm_mem_map {
        int             flags;
 };
 
-#define        VM_MAX_MEMSEGS  4
-#define        VM_MAX_MEMMAPS  8
-
 struct vm_mem {
        struct vm_mem_map       mem_maps[VM_MAX_MEMMAPS];
        struct vm_mem_seg       mem_segs[VM_MAX_MEMSEGS];
@@ -55,7 +73,8 @@ void vm_assert_memseg_xlocked(struct vm *vm);
 int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
     size_t len, int prot, int flags);
 int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len);
-int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
+int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem,
+    struct domainset *obj_domainset);
 void vm_free_memseg(struct vm *vm, int ident);
 
 /*
diff --git a/sys/riscv/include/vmm_dev.h b/sys/riscv/include/vmm_dev.h
index 856ff0778b95..4d30d5a1c35b 100644
--- a/sys/riscv/include/vmm_dev.h
+++ b/sys/riscv/include/vmm_dev.h
@@ -34,6 +34,8 @@
 #ifndef        _VMM_DEV_H_
 #define        _VMM_DEV_H_
 
+#include <sys/domainset.h>
+
 #include <machine/vmm.h>
 
 struct vm_memmap {
@@ -56,6 +58,9 @@ struct vm_memseg {
        int             segid;
        size_t          len;
        char            name[VM_MAX_SUFFIXLEN + 1];
+       domainset_t     *ds_mask;
+       size_t          ds_mask_size;
+       int             ds_policy;
 };
 
 struct vm_register {

Reply via email to