On Sun, 30 Jun 2024 12:40:24 -0700 Steve Sistare <steven.sist...@oracle.com> wrote:
> Allocate anonymous memory using mmap MAP_ANON or memfd_create depending > on the value of the anon-alloc machine property. This affects > memory-backend-ram objects, guest RAM created with the global -m option > but without an associated memory-backend object and without the -mem-path > option nowadays, all machines were converted to use memory backend for VM RAM. so -m option implicitly creates memory-backend object, which will be either MEMORY_BACKEND_FILE if -mem-path present or MEMORY_BACKEND_RAM otherwise. > To access the same memory in the old and new QEMU processes, the memory > must be mapped shared. Therefore, the implementation always sets > RAM_SHARED if alloc-anon=memfd, except for memory-backend-ram, where the > user must explicitly specify the share option. In lieu of defining a new so statement at the top that memory-backend-ram is affected is not really valid? > RAM flag, at the lowest level the implementation uses RAM_SHARED with fd=-1 > as the condition for calling memfd_create. In general I do dislike adding yet another option that will affect guest RAM allocation (memory-backends should be sufficient). However I do see that you need memfd for device memory (vram, roms, ...). Can we just use memfd/shared unconditionally for those and avoid introducing a new confusing option? > Signed-off-by: Steve Sistare <steven.sist...@oracle.com> > --- > hw/core/machine.c | 24 ++++++++++++++++++++++++ > include/hw/boards.h | 1 + > qapi/machine.json | 14 ++++++++++++++ > qemu-options.hx | 13 +++++++++++++ > system/memory.c | 12 +++++++++--- > system/physmem.c | 38 +++++++++++++++++++++++++++++++++++++- > system/trace-events | 3 +++ > 7 files changed, 101 insertions(+), 4 deletions(-) > > diff --git a/hw/core/machine.c b/hw/core/machine.c > index 655d75c..7ca2ad0 100644 > --- a/hw/core/machine.c > +++ b/hw/core/machine.c > @@ -454,6 +454,20 @@ static void machine_set_mem_merge(Object *obj, bool > value, Error **errp) > ms->mem_merge = value; > } > > +static int machine_get_anon_alloc(Object *obj, Error **errp) > +{ > + MachineState *ms = MACHINE(obj); > + > + return ms->anon_alloc; > +} > + > +static void machine_set_anon_alloc(Object *obj, int value, Error **errp) > +{ > + MachineState *ms = MACHINE(obj); > + > + ms->anon_alloc = value; > +} > + > static bool machine_get_usb(Object *obj, Error **errp) > { > MachineState *ms = MACHINE(obj); > @@ -1066,6 +1080,11 @@ static void machine_class_init(ObjectClass *oc, void > *data) > object_class_property_set_description(oc, "mem-merge", > "Enable/disable memory merge support"); > > + object_class_property_add_enum(oc, "anon-alloc", "AnonAllocOption", > + &AnonAllocOption_lookup, > + machine_get_anon_alloc, > + machine_set_anon_alloc); > + > object_class_property_add_bool(oc, "usb", > machine_get_usb, machine_set_usb); > object_class_property_set_description(oc, "usb", > @@ -1416,6 +1435,11 @@ static bool create_default_memdev(MachineState *ms, > const char *path, Error **er > if (!object_property_set_int(obj, "size", ms->ram_size, errp)) { > goto out; > } > + if (!object_property_set_bool(obj, "share", > + ms->anon_alloc == ANON_ALLOC_OPTION_MEMFD, > + errp)) { > + goto out; > + } > object_property_add_child(object_get_objects_root(), mc->default_ram_id, > obj); > /* Ensure backend's memory region name is equal to mc->default_ram_id */ > diff --git a/include/hw/boards.h b/include/hw/boards.h > index 73ad319..77f16ad 100644 > --- a/include/hw/boards.h > +++ b/include/hw/boards.h > @@ -383,6 +383,7 @@ struct MachineState { > bool enable_graphics; > ConfidentialGuestSupport *cgs; > HostMemoryBackend *memdev; > + AnonAllocOption anon_alloc; > /* > * convenience alias to ram_memdev_id backend memory region > * or to numa container memory region > diff --git a/qapi/machine.json b/qapi/machine.json > index 2fd3e9c..9173953 100644 > --- a/qapi/machine.json > +++ b/qapi/machine.json > @@ -1881,3 +1881,17 @@ > { 'command': 'x-query-interrupt-controllers', > 'returns': 'HumanReadableText', > 'features': [ 'unstable' ]} > + > +## > +# @AnonAllocOption: > +# > +# An enumeration of the options for allocating anonymous guest memory. > +# > +# @mmap: allocate using mmap MAP_ANON > +# > +# @memfd: allocate using memfd_create > +# > +# Since: 9.1 > +## > +{ 'enum': 'AnonAllocOption', > + 'data': [ 'mmap', 'memfd' ] } > diff --git a/qemu-options.hx b/qemu-options.hx > index 8ca7f34..595b693 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -38,6 +38,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ > " nvdimm=on|off controls NVDIMM support (default=off)\n" > " memory-encryption=@var{} memory encryption object to > use (default=none)\n" > " hmat=on|off controls ACPI HMAT support (default=off)\n" > + " anon-alloc=mmap|memfd allocate anonymous guest RAM > using mmap MAP_ANON or memfd_create (default: mmap)\n" > " memory-backend='backend-id' specifies explicitly > provided backend for main RAM (default=none)\n" > " > cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n", > QEMU_ARCH_ALL) > @@ -101,6 +102,18 @@ SRST > Enables or disables ACPI Heterogeneous Memory Attribute Table > (HMAT) support. The default is off. > > + ``anon-alloc=mmap|memfd`` > + Allocate anonymous guest RAM using mmap MAP_ANON (the default) > + or memfd_create. This affects memory-backend-ram objects, > + RAM created with the global -m option but without an > + associated memory-backend object and without the -mem-path > + option, and various memory regions such as ROMs that are > + allocated when devices are created. This option does not > + affect memory-backend-file, memory-backend-memfd, or > + memory-backend-epc objects. > + > + Some migration modes require anon-alloc=memfd. > + > ``memory-backend='id'`` > An alternative to legacy ``-mem-path`` and ``mem-prealloc`` options. > Allows to use a memory backend as main RAM. > diff --git a/system/memory.c b/system/memory.c > index 2d69521..28a837d 100644 > --- a/system/memory.c > +++ b/system/memory.c > @@ -1552,8 +1552,10 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr, > uint64_t size, > Error **errp) > { > + uint32_t flags = (current_machine->anon_alloc == > ANON_ALLOC_OPTION_MEMFD) ? > + RAM_SHARED : 0; > return memory_region_init_ram_flags_nomigrate(mr, owner, name, > - size, 0, errp); > + size, flags, errp); > } > > bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr, > @@ -1713,8 +1715,10 @@ bool memory_region_init_rom_nomigrate(MemoryRegion *mr, > uint64_t size, > Error **errp) > { > + uint32_t flags = (current_machine->anon_alloc == > ANON_ALLOC_OPTION_MEMFD) ? > + RAM_SHARED : 0; > if (!memory_region_init_ram_flags_nomigrate(mr, owner, name, > - size, 0, errp)) { > + size, flags, errp)) { > return false; > } > mr->readonly = true; > @@ -1731,6 +1735,8 @@ bool > memory_region_init_rom_device_nomigrate(MemoryRegion *mr, > Error **errp) > { > Error *err = NULL; > + uint32_t flags = (current_machine->anon_alloc == > ANON_ALLOC_OPTION_MEMFD) ? > + RAM_SHARED : 0; > assert(ops); > memory_region_init(mr, owner, name, size); > mr->ops = ops; > @@ -1738,7 +1744,7 @@ bool > memory_region_init_rom_device_nomigrate(MemoryRegion *mr, > mr->terminates = true; > mr->rom_device = true; > mr->destructor = memory_region_destructor_ram; > - mr->ram_block = qemu_ram_alloc(size, 0, mr, &err); > + mr->ram_block = qemu_ram_alloc(size, flags, mr, &err); > if (err) { > mr->size = int128_zero(); > object_unparent(OBJECT(mr)); > diff --git a/system/physmem.c b/system/physmem.c > index 33d09f7..efe95ff 100644 > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -47,6 +47,7 @@ > #include "qemu/qemu-print.h" > #include "qemu/log.h" > #include "qemu/memalign.h" > +#include "qemu/memfd.h" > #include "exec/memory.h" > #include "exec/ioport.h" > #include "sysemu/dma.h" > @@ -54,6 +55,7 @@ > #include "sysemu/hw_accel.h" > #include "sysemu/xen-mapcache.h" > #include "trace/trace-root.h" > +#include "trace.h" > > #ifdef CONFIG_FALLOCATE_PUNCH_HOLE > #include <linux/falloc.h> > @@ -69,6 +71,8 @@ > > #include "qemu/pmem.h" > > +#include "qapi/qapi-types-migration.h" > +#include "migration/options.h" > #include "migration/vmstate.h" > > #include "qemu/range.h" > @@ -1828,6 +1832,32 @@ static void ram_block_add(RAMBlock *new_block, Error > **errp) > qemu_mutex_unlock_ramlist(); > return; > } > + > + } else if (new_block->flags & RAM_SHARED) { > + size_t max_length = new_block->max_length; > + MemoryRegion *mr = new_block->mr; > + const char *name = memory_region_name(mr); > + > + new_block->mr->align = QEMU_VMALLOC_ALIGN; > + > + if (new_block->fd == -1) { > + new_block->fd = qemu_memfd_create(name, max_length + > mr->align, > + 0, 0, 0, errp); > + } > + > + if (new_block->fd >= 0) { > + int mfd = new_block->fd; > + qemu_set_cloexec(mfd); > + new_block->host = file_ram_alloc(new_block, max_length, mfd, > + false, 0, errp); > + } > + if (!new_block->host) { > + qemu_mutex_unlock_ramlist(); > + return; > + } > + memory_try_enable_merging(new_block->host, > new_block->max_length); > + free_on_error = true; > + > } else { > new_block->host = qemu_anon_ram_alloc(new_block->max_length, > &new_block->mr->align, > @@ -1911,6 +1941,9 @@ static void ram_block_add(RAMBlock *new_block, Error > **errp) > ram_block_notify_add(new_block->host, new_block->used_length, > new_block->max_length); > } > + trace_ram_block_add(memory_region_name(new_block->mr), new_block->flags, > + new_block->fd, new_block->used_length, > + new_block->max_length); > return; > > out_free: > @@ -2097,8 +2130,11 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, > ram_addr_t maxsz, > void *host), > MemoryRegion *mr, Error **errp) > { > + uint32_t flags = (current_machine->anon_alloc == > ANON_ALLOC_OPTION_MEMFD) ? > + RAM_SHARED : 0; > + flags |= RAM_RESIZEABLE; > return qemu_ram_alloc_internal(size, maxsz, resized, NULL, > - RAM_RESIZEABLE, mr, errp); > + flags, mr, errp); > } > > static void reclaim_ramblock(RAMBlock *block) > diff --git a/system/trace-events b/system/trace-events > index 69c9044..f8ebf42 100644 > --- a/system/trace-events > +++ b/system/trace-events > @@ -38,3 +38,6 @@ dirtylimit_state_finalize(void) > dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) > "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us" > dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page > rate limit %"PRIu64 > dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep > %"PRIi64 " us" > + > +#physmem.c > +ram_block_add(const char *name, uint32_t flags, int fd, size_t used_length, > size_t max_length) "%s, flags %u, fd %d, len %lu, maxlen %lu"