On 5/28/2024 5:12 PM, Peter Xu wrote:
On Mon, Apr 29, 2024 at 08:55:26AM -0700, Steve Sistare wrote:
Allocate anonymous memory using memfd_create if the memfd-alloc machine
option is set.

Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
---
  hw/core/machine.c   | 22 ++++++++++++++++++++++
  include/hw/boards.h |  1 +
  qemu-options.hx     |  6 ++++++
  system/memory.c     |  9 ++++++---
  system/physmem.c    | 18 +++++++++++++++++-
  system/trace-events |  1 +
  6 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 582c2df..9567b97 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -443,6 +443,20 @@ static void machine_set_mem_merge(Object *obj, bool value, 
Error **errp)
      ms->mem_merge = value;
  }
+static bool machine_get_memfd_alloc(Object *obj, Error **errp)
+{
+    MachineState *ms = MACHINE(obj);
+
+    return ms->memfd_alloc;
+}
+
+static void machine_set_memfd_alloc(Object *obj, bool value, Error **errp)
+{
+    MachineState *ms = MACHINE(obj);
+
+    ms->memfd_alloc = value;
+}
+
  static bool machine_get_usb(Object *obj, Error **errp)
  {
      MachineState *ms = MACHINE(obj);
@@ -1044,6 +1058,11 @@ static void machine_class_init(ObjectClass *oc, void 
*data)
      object_class_property_set_description(oc, "mem-merge",
          "Enable/disable memory merge support");
+ object_class_property_add_bool(oc, "memfd-alloc",
+        machine_get_memfd_alloc, machine_set_memfd_alloc);
+    object_class_property_set_description(oc, "memfd-alloc",
+        "Enable/disable allocating anonymous memory using memfd_create");
+
      object_class_property_add_bool(oc, "usb",
          machine_get_usb, machine_set_usb);
      object_class_property_set_description(oc, "usb",
@@ -1387,6 +1406,9 @@ static bool create_default_memdev(MachineState *ms, const 
char *path, Error **er
      if (!object_property_set_int(obj, "size", ms->ram_size, errp)) {
          goto out;
      }
+    if (!object_property_set_bool(obj, "share", ms->memfd_alloc, errp)) {
+        goto out;
+    }
      object_property_add_child(object_get_objects_root(), mc->default_ram_id,
                                obj);
      /* Ensure backend's memory region name is equal to mc->default_ram_id */
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 69c1ba4..96259c3 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -372,6 +372,7 @@ struct MachineState {
      bool dump_guest_core;
      bool mem_merge;
      bool require_guest_memfd;
+    bool memfd_alloc;
      bool usb;
      bool usb_disabled;
      char *firmware;
diff --git a/qemu-options.hx b/qemu-options.hx
index cf61f6b..f0dfda5 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -32,6 +32,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
      "                vmport=on|off|auto controls emulation of vmport (default: 
auto)\n"
      "                dump-guest-core=on|off include guest memory in a core dump 
(default=on)\n"
      "                mem-merge=on|off controls memory merge support (default: 
on)\n"
+    "                memfd-alloc=on|off controls allocating anonymous guest RAM 
using memfd_create (default: off)\n"
      "                aes-key-wrap=on|off controls support for AES key wrapping 
(default=on)\n"
      "                dea-key-wrap=on|off controls support for DEA key wrapping 
(default=on)\n"
      "                suppress-vmdesc=on|off disables self-describing migration 
(default=off)\n"
@@ -79,6 +80,11 @@ SRST
          supported by the host, de-duplicates identical memory pages
          among VMs instances (enabled by default).
+ ``memfd-alloc=on|off``
+        Enables or disables allocation of anonymous guest RAM using
+        memfd_create.  Any associated memory-backend objects are created with
+        share=on.  The memfd-alloc default is off.
+
      ``aes-key-wrap=on|off``
          Enables or disables AES key wrapping support on s390-ccw hosts.
          This feature controls whether AES wrapping keys will be created
diff --git a/system/memory.c b/system/memory.c
index 49f1cb2..ca04a0e 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -1552,8 +1552,9 @@ bool memory_region_init_ram_nomigrate(MemoryRegion *mr,
                                        uint64_t size,
                                        Error **errp)
  {
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;

If there's a machine option to "use memfd for allocations", then it's
shared mem... Hmm..

It is a bit confusing to me in quite a few levels:

   - Why memory allocation method will be defined by a machine property,
     even if we have memory-backend-* which should cover everything?

Some memory regions are implicitly created, and have no explicit representation
on the qemu command line.  memfd-alloc affects those.

More generally, memfd-alloc affects all ramblock allocations that are
not explicitly represented by memory-backend object.  Thus the simple
command line "qemu -m 1G" does not explicitly describe an object, so it
goes through the anonymous allocation path, and is affected by memfd-alloc.

Internally, create_default_memdev does create a memory-backend object.
That is what my doc comment above refers to:
  Any associated memory-backend objects are created with share=on

An explicit "qemu -object memory-backend-*" is not affected by memfd-alloc.

The qapi comments in patch "migration: cpr-exec mode" attempt to say all that:

+#     Memory backend objects must have the share=on attribute, and
+#     must be mmap'able in the new QEMU process.  For example,
+#     memory-backend-file is acceptable, but memory-backend-ram is
+#     not.
+#
+#     The VM must be started with the '-machine memfd-alloc=on'
+#     option.  This causes implicit ram blocks -- those not explicitly
+#     described by a memory-backend object -- to be allocated by
+#     mmap'ing a memfd.  Examples include VGA, ROM, and even guest
+#     RAM when it is specified without a memory-backend object.

   - Even if we have such a machine property, why setting "memfd" will
     always imply shared?  why not private?  After all it's not called
     "memfd-shared-alloc", and we can create private mappings using
     e.g. memory-backend-memfd,share=off.

There is no use case for memfd-alloc with share=off, so no point IMO in
making the option more verbose.  For cpr, the mapping with all its modifications
must be visible to new qemu when qemu mmaps it.

      return memory_region_init_ram_flags_nomigrate(mr, owner, name,
-                                                  size, 0, errp);
+                                                  size, flags, errp);
  }
bool memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
@@ -1713,8 +1714,9 @@ bool memory_region_init_rom_nomigrate(MemoryRegion *mr,
                                        uint64_t size,
                                        Error **errp)
  {
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
      if (!memory_region_init_ram_flags_nomigrate(mr, owner, name,
-                                                size, 0, errp)) {
+                                                size, flags, errp)) {
           return false;
      }
      mr->readonly = true;
@@ -1731,6 +1733,7 @@ bool memory_region_init_rom_device_nomigrate(MemoryRegion 
*mr,
                                               Error **errp)
  {
      Error *err = NULL;
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
      assert(ops);
      memory_region_init(mr, owner, name, size);
      mr->ops = ops;
@@ -1738,7 +1741,7 @@ bool memory_region_init_rom_device_nomigrate(MemoryRegion 
*mr,
      mr->terminates = true;
      mr->rom_device = true;
      mr->destructor = memory_region_destructor_ram;
-    mr->ram_block = qemu_ram_alloc(size, 0, mr, &err);
+    mr->ram_block = qemu_ram_alloc(size, flags, mr, &err);
      if (err) {
          mr->size = int128_zero();
          object_unparent(OBJECT(mr));
diff --git a/system/physmem.c b/system/physmem.c
index c736af5..36d97ec 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -45,6 +45,7 @@
  #include "qemu/qemu-print.h"
  #include "qemu/log.h"
  #include "qemu/memalign.h"
+#include "qemu/memfd.h"
  #include "exec/memory.h"
  #include "exec/ioport.h"
  #include "sysemu/dma.h"
@@ -1825,6 +1826,19 @@ static void *ram_block_alloc_host(RAMBlock *rb, Error 
**errp)
      if (xen_enabled()) {
          xen_ram_alloc(rb->offset, rb->max_length, mr, errp);
+ } else if (rb->flags & RAM_SHARED) {
+        if (rb->fd == -1) {
+            mr->align = QEMU_VMALLOC_ALIGN;
+            rb->fd = qemu_memfd_create(rb->idstr, rb->max_length + mr->align,
+                                       0, 0, 0, errp);
+        }

We used to have case where RAM_SHARED && rb->fd==-1, I think.

file_ram_alloc is the only other function that assigns to rb->fd, and
hence the only function that cares about fd > 0.  It does not
interfere with memfd_alloc.  All calls to create ram blocks funnel
through these two functions:

qemu_ram_alloc_from_fd()
  ram_block_create()
  file_ram_alloc()

qemu_ram_alloc_internal()
  ram_block_create()
  ram_block_alloc_host()
    if (rb->flags & RAM_SHARED) {
        if (rb->fd == -1) {
            rb->fd = qemu_memfd_create()
        }
        if (rb->fd >= 0) {
            file_ram_alloc(rb->fd)

We have some code that checks explicitly on rb->fd against -1 to know
whether it's a fd based.  I'm not sure whether there'll be implications to
affect those codes.

That's OK, the memfd allocation completely acts like an fd based ramblock.
  rb->fd = qemu_memfd_create()

Maybe it's mostly fine, OTOH I worry more on the whole idea.  I'm not sure
whether this is relevant to "we want to be able to share the mem with the
new process", in this case can we simply require the user to use file based
memory backends, rather than such change?

That does not work for implicitly created memory regions.

- Steve

+        if (rb->fd >= 0) {
+            int mfd = rb->fd;
+            qemu_set_cloexec(mfd);
+            host = file_ram_alloc(rb, rb->max_length, mfd, false, 0, errp);
+            trace_qemu_anon_memfd_alloc(rb->idstr, rb->max_length, mfd, host);
+        }
+
      } else {
          host = qemu_anon_ram_alloc(rb->max_length, &mr->align,
                                     qemu_ram_is_shared(rb),
@@ -2106,8 +2120,10 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, 
ram_addr_t maxsz,
                                                       void *host),
                                       MemoryRegion *mr, Error **errp)
  {
+    uint32_t flags = current_machine->memfd_alloc ? RAM_SHARED : 0;
+    flags |= RAM_RESIZEABLE;
      return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
-                                   RAM_RESIZEABLE, mr, errp);
+                                   flags, mr, errp);
  }
static void reclaim_ramblock(RAMBlock *block)
diff --git a/system/trace-events b/system/trace-events
index f0a80ba..0092734 100644
--- a/system/trace-events
+++ b/system/trace-events
@@ -41,3 +41,4 @@ dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) 
"CPU[%d] sleep %"P
# physmem.c
  ram_block_create(const char *name, uint32_t flags, int fd, size_t used_length, size_t 
max_length, size_t align) "%s, flags %u, fd %d, len %lu, maxlen %lu, align %lu"
+qemu_anon_memfd_alloc(const char *name, size_t size, int fd, void *ptr) "%s size %zu 
fd %d -> %p"
--
1.8.3.1



Reply via email to