On 7/3/2022 4:15 AM, Peng Liang wrote: > On 6/15/2022 10:52 PM, Steve Sistare wrote: >> Provide the cpr-save restart mode, which preserves the guest VM across a >> restart of the qemu process. After cpr-save, the caller passes qemu >> command-line arguments to cpr-exec, which directly exec's the new qemu >> binary. The arguments must include -S so new qemu starts in a paused state. >> The caller resumes the guest by calling cpr-load. >> >> To use the restart mode, guest RAM must be backed by a memory-backend-file >> with share=on. The '-cpr-enable restart' option causes secondary guest >> ram blocks (those not specified on the command line) to be allocated by >> mmap'ing a memfd. The memfd values are saved in special cpr state which >> is retrieved after exec, and are kept open across exec, after which they >> are retrieved and re-mmap'd. Hence guest RAM is preserved in place, albeit >> with new virtual addresses in the qemu process. >> >> The restart mode supports vfio devices and memory-backend-memfd in >> subsequent patches. 
>> >> cpr-exec syntax: >> { 'command': 'cpr-exec', 'data': { 'argv': [ 'str' ] } } >> >> Add the restart mode: >> { 'enum': 'CprMode', 'data': [ 'reboot', 'restart' ] } >> >> Signed-off-by: Steve Sistare <steven.sist...@oracle.com> >> --- >> migration/cpr.c | 35 +++++++++++++++++++++++++++++++++++ >> qapi/cpr.json | 26 +++++++++++++++++++++++++- >> qemu-options.hx | 2 +- >> softmmu/physmem.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- >> trace-events | 1 + >> 5 files changed, 107 insertions(+), 3 deletions(-) >> >> diff --git a/migration/cpr.c b/migration/cpr.c >> index 1cc8738..8b3fffd 100644 >> --- a/migration/cpr.c >> +++ b/migration/cpr.c >> @@ -22,6 +22,7 @@ static int cpr_enabled_modes; >> void cpr_init(int modes) >> { >> cpr_enabled_modes = modes; >> + cpr_state_load(&error_fatal); >> } >> >> bool cpr_enabled(CprMode mode) >> @@ -153,6 +154,37 @@ err: >> cpr_set_mode(CPR_MODE_NONE); >> } >> >> +static int preserve_fd(const char *name, int id, int fd, void *opaque) >> +{ >> + qemu_clear_cloexec(fd); >> + return 0; >> +} >> + >> +static int unpreserve_fd(const char *name, int id, int fd, void *opaque) >> +{ >> + qemu_set_cloexec(fd); >> + return 0; >> +} >> + >> +void qmp_cpr_exec(strList *args, Error **errp) >> +{ >> + if (!runstate_check(RUN_STATE_SAVE_VM)) { >> + error_setg(errp, "runstate is not save-vm"); >> + return; >> + } >> + if (cpr_get_mode() != CPR_MODE_RESTART) { >> + error_setg(errp, "cpr-exec requires cpr-save with restart mode"); >> + return; >> + } >> + >> + cpr_walk_fd(preserve_fd, 0); >> + if (cpr_state_save(errp)) { >> + return; >> + } >> + >> + assert(qemu_system_exec_request(args, errp) == 0); >> +} >> + >> void qmp_cpr_load(const char *filename, CprMode mode, Error **errp) >> { >> QEMUFile *f; >> @@ -189,6 +221,9 @@ void qmp_cpr_load(const char *filename, CprMode mode, >> Error **errp) >> goto out; >> } >> >> + /* Clear cloexec to prevent fd leaks until the next cpr-save */ >> + cpr_walk_fd(unpreserve_fd, 0); >> + >> state = 
global_state_get_runstate(); >> if (state == RUN_STATE_RUNNING) { >> vm_start(); >> diff --git a/qapi/cpr.json b/qapi/cpr.json >> index 11c6f88..47ee4ff 100644 >> --- a/qapi/cpr.json >> +++ b/qapi/cpr.json >> @@ -15,11 +15,12 @@ >> # @CprMode: >> # >> # @reboot: checkpoint can be cpr-load'ed after a host reboot. >> +# @restart: checkpoint can be cpr-load'ed after restarting qemu. >> # >> # Since: 7.1 >> ## >> { 'enum': 'CprMode', >> - 'data': [ 'none', 'reboot' ] } >> + 'data': [ 'none', 'reboot', 'restart' ] } >> >> ## >> # @cpr-save: >> @@ -38,6 +39,11 @@ >> # issue the quit command, reboot the system, start qemu using the same >> # arguments plus -S, and issue the cpr-load command. >> # >> +# If @mode is 'restart', the checkpoint remains valid after restarting >> +# qemu using a subsequent cpr-exec. Guest RAM must be backed by a >> +# memory-backend-file with share=on. >> +# To resume from the checkpoint, issue the cpr-load command. >> +# >> # @filename: name of checkpoint file >> # @mode: @CprMode mode >> # >> @@ -48,6 +54,24 @@ >> 'mode': 'CprMode' } } >> >> ## >> +# @cpr-exec: >> +# >> +# Restart qemu by directly exec'ing @argv[0], replacing the qemu process. >> +# The PID remains the same. Must be called after cpr-save restart. >> +# >> +# @argv[0] should be the path of a new qemu binary, or a prefix command that >> +# in turn exec's the new qemu binary. The arguments must match those used >> +# to initially start qemu, plus the -S option so new qemu starts in a paused >> +# state. >> +# >> +# @argv: arguments to be passed to exec(). 
>> +# >> +# Since: 7.1 >> +## >> +{ 'command': 'cpr-exec', >> + 'data': { 'argv': [ 'str' ] } } >> + >> +## >> # @cpr-load: >> # >> # Load a virtual machine from the checkpoint file @filename that was created >> diff --git a/qemu-options.hx b/qemu-options.hx >> index 6e51c33..1b49360 100644 >> --- a/qemu-options.hx >> +++ b/qemu-options.hx >> @@ -4484,7 +4484,7 @@ SRST >> ERST >> >> DEF("cpr-enable", HAS_ARG, QEMU_OPTION_cpr_enable, \ >> - "-cpr-enable reboot enable the cpr mode\n", >> + "-cpr-enable reboot|restart enable the cpr mode\n", >> QEMU_ARCH_ALL) >> SRST >> ``-cpr-enable reboot`` >> diff --git a/softmmu/physmem.c b/softmmu/physmem.c >> index 822c424..412cc80 100644 >> --- a/softmmu/physmem.c >> +++ b/softmmu/physmem.c >> @@ -44,6 +44,7 @@ >> #include "qemu/qemu-print.h" >> #include "qemu/log.h" >> #include "qemu/memalign.h" >> +#include "qemu/memfd.h" >> #include "exec/memory.h" >> #include "exec/ioport.h" >> #include "sysemu/dma.h" >> @@ -1962,6 +1963,40 @@ static void dirty_memory_extend(ram_addr_t >> old_ram_size, >> } >> } >> >> +static bool memory_region_is_backend(MemoryRegion *mr) >> +{ >> + return !!object_dynamic_cast(mr->parent_obj.parent, >> TYPE_MEMORY_BACKEND); >> +} > > Maybe OBJECT(mr)->parent or mr->owner is more readable?
Maybe OBJECT(mr)->parent. mr->owner is not always the same as mr->parent_obj.parent. - Steve >> + >> +static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error >> **errp) >> +{ >> + size_t len, align; >> + void *addr; >> + struct MemoryRegion *mr = rb->mr; >> + const char *name = memory_region_name(mr); >> + int mfd = cpr_find_memfd(name, &len, &maxlen, &align); >> + >> + if (mfd >= 0) { >> + rb->used_length = len; >> + rb->max_length = maxlen; >> + mr->align = align; >> + } else { >> + len = rb->used_length; >> + maxlen = rb->max_length; >> + mr->align = QEMU_VMALLOC_ALIGN; >> + mfd = qemu_memfd_create(name, maxlen + mr->align, 0, 0, 0, errp); >> + if (mfd < 0) { >> + return NULL; >> + } >> + cpr_save_memfd(name, mfd, len, maxlen, mr->align); >> + } >> + rb->flags |= RAM_SHARED; >> + qemu_set_cloexec(mfd); >> + addr = file_ram_alloc(rb, maxlen, mfd, false, false, 0, errp); >> + trace_anon_memfd_alloc(name, maxlen, addr, mfd); >> + return addr; >> +} >> + >> static void ram_block_add(RAMBlock *new_block, Error **errp) >> { >> const bool noreserve = qemu_ram_is_noreserve(new_block); >> @@ -1986,6 +2021,14 @@ static void ram_block_add(RAMBlock *new_block, Error >> **errp) >> qemu_mutex_unlock_ramlist(); >> return; >> } >> + } else if (cpr_enabled(CPR_MODE_RESTART) && >> + !memory_region_is_backend(new_block->mr)) { >> + new_block->host = qemu_anon_memfd_alloc(new_block, >> + new_block->max_length, >> + errp); >> + if (!new_block->host) { >> + return; >> + } >> } else { >> new_block->host = qemu_anon_ram_alloc(new_block->max_length, >> &new_block->mr->align, >> @@ -1997,8 +2040,8 @@ static void ram_block_add(RAMBlock *new_block, Error >> **errp) >> qemu_mutex_unlock_ramlist(); >> return; >> } >> - memory_try_enable_merging(new_block->host, >> new_block->max_length); >> } >> + memory_try_enable_merging(new_block->host, new_block->max_length); >> } >> >> new_ram_size = MAX(old_ram_size, >> @@ -2231,6 +2274,7 @@ void qemu_ram_free(RAMBlock *block) >> } >> >> 
qemu_mutex_lock_ramlist(); >> + cpr_delete_memfd(memory_region_name(block->mr)); >> QLIST_REMOVE_RCU(block, next); >> ram_list.mru_block = NULL; >> /* Write list before version */ >> diff --git a/trace-events b/trace-events >> index bc71006..07369bb 100644 >> --- a/trace-events >> +++ b/trace-events >> @@ -45,6 +45,7 @@ ram_block_discard_range(const char *rbname, void *hva, >> size_t length, bool need_ >> # accel/tcg/cputlb.c >> memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned >> size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u" >> memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64 >> +anon_memfd_alloc(const char *name, size_t size, void *ptr, int fd) "%s size >> %zu ptr %p fd %d" >> >> # gdbstub.c >> gdbstub_op_start(const char *device) "Starting gdbstub using device %s"