[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH V8 20/39] cpr: restart mode
From: Steven Sistare
Subject: Re: [PATCH V8 20/39] cpr: restart mode
Date: Tue, 5 Jul 2022 14:29:02 -0400
User-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Thunderbird/91.11.0
On 7/3/2022 4:15 AM, Peng Liang wrote:
> On 6/15/2022 10:52 PM, Steve Sistare wrote:
>> Provide the cpr-save restart mode, which preserves the guest VM across a
>> restart of the qemu process. After cpr-save, the caller passes qemu
>> command-line arguments to cpr-exec, which directly exec's the new qemu
>> binary. The arguments must include -S so new qemu starts in a paused state.
>> The caller resumes the guest by calling cpr-load.
>>
>> To use the restart mode, guest RAM must be backed by a memory-backend-file
>> with share=on. The '-cpr-enable restart' option causes secondary guest
>> ram blocks (those not specified on the command line) to be allocated by
>> mmap'ing a memfd. The memfd values are saved in special cpr state which
>> is retrieved after exec, and are kept open across exec, after which they
>> are retrieved and re-mmap'd. Hence guest RAM is preserved in place, albeit
>> with new virtual addresses in the qemu process.
>>
>> The restart mode supports vfio devices and memory-backend-memfd in
>> subsequent patches.
>>
>> cpr-exec syntax:
>> { 'command': 'cpr-exec', 'data': { 'argv': [ 'str' ] } }
>>
>> Add the restart mode:
>> { 'enum': 'CprMode', 'data': [ 'reboot', 'restart' ] }
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>> migration/cpr.c | 35 +++++++++++++++++++++++++++++++++++
>> qapi/cpr.json | 26 +++++++++++++++++++++++++-
>> qemu-options.hx | 2 +-
>> softmmu/physmem.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>> trace-events | 1 +
>> 5 files changed, 107 insertions(+), 3 deletions(-)
>>
>> diff --git a/migration/cpr.c b/migration/cpr.c
>> index 1cc8738..8b3fffd 100644
>> --- a/migration/cpr.c
>> +++ b/migration/cpr.c
>> @@ -22,6 +22,7 @@ static int cpr_enabled_modes;
>> void cpr_init(int modes)
>> {
>> cpr_enabled_modes = modes;
>> + cpr_state_load(&error_fatal);
>> }
>>
>> bool cpr_enabled(CprMode mode)
>> @@ -153,6 +154,37 @@ err:
>> cpr_set_mode(CPR_MODE_NONE);
>> }
>>
>> +static int preserve_fd(const char *name, int id, int fd, void *opaque)
>> +{
>> + qemu_clear_cloexec(fd);
>> + return 0;
>> +}
>> +
>> +static int unpreserve_fd(const char *name, int id, int fd, void *opaque)
>> +{
>> + qemu_set_cloexec(fd);
>> + return 0;
>> +}
>> +
>> +void qmp_cpr_exec(strList *args, Error **errp)
>> +{
>> + if (!runstate_check(RUN_STATE_SAVE_VM)) {
>> + error_setg(errp, "runstate is not save-vm");
>> + return;
>> + }
>> + if (cpr_get_mode() != CPR_MODE_RESTART) {
>> + error_setg(errp, "cpr-exec requires cpr-save with restart mode");
>> + return;
>> + }
>> +
>> + cpr_walk_fd(preserve_fd, 0);
>> + if (cpr_state_save(errp)) {
>> + return;
>> + }
>> +
>> + assert(qemu_system_exec_request(args, errp) == 0);
>> +}
>> +
>> void qmp_cpr_load(const char *filename, CprMode mode, Error **errp)
>> {
>> QEMUFile *f;
>> @@ -189,6 +221,9 @@ void qmp_cpr_load(const char *filename, CprMode mode,
>> Error **errp)
>> goto out;
>> }
>>
>> + /* Set cloexec to prevent fd leaks until the next cpr-save */
>> + cpr_walk_fd(unpreserve_fd, 0);
>> +
>> state = global_state_get_runstate();
>> if (state == RUN_STATE_RUNNING) {
>> vm_start();
>> diff --git a/qapi/cpr.json b/qapi/cpr.json
>> index 11c6f88..47ee4ff 100644
>> --- a/qapi/cpr.json
>> +++ b/qapi/cpr.json
>> @@ -15,11 +15,12 @@
>> # @CprMode:
>> #
>> # @reboot: checkpoint can be cpr-load'ed after a host reboot.
>> +# @restart: checkpoint can be cpr-load'ed after restarting qemu.
>> #
>> # Since: 7.1
>> ##
>> { 'enum': 'CprMode',
>> - 'data': [ 'none', 'reboot' ] }
>> + 'data': [ 'none', 'reboot', 'restart' ] }
>>
>> ##
>> # @cpr-save:
>> @@ -38,6 +39,11 @@
>> # issue the quit command, reboot the system, start qemu using the same
>> # arguments plus -S, and issue the cpr-load command.
>> #
>> +# If @mode is 'restart', the checkpoint remains valid after restarting
>> +# qemu using a subsequent cpr-exec. Guest RAM must be backed by a
>> +# memory-backend-file with share=on.
>> +# To resume from the checkpoint, issue the cpr-load command.
>> +#
>> # @filename: name of checkpoint file
>> # @mode: @CprMode mode
>> #
>> @@ -48,6 +54,24 @@
>> 'mode': 'CprMode' } }
>>
>> ##
>> +# @cpr-exec:
>> +#
>> +# Restart qemu by directly exec'ing @argv[0], replacing the qemu process.
>> +# The PID remains the same. Must be called after cpr-save restart.
>> +#
>> +# @argv[0] should be the path of a new qemu binary, or a prefix command that
>> +# in turn exec's the new qemu binary. The arguments must match those used
>> +# to initially start qemu, plus the -S option so new qemu starts in a paused
>> +# state.
>> +#
>> +# @argv: arguments to be passed to exec().
>> +#
>> +# Since: 7.1
>> +##
>> +{ 'command': 'cpr-exec',
>> + 'data': { 'argv': [ 'str' ] } }
>> +
>> +##
>> # @cpr-load:
>> #
>> # Load a virtual machine from the checkpoint file @filename that was created
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 6e51c33..1b49360 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -4484,7 +4484,7 @@ SRST
>> ERST
>>
>> DEF("cpr-enable", HAS_ARG, QEMU_OPTION_cpr_enable, \
>> - "-cpr-enable reboot enable the cpr mode\n",
>> + "-cpr-enable reboot|restart enable the cpr mode\n",
>> QEMU_ARCH_ALL)
>> SRST
>> ``-cpr-enable reboot``
>> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
>> index 822c424..412cc80 100644
>> --- a/softmmu/physmem.c
>> +++ b/softmmu/physmem.c
>> @@ -44,6 +44,7 @@
>> #include "qemu/qemu-print.h"
>> #include "qemu/log.h"
>> #include "qemu/memalign.h"
>> +#include "qemu/memfd.h"
>> #include "exec/memory.h"
>> #include "exec/ioport.h"
>> #include "sysemu/dma.h"
>> @@ -1962,6 +1963,40 @@ static void dirty_memory_extend(ram_addr_t
>> old_ram_size,
>> }
>> }
>>
>> +static bool memory_region_is_backend(MemoryRegion *mr)
>> +{
>> + return !!object_dynamic_cast(mr->parent_obj.parent,
>> TYPE_MEMORY_BACKEND);
>> +}
>
> Maybe OBJECT(mr)->parent or mr->owner is more readable?
Maybe OBJECT(mr)->parent.
mr->owner is not always the same as mr->parent_obj.parent.
- Steve
>> +
>> +static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error
>> **errp)
>> +{
>> + size_t len, align;
>> + void *addr;
>> + struct MemoryRegion *mr = rb->mr;
>> + const char *name = memory_region_name(mr);
>> + int mfd = cpr_find_memfd(name, &len, &maxlen, &align);
>> +
>> + if (mfd >= 0) {
>> + rb->used_length = len;
>> + rb->max_length = maxlen;
>> + mr->align = align;
>> + } else {
>> + len = rb->used_length;
>> + maxlen = rb->max_length;
>> + mr->align = QEMU_VMALLOC_ALIGN;
>> + mfd = qemu_memfd_create(name, maxlen + mr->align, 0, 0, 0, errp);
>> + if (mfd < 0) {
>> + return NULL;
>> + }
>> + cpr_save_memfd(name, mfd, len, maxlen, mr->align);
>> + }
>> + rb->flags |= RAM_SHARED;
>> + qemu_set_cloexec(mfd);
>> + addr = file_ram_alloc(rb, maxlen, mfd, false, false, 0, errp);
>> + trace_anon_memfd_alloc(name, maxlen, addr, mfd);
>> + return addr;
>> +}
>> +
>> static void ram_block_add(RAMBlock *new_block, Error **errp)
>> {
>> const bool noreserve = qemu_ram_is_noreserve(new_block);
>> @@ -1986,6 +2021,14 @@ static void ram_block_add(RAMBlock *new_block, Error
>> **errp)
>> qemu_mutex_unlock_ramlist();
>> return;
>> }
>> + } else if (cpr_enabled(CPR_MODE_RESTART) &&
>> + !memory_region_is_backend(new_block->mr)) {
>> + new_block->host = qemu_anon_memfd_alloc(new_block,
>> + new_block->max_length,
>> + errp);
>> + if (!new_block->host) {
>> + return;
>> + }
>> } else {
>> new_block->host = qemu_anon_ram_alloc(new_block->max_length,
>> &new_block->mr->align,
>> @@ -1997,8 +2040,8 @@ static void ram_block_add(RAMBlock *new_block, Error
>> **errp)
>> qemu_mutex_unlock_ramlist();
>> return;
>> }
>> - memory_try_enable_merging(new_block->host,
>> new_block->max_length);
>> }
>> + memory_try_enable_merging(new_block->host, new_block->max_length);
>> }
>>
>> new_ram_size = MAX(old_ram_size,
>> @@ -2231,6 +2274,7 @@ void qemu_ram_free(RAMBlock *block)
>> }
>>
>> qemu_mutex_lock_ramlist();
>> + cpr_delete_memfd(memory_region_name(block->mr));
>> QLIST_REMOVE_RCU(block, next);
>> ram_list.mru_block = NULL;
>> /* Write list before version */
>> diff --git a/trace-events b/trace-events
>> index bc71006..07369bb 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -45,6 +45,7 @@ ram_block_discard_range(const char *rbname, void *hva,
>> size_t length, bool need_
>> # accel/tcg/cputlb.c
>> memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned
>> size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
>> memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
>> +anon_memfd_alloc(const char *name, size_t size, void *ptr, int fd) "%s size
>> %zu ptr %p fd %d"
>>
>> # gdbstub.c
>> gdbstub_op_start(const char *device) "Starting gdbstub using device %s"