This commit adds functions used to open the checkpoint saved by the
dump-guest-memory command and to populate the hash table used by the
checkpoint-assisted migration mechanism. SHA256 is used to checksum the
pages. Only the ELF memory dump format is supported at the moment.
Signed-off-by: Bohdan Trach <bohdan.tr...@mailbox.tu-dresden.de>
---
 include/migration/migration.h |   4 +
 migration/ram.c               | 262 +++++++++++++++++++++++++++++++++++++++++
 qemu-options.hx               |   9 +
 trace-events                  |   3 +
 vl.c                          |   9 +
 5 files changed, 287 insertions(+)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index fd018b7..4904c85 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -321,4 +321,8 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
 PostcopyState postcopy_state_set(PostcopyState new_state);
+
+void allocate_checksum_table(void);
+void init_checksum_lookup_table(const char *checkpoint_path);
+extern const char *checkpoint_path;
 #endif
diff --git a/migration/ram.c b/migration/ram.c
index 1eb155a..379a381 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -27,6 +27,7 @@
  */
 #include <stdint.h>
 #include <zlib.h>
+#include <elf.h>
 #include "qemu/bitops.h"
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
@@ -39,6 +40,7 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "crypto/hash.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -48,6 +50,262 @@
     do { } while (0)
 #endif
 
+#define SHA256_DIGEST_LENGTH 32
+static int fd_checkpoint = -1;
+/* indexed by page number */
+static uint64_t hashes_size;
+static uint64_t hashes_entries;
+static uint8_t *hashes;
+
+typedef struct {
+    uint8_t hash[SHA256_DIGEST_LENGTH];
+    uint64_t offset;
+} hash_offset_entry;
+
+static uint64_t hash_offset_entries;
+static uint64_t max_hash_offset_entries;
+static hash_offset_entry *hash_offset_array;
+static uint8_t all_zeroes_hash[SHA256_DIGEST_LENGTH];
+
+/*
+ * Compute the SHA-256 digest of @data_len bytes at @data into @digest,
+ * which must have room for SHA256_DIGEST_LENGTH bytes.  A failing hash
+ * backend cannot be handled by the callers, so it is treated as fatal.
+ */
+static inline void SHA256(void *data, size_t data_len, void *digest)
+{
+    uint8_t *out = NULL;
+    size_t rlen = 0;
+
+    if (qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA256, data, data_len,
+                           &out, &rlen, NULL) < 0 ||
+        rlen != SHA256_DIGEST_LENGTH) {
+        error_report("SHA256 digest computation failed");
+        abort();
+    }
+    memcpy(digest, out, rlen);
+    g_free(out);
+}
+
+/*
+ * Format @digest as lower-case hex for tracing.  The string lives in a
+ * static buffer that the next call overwrites.
+ */
+static char *sha256s(const uint8_t *digest)
+{
+    /* two hex digits per digest byte, plus the trailing \0 */
+    static char hex_digits[2 * SHA256_DIGEST_LENGTH + 1];
+    int i;
+
+    for (i = 0; i < SHA256_DIGEST_LENGTH; i++) {
+        snprintf(hex_digits + 2 * i, 3, "%02x", digest[i]);
+    }
+    return hex_digits;
+}
+
+/* qsort() comparator for raw SHA256_DIGEST_LENGTH-byte digests */
+static int uint256_compare(const void *x, const void *y)
+{
+    return memcmp(x, y, SHA256_DIGEST_LENGTH);
+}
+
+/* qsort() comparator ordering hash_offset_entry elements by digest */
+static int cmp_hash_offset_entry(const void *a, const void *b)
+{
+    const hash_offset_entry *e = a;
+    const hash_offset_entry *f = b;
+
+    return memcmp(e->hash, f->hash, SHA256_DIGEST_LENGTH);
+}
+
+/*
+ * Size the per-page digest table and the digest-to-offset table from the
+ * total used length of all RAM blocks, and precompute the digest of an
+ * all-zero page.  A table whose allocation fails keeps a zero entry
+ * count so that later code never indexes a NULL array.
+ */
+void allocate_checksum_table(void)
+{
+    RAMBlock *block;
+    uint8_t *zero_page;
+    uint64_t npages;
+    size_t sz = 0;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        sz += block->used_length;
+    }
+
+    npages = sz / TARGET_PAGE_SIZE;
+    trace_allocate_checksum_table(npages);
+    if (npages == 0) {
+        /* g_try_malloc0(0) returns NULL, which is not an error here */
+        return;
+    }
+
+    zero_page = g_malloc0(TARGET_PAGE_SIZE);
+    SHA256(zero_page, TARGET_PAGE_SIZE, all_zeroes_hash);
+    g_free(zero_page);
+
+    hashes = g_try_malloc0(npages * SHA256_DIGEST_LENGTH);
+    if (!hashes) {
+        error_report("Error allocating hashes");
+        return;
+    }
+    hashes_entries = npages;
+    hashes_size = npages * SHA256_DIGEST_LENGTH;
+
+    hash_offset_array = g_try_new0(hash_offset_entry, npages);
+    if (!hash_offset_array) {
+        error_report("Error allocating hash_offset_array");
+        return;
+    }
+    max_hash_offset_entries = npages;
+}
+
+/*
+ * Read exactly @len bytes from @fd into @buf, retrying after EINTR and
+ * short reads.  Returns 0 on success, -1 on I/O error or premature EOF.
+ */
+static int checkpoint_read_exact(int fd, void *buf, size_t len)
+{
+    uint8_t *p = buf;
+
+    while (len > 0) {
+        ssize_t n = read(fd, p, len);
+
+        if (n < 0 && errno == EINTR) {
+            continue;
+        }
+        if (n <= 0) {
+            return -1;
+        }
+        p += n;
+        len -= n;
+    }
+    return 0;
+}
+
+/*
+ * phdr.p_offset + phdr.p_memsz is the beginning of the dumped memory.
+ * The first program header is read from directly behind the ELF header
+ * (e_phoff is not consulted).  Returns the new offset, or -1 on error.
+ */
+static off_t seek_elf64(int f)
+{
+    Elf64_Ehdr elf;
+    Elf64_Phdr phdr;
+
+    if (checkpoint_read_exact(f, &elf, sizeof(elf)) < 0 ||
+        checkpoint_read_exact(f, &phdr, sizeof(phdr)) < 0) {
+        return -1;
+    }
+    return lseek(f, phdr.p_offset + phdr.p_memsz, SEEK_SET);
+}
+
+/* 32-bit counterpart of seek_elf64() */
+static off_t seek_elf32(int f)
+{
+    Elf32_Ehdr elf;
+    Elf32_Phdr phdr;
+
+    if (checkpoint_read_exact(f, &elf, sizeof(elf)) < 0 ||
+        checkpoint_read_exact(f, &phdr, sizeof(phdr)) < 0) {
+        return -1;
+    }
+    return lseek(f, phdr.p_offset + phdr.p_memsz, SEEK_SET);
+}
+
+/*
+ * Position @checkpoint_fd at the start of the dumped guest memory.
+ * Returns that offset, or -1 if the file is not a readable ELF dump.
+ */
+static off_t seek_to_memory(int checkpoint_fd)
+{
+    unsigned char ident[EI_NIDENT];
+
+    if (checkpoint_read_exact(checkpoint_fd, ident, sizeof(ident)) < 0 ||
+        memcmp(ident, ELFMAG, SELFMAG) != 0) {
+        return -1;
+    }
+    /* seek_elf* expect zero offset */
+    if (lseek(checkpoint_fd, 0, SEEK_SET) < 0) {
+        return -1;
+    }
+
+    switch (ident[EI_CLASS]) {
+    case ELFCLASS64:
+        return seek_elf64(checkpoint_fd);
+    case ELFCLASS32:
+        return seek_elf32(checkpoint_fd);
+    default:
+        return -1;
+    }
+}
+
+/*
+ * Hash every page of the memory dump at @checkpoint_path and build the
+ * digest-to-file-offset table, sorted by digest.  A missing file leaves
+ * the table empty; any other problem with the file is fatal.  The file
+ * descriptor is kept open in fd_checkpoint.
+ */
+void init_checksum_lookup_table(const char *checkpoint_path)
+{
+    struct stat sb;
+    uint8_t *pg;
+    off_t off;
+
+    trace_init_checksum_lookup_table_start(ram_size);
+
+    if (stat(checkpoint_path, &sb) < 0) {
+        if (errno == ENOENT) {
+            return;
+        }
+        error_report("checkpoint %s: %s", checkpoint_path, strerror(errno));
+        exit(1);
+    }
+
+    fd_checkpoint = qemu_open(checkpoint_path, O_RDONLY);
+    if (fd_checkpoint < 0) {
+        error_report("checkpoint %s: %s", checkpoint_path, strerror(errno));
+        exit(1);
+    }
+
+    off = seek_to_memory(fd_checkpoint);
+    if (off < 0) {
+        error_report("checkpoint %s: not a readable ELF memory dump",
+                     checkpoint_path);
+        exit(1);
+    }
+
+    pg = g_malloc0(TARGET_PAGE_SIZE);
+    for (; off < sb.st_size; off += TARGET_PAGE_SIZE) {
+        hash_offset_entry *entry;
+
+        if (hash_offset_entries >= max_hash_offset_entries) {
+            error_report("checkpoint %s: more pages than the lookup table "
+                         "can hold", checkpoint_path);
+            exit(1);
+        }
+        if (checkpoint_read_exact(fd_checkpoint, pg, TARGET_PAGE_SIZE) < 0) {
+            error_report("checkpoint %s: cannot read page at offset %" PRId64,
+                         checkpoint_path, (int64_t)off);
+            exit(1);
+        }
+
+        entry = &hash_offset_array[hash_offset_entries++];
+        SHA256(pg, TARGET_PAGE_SIZE, entry->hash);
+        entry->offset = off;
+        trace_init_checksum_lookup_table_hash(sha256s(entry->hash),
+                                              entry->offset);
+    }
+    g_free(pg);
+
+    if (hash_offset_entries > 0) {
+        qsort(hash_offset_array, hash_offset_entries,
+              sizeof(hash_offset_entry), cmp_hash_offset_entry);
+    }
+}
+
 static int dirty_rate_high_cnt;
 static uint64_t bitmap_sync_count;
 
@@ -1874,6 +2132,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap_sync_init();
     qemu_mutex_init(&migration_bitmap_mutex);
 
+    if (hashes) {
+        qsort(hashes, hashes_entries, SHA256_DIGEST_LENGTH, uint256_compare);
+    }
+
     if (migrate_use_xbzrle()) {
         XBZRLE_cache_lock();
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
diff --git a/qemu-options.hx b/qemu-options.hx
index 0eea4ee..1913375 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3557,6 +3557,15 @@ Dump json-encoded vmstate information for current machine type to file
 in @var{file}
 ETEXI
 
+DEF("checkpoint", HAS_ARG, QEMU_OPTION_checkpoint,
+    "-checkpoint file path to checkpoint file\n", QEMU_ARCH_ALL)
+STEXI
+@item -checkpoint @var{path}
+@findex -checkpoint
+Checkpoint file to use during incoming migrations. Reduces network
+traffic and total migration time.
+ETEXI
+
 DEFHEADING(Generic object creation)
 
 DEF("object", HAS_ARG, QEMU_OPTION_object,
diff --git a/trace-events b/trace-events
index 0b0ff02..eee060b 100644
--- a/trace-events
+++ b/trace-events
@@ -1264,6 +1264,9 @@ migration_throttle(void) ""
 ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
 ram_postcopy_send_discard_bitmap(void) ""
 ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx"
+allocate_checksum_table(uint64_t npages) "pages=%" PRIu64
+init_checksum_lookup_table_start(uint64_t ram_size) "ram_size=%" PRIu64
+init_checksum_lookup_table_hash(const char* hash, uint64_t offset) "hash=%s offset=%" PRIu64
 
 # hw/display/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
diff --git a/vl.c b/vl.c
index 525929b..2dfac86 100644
--- a/vl.c
+++ b/vl.c
@@ -138,6 +138,7 @@ int display_opengl;
 static int display_remote;
 const char* keyboard_layout = NULL;
 ram_addr_t ram_size;
+const char *checkpoint_path = NULL;
 const char *mem_path = NULL;
 int mem_prealloc = 0; /* force preallocation of physical target memory */
 bool enable_mlock = false;
@@ -3355,6 +3356,9 @@ int main(int argc, char **argv, char **envp)
                 }
                 break;
 #endif
+            case QEMU_OPTION_checkpoint:
+                checkpoint_path = optarg;
+                break;
             case QEMU_OPTION_mempath:
                 mem_path = optarg;
                 break;
@@ -4653,6 +4657,7 @@ int main(int argc, char **argv, char **envp)
         }
     }
 
+    allocate_checksum_table();
     qdev_prop_check_globals();
     if (vmstate_dump_file) {
         /* dump and exit */
@@ -4662,6 +4667,10 @@ int main(int argc, char **argv, char **envp)
 
     if (incoming) {
         Error *local_err = NULL;
+        if (checkpoint_path) {
+            init_checksum_lookup_table(checkpoint_path);
+        }
+
         qemu_start_incoming_migration(incoming, &local_err);
         if (local_err) {
             error_report("-incoming %s: %s", incoming,
-- 
2.4.10