On Wed, Feb 21, 2024 at 8:00 AM Elena Ufimtseva <ufimts...@gmail.com> wrote: > > > > On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang <hao.xi...@bytedance.com> wrote: >> >> 1. Implements the zero page detection and handling on the multifd >> threads for non-compression, zlib and zstd compression backends. >> 2. Added a new value 'multifd' in ZeroPageDetection enumeration. >> 3. Add proper asserts to ensure pages->normal are used for normal pages >> in all scenarios. >> >> Signed-off-by: Hao Xiang <hao.xi...@bytedance.com> >> --- >> migration/meson.build | 1 + >> migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++ >> migration/multifd-zlib.c | 26 ++++++++++++--- >> migration/multifd-zstd.c | 25 ++++++++++++--- >> migration/multifd.c | 50 +++++++++++++++++++++++------ >> migration/multifd.h | 7 +++++ >> qapi/migration.json | 4 ++- >> 7 files changed, 151 insertions(+), 21 deletions(-) >> create mode 100644 migration/multifd-zero-page.c >> >> diff --git a/migration/meson.build b/migration/meson.build >> index 92b1cc4297..1eeb915ff6 100644 >> --- a/migration/meson.build >> +++ b/migration/meson.build >> @@ -22,6 +22,7 @@ system_ss.add(files( >> 'migration.c', >> 'multifd.c', >> 'multifd-zlib.c', >> + 'multifd-zero-page.c', >> 'ram-compress.c', >> 'options.c', >> 'postcopy-ram.c', >> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c >> new file mode 100644 >> index 0000000000..f0cd8e2c53 >> --- /dev/null >> +++ b/migration/multifd-zero-page.c >> @@ -0,0 +1,59 @@ >> +/* >> + * Multifd zero page detection implementation. >> + * >> + * Copyright (c) 2024 Bytedance Inc >> + * >> + * Authors: >> + * Hao Xiang <hao.xi...@bytedance.com> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2 or later. >> + * See the COPYING file in the top-level directory. >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "qemu/cutils.h" >> +#include "exec/ramblock.h" >> +#include "migration.h" >> +#include "multifd.h" >> +#include "options.h" >> +#include "ram.h" >> + >> +void multifd_zero_page_check_send(MultiFDSendParams *p) >> +{ >> + /* >> + * QEMU older than 9.0 don't understand zero page >> + * on multifd channel. This switch is required to >> + * maintain backward compatibility. >> + */ >> + bool use_multifd_zero_page = >> + (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD); >> + MultiFDPages_t *pages = p->pages; >> + RAMBlock *rb = pages->block; >> + >> + assert(pages->num != 0); > > > Not needed, the check is done right before calling send_prepare. > >> >> + assert(pages->normal_num == 0); >> + assert(pages->zero_num == 0); > > > Why these asserts are needed?
The idea is that when multifd_zero_page_check_send is called, I want to make sure zero page checking was not processed on this packet before. It is perhaps redundant. It won't compile in free build. >> >> + >> >> + for (int i = 0; i < pages->num; i++) { >> + uint64_t offset = pages->offset[i]; >> + if (use_multifd_zero_page && >> + buffer_is_zero(rb->host + offset, p->page_size)) { >> + pages->zero[pages->zero_num] = offset; >> + pages->zero_num++; >> + ram_release_page(rb->idstr, offset); >> + } else { >> + pages->normal[pages->normal_num] = offset; >> + pages->normal_num++; >> + } >> + } >> +} >> + >> +void multifd_zero_page_check_recv(MultiFDRecvParams *p) >> +{ >> + for (int i = 0; i < p->zero_num; i++) { >> + void *page = p->host + p->zero[i]; >> + if (!buffer_is_zero(page, p->page_size)) { >> + memset(page, 0, p->page_size); >> + } >> + } >> +} >> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c >> index 012e3bdea1..cdfe0fa70e 100644 >> --- a/migration/multifd-zlib.c >> +++ b/migration/multifd-zlib.c >> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, >> Error **errp) >> int ret; >> uint32_t i; >> >> + multifd_zero_page_check_send(p); >> + >> + if (!pages->normal_num) { >> + p->next_packet_size = 0; >> + goto out; >> + } >> + >> multifd_send_prepare_header(p); >> >> - for (i = 0; i < pages->num; i++) { >> + for (i = 0; i < pages->normal_num; i++) { >> uint32_t available = z->zbuff_len - out_size; >> int flush = Z_NO_FLUSH; >> >> - if (i == pages->num - 1) { >> + if (i == pages->normal_num - 1) { >> flush = Z_SYNC_FLUSH; >> } >> >> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error >> **errp) >> * with compression. zlib does not guarantee that this is safe, >> * therefore copy the page before calling deflate(). >> */ >> - memcpy(z->buf, p->pages->block->host + pages->offset[i], >> p->page_size); >> + memcpy(z->buf, p->pages->block->host + pages->normal[i], >> p->page_size); >> zs->avail_in = p->page_size; >> zs->next_in = z->buf; >> >> @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, >> Error **errp) >> p->iov[p->iovs_num].iov_len = out_size; >> p->iovs_num++; >> p->next_packet_size = out_size; >> - p->flags |= MULTIFD_FLAG_ZLIB; >> >> +out: >> + p->flags |= MULTIFD_FLAG_ZLIB; >> multifd_send_fill_packet(p); >> - > > Spurious? We need to set the flag anyway otherwise the receiver side will complain. > >> return 0; >> } >> >> @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error >> **errp) >> p->id, flags, MULTIFD_FLAG_ZLIB); >> return -1; >> } >> + >> + multifd_zero_page_check_recv(p); >> + >> + if (!p->normal_num) { >> + assert(in_size == 0); >> + return 0; > > > return here will have no effect. Also, why is assert needed here? We return here so we don't end up calling qio_channel_read_all with buflen = 0. The assert makes sure that normal_num/next_packet_size states are consistent. > This change also does not seem to fit the description of the patch, probaby > separate patch will be better. These changes are needed to ensure basic functionalities are working after enabling the multifd zero page format. > >> >> + } >> + >> ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); >> >> if (ret != 0) { >> @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error >> **errp) >> p->id, out_size, expected_size); >> return -1; >> } >> + >> return 0; >> } >> >> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c >> index dc8fe43e94..27a1eba075 100644 >> --- a/migration/multifd-zstd.c >> +++ b/migration/multifd-zstd.c >> @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, >> Error **errp) >> int ret; >> uint32_t i; >> >> + multifd_zero_page_check_send(p); >> + >> + if (!pages->normal_num) { >> + p->next_packet_size = 0; >> + goto out; >> + } >> + >> multifd_send_prepare_header(p); >> >> z->out.dst = z->zbuff; >> z->out.size = z->zbuff_len; >> z->out.pos = 0; >> >> - for (i = 0; i < pages->num; i++) { >> + for (i = 0; i < pages->normal_num; i++) { >> ZSTD_EndDirective flush = ZSTD_e_continue; >> >> - if (i == pages->num - 1) { >> + if (i == pages->normal_num - 1) { >> flush = ZSTD_e_flush; >> } >> - z->in.src = p->pages->block->host + pages->offset[i]; >> + z->in.src = p->pages->block->host + pages->normal[i]; >> z->in.size = p->page_size; >> z->in.pos = 0; >> >> @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, >> Error **errp) >> p->iov[p->iovs_num].iov_len = z->out.pos; >> p->iovs_num++; >> p->next_packet_size = z->out.pos; >> - p->flags |= MULTIFD_FLAG_ZSTD; >> >> +out: >> + p->flags |= MULTIFD_FLAG_ZSTD; >> multifd_send_fill_packet(p); >> - > > Spurious removal. Can you elaborate on this? > >> >> return 0; >> } >> >> @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error >> **errp) >> p->id, flags, MULTIFD_FLAG_ZSTD); >> return -1; >> } >> + >> + multifd_zero_page_check_recv(p); >> + >> + if (!p->normal_num) { >> + assert(in_size == 0); >> + return 0; >> + } >> + > > Same question here about assert. > >> >> ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); >> >> if (ret != 0) { >> diff --git a/migration/multifd.c b/migration/multifd.c >> index a33dba40d9..fbb40ea10b 100644 >> --- a/migration/multifd.c >> +++ b/migration/multifd.c >> @@ -11,6 +11,7 @@ >> */ >> >> #include "qemu/osdep.h" >> +#include "qemu/cutils.h" >> #include "qemu/rcu.h" >> #include "exec/target_page.h" >> #include "sysemu/sysemu.h" >> @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, >> Error **errp) >> MultiFDPages_t *pages = p->pages; >> int ret; >> >> + multifd_zero_page_check_send(p); >> + >> if (!use_zero_copy_send) { >> /* >> * Only !zerocopy needs the header in IOV; zerocopy will >> @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, >> Error **errp) >> multifd_send_prepare_header(p); >> } >> >> - for (int i = 0; i < pages->num; i++) { >> - p->iov[p->iovs_num].iov_base = pages->block->host + >> pages->offset[i]; >> + for (int i = 0; i < pages->normal_num; i++) { >> + p->iov[p->iovs_num].iov_base = pages->block->host + >> pages->normal[i]; >> p->iov[p->iovs_num].iov_len = p->page_size; >> p->iovs_num++; >> } >> >> - p->next_packet_size = pages->num * p->page_size; >> + p->next_packet_size = pages->normal_num * p->page_size; >> p->flags |= MULTIFD_FLAG_NOCOMP; >> >> multifd_send_fill_packet(p); >> @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, >> Error **errp) >> p->id, flags, MULTIFD_FLAG_NOCOMP); >> return -1; >> } >> + >> + multifd_zero_page_check_recv(p); >> + >> + if (!p->normal_num) { >> + return 0; >> + } >> + >> for (int i = 0; i < p->normal_num; i++) { >> p->iov[i].iov_base = p->host + p->normal[i]; >> p->iov[i].iov_len = p->page_size; >> @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p) >> >> packet->flags = cpu_to_be32(p->flags); >> packet->pages_alloc = cpu_to_be32(p->pages->allocated); >> - packet->normal_pages = cpu_to_be32(pages->num); >> + packet->normal_pages = cpu_to_be32(pages->normal_num); >> packet->zero_pages = cpu_to_be32(pages->zero_num); >> packet->next_packet_size = cpu_to_be32(p->next_packet_size); >> >> @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p) >> strncpy(packet->ramblock, pages->block->idstr, 256); >> } >> >> - for (i = 0; i < pages->num; i++) { >> + for (i = 0; i < pages->normal_num; i++) { >> /* there are architectures where ram_addr_t is 32 bit */ >> - uint64_t temp = pages->offset[i]; >> + uint64_t temp = pages->normal[i]; >> >> packet->offset[i] = cpu_to_be64(temp); >> } >> >> + for (i = 0; i < pages->zero_num; i++) { >> + /* there are architectures where ram_addr_t is 32 bit */ >> + uint64_t temp = pages->zero[i]; >> + >> + packet->offset[pages->normal_num + i] = cpu_to_be64(temp); >> + } >> + >> p->packets_sent++; >> - p->total_normal_pages += pages->num; >> + p->total_normal_pages += pages->normal_num; >> p->total_zero_pages += pages->zero_num; >> >> - trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num, >> + trace_multifd_send(p->id, packet_num, pages->normal_num, >> pages->zero_num, >> p->flags, p->next_packet_size); >> } >> >> @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams >> *p, Error **errp) >> p->normal[i] = offset; >> } >> >> + for (i = 0; i < p->zero_num; i++) { >> + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); >> + >> + if (offset > (p->block->used_length - p->page_size)) { >> + error_setg(errp, "multifd: offset too long %" PRIu64 >> + " (max " RAM_ADDR_FMT ")", >> + offset, p->block->used_length); >> + return -1; >> + } >> + p->zero[i] = offset; >> + } >> + >> return 0; >> } >> >> @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque) >> >> stat64_add(&mig_stats.multifd_bytes, >> p->next_packet_size + p->packet_len); >> - stat64_add(&mig_stats.normal_pages, pages->num); >> + stat64_add(&mig_stats.normal_pages, pages->normal_num); >> stat64_add(&mig_stats.zero_pages, pages->zero_num); >> >> multifd_pages_reset(p->pages); >> @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque) >> p->flags &= ~MULTIFD_FLAG_SYNC; >> qemu_mutex_unlock(&p->mutex); >> >> - if (p->normal_num) { >> + if (p->normal_num + p->zero_num) { >> + assert(!(flags & MULTIFD_FLAG_SYNC)); > > This assertion seems to be not relevant to this patch. Could you post it > separately and explain why it's needed here? I thought this is nice to have because it ensures that sync packet and data packet are mutually exclusive. I agree this is not needed for things to work but does this deserve its own patch? > >> >> ret = multifd_recv_state->ops->recv_pages(p, &local_err); >> if (ret != 0) { >> break; >> diff --git a/migration/multifd.h b/migration/multifd.h >> index 9822ff298a..125f0bbe60 100644 >> --- a/migration/multifd.h >> +++ b/migration/multifd.h >> @@ -53,6 +53,11 @@ typedef struct { >> uint32_t unused32[1]; /* Reserved for future use */ >> uint64_t unused64[3]; /* Reserved for future use */ >> char ramblock[256]; >> + /* >> + * This array contains the pointers to: >> + * - normal pages (initial normal_pages entries) >> + * - zero pages (following zero_pages entries) >> + */ >> uint64_t offset[]; >> } __attribute__((packed)) MultiFDPacket_t; >> >> @@ -224,6 +229,8 @@ typedef struct { >> >> void multifd_register_ops(int method, MultiFDMethods *ops); >> void multifd_send_fill_packet(MultiFDSendParams *p); >> +void multifd_zero_page_check_send(MultiFDSendParams *p); >> +void multifd_zero_page_check_recv(MultiFDRecvParams *p); >> >> static inline void multifd_send_prepare_header(MultiFDSendParams *p) >> { >> diff --git a/qapi/migration.json b/qapi/migration.json >> index 99843a8e95..e2450b92d4 100644 >> --- a/qapi/migration.json >> +++ b/qapi/migration.json >> @@ -660,9 +660,11 @@ >> # >> # @none: Do not perform zero page checking. >> # >> +# @multifd: Perform zero page checking on the multifd sender thread. (since >> 9.0) >> +# >> ## >> { 'enum': 'ZeroPageDetection', >> - 'data': [ 'legacy', 'none' ] } >> + 'data': [ 'legacy', 'none', 'multifd' ] } >> >> ## >> # @BitmapMigrationBitmapAliasTransform: >> -- >> 2.30.2 >> >> > > > -- > Elena