On 1/31/24 08:53, Mark Kanda wrote:
> QEMU initializes preallocated backend memory as the objects are parsed from
> the command line. This is not optimal in some cases (e.g. memory spanning
> multiple NUMA nodes) because the memory objects are initialized in series.
>
> Allow the initialization to occur in parallel (asynchronously). In order to
> ensure optimal thread placement, asynchronous initialization requires prealloc
> context threads to be in use.
>
> Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
> Signed-off-by: David Hildenbrand <da...@redhat.com>
> ---
> backends/hostmem.c | 7 ++-
> hw/virtio/virtio-mem.c | 4 +-
> include/hw/qdev-core.h | 5 ++
> include/qemu/osdep.h | 18 +++++-
> system/vl.c | 9 +++
> util/oslib-posix.c | 131 +++++++++++++++++++++++++++++++----------
> util/oslib-win32.c | 8 ++-
> 7 files changed, 145 insertions(+), 37 deletions(-)
>
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 30f69b2cb5..17221e422a 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -20,6 +20,7 @@
> #include "qom/object_interfaces.h"
> #include "qemu/mmap-alloc.h"
> #include "qemu/madvise.h"
> +#include "hw/qdev-core.h"
>
> #ifdef CONFIG_NUMA
> #include <numaif.h>
> @@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj,
> bool value,
> uint64_t sz = memory_region_size(&backend->mr);
>
> if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
> - backend->prealloc_context, errp)) {
> + backend->prealloc_context, false, errp)) {
> return;
> }
> backend->prealloc = true;
> @@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc,
> Error **errp)
> HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
> void *ptr;
> uint64_t sz;
> + bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
>
> if (!bc->alloc) {
> return;
> @@ -398,7 +400,8 @@ host_memory_backend_memory_complete(UserCreatable *uc,
> Error **errp)
> if (backend->prealloc &&
> !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
> ptr, sz,
> backend->prealloc_threads,
> - backend->prealloc_context,
> errp)) {
> + backend->prealloc_context,
> + async, errp)) {
> return;
> }
> }
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index 99ab989852..ffd119ebac 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem,
> uint64_t start_gpa,
> int fd = memory_region_get_fd(&vmem->memdev->mr);
> Error *local_err = NULL;
>
> - if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
> + if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
> static bool warned;
>
> /*
> @@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM
> *vmem, void *arg,
> int fd = memory_region_get_fd(&vmem->memdev->mr);
> Error *local_err = NULL;
>
> - if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
> + if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
> error_report_err(local_err);
> return -ENOMEM;
> }
> diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> index 151d968238..83dd9e2485 100644
> --- a/include/hw/qdev-core.h
> +++ b/include/hw/qdev-core.h
> @@ -1071,6 +1071,11 @@ typedef enum MachineInitPhase {
> */
> PHASE_ACCEL_CREATED,
>
> + /*
> + * Late backend objects have been created and initialized.
> + */
> + PHASE_LATE_BACKENDS_CREATED,
> +
> /*
> * machine_class->init has been called, thus creating any embedded
> * devices and validating machine properties. Devices created at
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index c9692cc314..7d359dabc4 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
> * @area: start address of the are to preallocate
> * @sz: the size of the area to preallocate
> * @max_threads: maximum number of threads to use
> + * @tc: prealloc context threads pointer, NULL if not in use
> + * @async: request asynchronous preallocation, requires @tc
> * @errp: returns an error if this function fails
> *
> * Preallocate memory (populate/prefault page tables writable) for the
> virtual
> @@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
> * each page in the area was faulted in writable at least once, for example,
> * after allocating file blocks for mapped files.
> *
> + * When setting @async, allocation might be performed asynchronously.
> + * qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
> + * preallocation.
> + *
> * Return: true on success, else false setting @errp with error.
> */
> bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> - ThreadContext *tc, Error **errp);
> + ThreadContext *tc, bool async, Error **errp);
> +
> +/**
> + * qemu_finish_async_prealloc_mem:
> + * @errp: returns an error if this function fails
> + *
> + * Finish all outstanding asynchronous memory preallocation.
> + *
> + * Return: true on success, else false setting @errp with error.
> + */
> +bool qemu_finish_async_prealloc_mem(Error **errp);
>
> /**
> * qemu_get_pid_name:
> diff --git a/system/vl.c b/system/vl.c
> index 788d88ea03..e6bc5d9dd9 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
>
> object_option_foreach_add(object_create_late);
>
> + /*
> + * Wait for any outstanding memory prealloc from created memory
> + * backends to complete.
> + */
> + if (!qemu_finish_async_prealloc_mem(&error_fatal)) {
> + exit(1);
> + }
> +
> if (tpm_init() < 0) {
> exit(1);
> }
> @@ -3695,6 +3703,7 @@ void qemu_init(int argc, char **argv)
> * over memory-backend-file objects).
> */
> qemu_create_late_backends();
> + phase_advance(PHASE_LATE_BACKENDS_CREATED);
>
> /*
> * Note: creates a QOM object, must run only after global and
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 7c297003b9..dada4722f6 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -42,6 +42,7 @@
> #include "qemu/cutils.h"
> #include "qemu/units.h"
> #include "qemu/thread-context.h"
> +#include "qemu/main-loop.h"
>
> #ifdef CONFIG_LINUX
> #include <sys/syscall.h>
> @@ -63,11 +64,15 @@
>
> struct MemsetThread;
>
> +static QLIST_HEAD(, MemsetContext) memset_contexts =
> + QLIST_HEAD_INITIALIZER(memset_contexts);
> +
> typedef struct MemsetContext {
> bool all_threads_created;
> bool any_thread_failed;
> struct MemsetThread *threads;
> int num_threads;
> + QLIST_ENTRY(MemsetContext) next;
> } MemsetContext;
>
> struct MemsetThread {
> @@ -412,19 +417,44 @@ static inline int get_memset_num_threads(size_t
> hpagesize, size_t numpages,
> return ret;
> }
>
> +static int wait_and_free_mem_prealloc_context(MemsetContext *context)
> +{
> + int i, ret = 0, tmp;
> +
> + for (i = 0; i < context->num_threads; i++) {
> + tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
> +
> + if (tmp) {
> + ret = tmp;
> + }
> + }
> + g_free(context->threads);
> + g_free(context);
> + return ret;
> +}
> +
> static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> - int max_threads, ThreadContext *tc,
> + int max_threads, ThreadContext *tc, bool async,
> bool use_madv_populate_write)
> {
> static gsize initialized = 0;
> - MemsetContext context = {
> - .num_threads = get_memset_num_threads(hpagesize, numpages,
> max_threads),
> - };
> + MemsetContext *context = g_malloc0(sizeof(MemsetContext));
> size_t numpages_per_thread, leftover;
> void *(*touch_fn)(void *);
> - int ret = 0, i = 0;
> + int ret, i = 0;
> char *addr = area;
>
> + /*
> + * Asynchronous preallocation is only allowed when using
> MADV_POPULATE_WRITE
> + * and prealloc context for thread placement.
> + */
> + if (!use_madv_populate_write || !tc) {
> + async = false;
> + }
> +
> + context->num_threads =
> + get_memset_num_threads(hpagesize, numpages, max_threads);
> +
> if (g_once_init_enter(&initialized)) {
> qemu_mutex_init(&page_mutex);
> qemu_cond_init(&page_cond);
> @@ -432,8 +462,11 @@ static int touch_all_pages(char *area, size_t hpagesize,
> size_t numpages,
> }
>
> if (use_madv_populate_write) {
> - /* Avoid creating a single thread for MADV_POPULATE_WRITE */
> - if (context.num_threads == 1) {
> + /*
> + * Avoid creating a single thread for MADV_POPULATE_WRITE when
> + * preallocating synchronously.
> + */
> + if (context->num_threads == 1 && !async) {
> if (qemu_madvise(area, hpagesize * numpages,
> QEMU_MADV_POPULATE_WRITE)) {
> return -errno;
> @@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t
> hpagesize, size_t numpages,
> touch_fn = do_touch_pages;
> }
>
> - context.threads = g_new0(MemsetThread, context.num_threads);
> - numpages_per_thread = numpages / context.num_threads;
> - leftover = numpages % context.num_threads;
> - for (i = 0; i < context.num_threads; i++) {
> - context.threads[i].addr = addr;
> - context.threads[i].numpages = numpages_per_thread + (i < leftover);
> - context.threads[i].hpagesize = hpagesize;
> - context.threads[i].context = &context;
> + context->threads = g_new0(MemsetThread, context->num_threads);
> + numpages_per_thread = numpages / context->num_threads;
> + leftover = numpages % context->num_threads;
> + for (i = 0; i < context->num_threads; i++) {
> + context->threads[i].addr = addr;
> + context->threads[i].numpages = numpages_per_thread + (i < leftover);
> + context->threads[i].hpagesize = hpagesize;
> + context->threads[i].context = context;
> if (tc) {
> - thread_context_create_thread(tc, &context.threads[i].pgthread,
> + thread_context_create_thread(tc, &context->threads[i].pgthread,
> "touch_pages",
> - touch_fn, &context.threads[i],
> + touch_fn, &context->threads[i],
> QEMU_THREAD_JOINABLE);
> } else {
> - qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
> - touch_fn, &context.threads[i],
> + qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
> + touch_fn, &context->threads[i],
> QEMU_THREAD_JOINABLE);
> }
> - addr += context.threads[i].numpages * hpagesize;
> + addr += context->threads[i].numpages * hpagesize;
> + }
> +
> + if (async) {
> + /*
> + * async requests currently require the BQL. Add it to the list and
> kick
> + * preallocation off during qemu_finish_async_prealloc_mem().
> + */
> + assert(bql_locked());
> + QLIST_INSERT_HEAD(&memset_contexts, context, next);
> + return 0;
> }
>
> if (!use_madv_populate_write) {
> - sigbus_memset_context = &context;
> + sigbus_memset_context = context;
> }
>
> qemu_mutex_lock(&page_mutex);
> - context.all_threads_created = true;
> + context->all_threads_created = true;
> qemu_cond_broadcast(&page_cond);
> qemu_mutex_unlock(&page_mutex);
>
> - for (i = 0; i < context.num_threads; i++) {
> - int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
> + ret = wait_and_free_mem_prealloc_context(context);
>
> + if (!use_madv_populate_write) {
> + sigbus_memset_context = NULL;
> + }
> + return ret;
> +}
> +
> +bool qemu_finish_async_prealloc_mem(Error **errp)
> +{
> + int ret, tmp;
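Shouldn't 'ret' above be initialized? It is only assigned inside the loop when a context fails, so it is read uninitialized at "if (ret)" when every preallocation succeeds.
I ran a build test and encountered the following error: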
In file included from ../util/oslib-posix.c:36:
../util/oslib-posix.c: In function ‘qemu_finish_async_prealloc_mem’:
/home/libvirt/vm/software/qemu/include/qapi/error.h:334:5: error: ‘ret’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
334 | error_setg_errno_internal((errp), __FILE__, __LINE__, __func__, \
| ^~~~~~~~~~~~~~~~~~~~~~~~~
../util/oslib-posix.c:531:9: note: ‘ret’ was declared here
531 | int ret, tmp;
| ^~~
cc1: all warnings being treated as errors
ninja: build stopped: subcommand failed.
make: *** [Makefile:162: run-ninja] Error 1
Thank you very much!
Dongli Zhang
> + MemsetContext *context, *next_context;
> +
> + /* Waiting for preallocation requires the BQL. */
> + assert(bql_locked());
> + if (QLIST_EMPTY(&memset_contexts)) {
> + return true;
> + }
> +
> + qemu_mutex_lock(&page_mutex);
> + QLIST_FOREACH(context, &memset_contexts, next) {
> + context->all_threads_created = true;
> + }
> + qemu_cond_broadcast(&page_cond);
> + qemu_mutex_unlock(&page_mutex);
> +
> + QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
> + QLIST_REMOVE(context, next);
> + tmp = wait_and_free_mem_prealloc_context(context);
> if (tmp) {
> ret = tmp;
> }
> }
>
> - if (!use_madv_populate_write) {
> - sigbus_memset_context = NULL;
> + if (ret) {
> + error_setg_errno(errp, -ret,
> + "qemu_prealloc_mem: preallocating memory failed");
> + return false;
> }
> - g_free(context.threads);
> -
> - return ret;
> + return true;
> }
>
> static bool madv_populate_write_possible(char *area, size_t pagesize)
> @@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area,
> size_t pagesize)
> }
>
> bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> - ThreadContext *tc, Error **errp)
> + ThreadContext *tc, bool async, Error **errp)
> {
> static gsize initialized;
> int ret;
> @@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int
> max_threads,
> }
>
> /* touch pages simultaneously */
> - ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
> + ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
> use_madv_populate_write);
> if (ret) {
> error_setg_errno(errp, -ret,
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index c4a5f05a49..b623830d62 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -265,7 +265,7 @@ int getpagesize(void)
> }
>
> bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> - ThreadContext *tc, Error **errp)
> + ThreadContext *tc, bool async, Error **errp)
> {
> int i;
> size_t pagesize = qemu_real_host_page_size();
> @@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz,
> int max_threads,
> return true;
> }
>
> +bool qemu_finish_async_prealloc_mem(Error **errp)
> +{
> + /* async prealloc not supported, there is nothing to finish */
> + return true;
> +}
> +
> char *qemu_get_pid_name(pid_t pid)
> {
> /* XXX Implement me */