Re: [PATCH v4 1/1] oslib-posix: initialize backend memory objects in parallel

Dongli Zhang Sat, 03 Feb 2024 14:44:28 -0800

On 1/31/24 08:53, Mark Kanda wrote:
> QEMU initializes preallocated backend memory as the objects are parsed from
> the command line. This is not optimal in some cases (e.g. memory spanning
> multiple NUMA nodes) because the memory objects are initialized in series.
> 
> Allow the initialization to occur in parallel (asynchronously). In order to
> ensure optimal thread placement, asynchronous initialization requires prealloc
> context threads to be in use.
> 
> Signed-off-by: Mark Kanda <mark.ka...@oracle.com>
> Signed-off-by: David Hildenbrand <da...@redhat.com>
> ---
>  backends/hostmem.c     |   7 ++-
>  hw/virtio/virtio-mem.c |   4 +-
>  include/hw/qdev-core.h |   5 ++
>  include/qemu/osdep.h   |  18 +++++-
>  system/vl.c            |   9 +++
>  util/oslib-posix.c     | 131 +++++++++++++++++++++++++++++++----------
>  util/oslib-win32.c     |   8 ++-
>  7 files changed, 145 insertions(+), 37 deletions(-)
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 30f69b2cb5..17221e422a 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -20,6 +20,7 @@
>  #include "qom/object_interfaces.h"
>  #include "qemu/mmap-alloc.h"
>  #include "qemu/madvise.h"
> +#include "hw/qdev-core.h"
>  
>  #ifdef CONFIG_NUMA
>  #include <numaif.h>
> @@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, 
> bool value,
>          uint64_t sz = memory_region_size(&backend->mr);
>  
>          if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
> -                               backend->prealloc_context, errp)) {
> +                               backend->prealloc_context, false, errp)) {
>              return;
>          }
>          backend->prealloc = true;
> @@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, 
> Error **errp)
>      HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
>      void *ptr;
>      uint64_t sz;
> +    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
>  
>      if (!bc->alloc) {
>          return;
> @@ -398,7 +400,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, 
> Error **errp)
>      if (backend->prealloc && 
> !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
>                                                  ptr, sz,
>                                                  backend->prealloc_threads,
> -                                                backend->prealloc_context, 
> errp)) {
> +                                                backend->prealloc_context,
> +                                                async, errp)) {
>          return;
>      }
>  }
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index 99ab989852..ffd119ebac 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, 
> uint64_t start_gpa,
>          int fd = memory_region_get_fd(&vmem->memdev->mr);
>          Error *local_err = NULL;
>  
> -        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
> +        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
>              static bool warned;
>  
>              /*
> @@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM 
> *vmem, void *arg,
>      int fd = memory_region_get_fd(&vmem->memdev->mr);
>      Error *local_err = NULL;
>  
> -    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
> +    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
>          error_report_err(local_err);
>          return -ENOMEM;
>      }
> diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> index 151d968238..83dd9e2485 100644
> --- a/include/hw/qdev-core.h
> +++ b/include/hw/qdev-core.h
> @@ -1071,6 +1071,11 @@ typedef enum MachineInitPhase {
>       */
>      PHASE_ACCEL_CREATED,
>  
> +    /*
> +     * Late backend objects have been created and initialized.
> +     */
> +    PHASE_LATE_BACKENDS_CREATED,
> +
>      /*
>       * machine_class->init has been called, thus creating any embedded
>       * devices and validating machine properties.  Devices created at
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index c9692cc314..7d359dabc4 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
>   * @area: start address of the are to preallocate
>   * @sz: the size of the area to preallocate
>   * @max_threads: maximum number of threads to use
> + * @tc: prealloc context threads pointer, NULL if not in use
> + * @async: request asynchronous preallocation, requires @tc
>   * @errp: returns an error if this function fails
>   *
>   * Preallocate memory (populate/prefault page tables writable) for the 
> virtual
> @@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
>   * each page in the area was faulted in writable at least once, for example,
>   * after allocating file blocks for mapped files.
>   *
> + * When setting @async, allocation might be performed asynchronously.
> + * qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
> + * preallocation.
> + *
>   * Return: true on success, else false setting @errp with error.
>   */
>  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp);
> +                       ThreadContext *tc, bool async, Error **errp);
> +
> +/**
> + * qemu_finish_async_prealloc_mem:
> + * @errp: returns an error if this function fails
> + *
> + * Finish all outstanding asynchronous memory preallocation.
> + *
> + * Return: true on success, else false setting @errp with error.
> + */
> +bool qemu_finish_async_prealloc_mem(Error **errp);
>  
>  /**
>   * qemu_get_pid_name:
> diff --git a/system/vl.c b/system/vl.c
> index 788d88ea03..e6bc5d9dd9 100644
> --- a/system/vl.c
> +++ b/system/vl.c
> @@ -2009,6 +2009,14 @@ static void qemu_create_late_backends(void)
>  
>      object_option_foreach_add(object_create_late);
>  
> +    /*
> +     * Wait for any outstanding memory prealloc from created memory
> +     * backends to complete.
> +     */
> +    if (!qemu_finish_async_prealloc_mem(&error_fatal)) {
> +        exit(1);
> +    }
> +
>      if (tpm_init() < 0) {
>          exit(1);
>      }
> @@ -3695,6 +3703,7 @@ void qemu_init(int argc, char **argv)
>       * over memory-backend-file objects).
>       */
>      qemu_create_late_backends();
> +    phase_advance(PHASE_LATE_BACKENDS_CREATED);
>  
>      /*
>       * Note: creates a QOM object, must run only after global and
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 7c297003b9..dada4722f6 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -42,6 +42,7 @@
>  #include "qemu/cutils.h"
>  #include "qemu/units.h"
>  #include "qemu/thread-context.h"
> +#include "qemu/main-loop.h"
>  
>  #ifdef CONFIG_LINUX
>  #include <sys/syscall.h>
> @@ -63,11 +64,15 @@
>  
>  struct MemsetThread;
>  
> +static QLIST_HEAD(, MemsetContext) memset_contexts =
> +    QLIST_HEAD_INITIALIZER(memset_contexts);
> +
>  typedef struct MemsetContext {
>      bool all_threads_created;
>      bool any_thread_failed;
>      struct MemsetThread *threads;
>      int num_threads;
> +    QLIST_ENTRY(MemsetContext) next;
>  } MemsetContext;
>  
>  struct MemsetThread {
> @@ -412,19 +417,44 @@ static inline int get_memset_num_threads(size_t 
> hpagesize, size_t numpages,
>      return ret;
>  }
>  
> +static int wait_and_free_mem_prealloc_context(MemsetContext *context)
> +{
> +    int i, ret = 0, tmp;
> +
> +    for (i = 0; i < context->num_threads; i++) {
> +        tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
> +
> +        if (tmp) {
> +            ret = tmp;
> +        }
> +    }
> +    g_free(context->threads);
> +    g_free(context);
> +    return ret;
> +}
> +
>  static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> -                           int max_threads, ThreadContext *tc,
> +                           int max_threads, ThreadContext *tc, bool async,
>                             bool use_madv_populate_write)
>  {
>      static gsize initialized = 0;
> -    MemsetContext context = {
> -        .num_threads = get_memset_num_threads(hpagesize, numpages, 
> max_threads),
> -    };
> +    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
>      size_t numpages_per_thread, leftover;
>      void *(*touch_fn)(void *);
> -    int ret = 0, i = 0;
> +    int ret, i = 0;
>      char *addr = area;
>  
> +    /*
> +     * Asynchronous preallocation is only allowed when using 
> MADV_POPULATE_WRITE
> +     * and prealloc context for thread placement.
> +     */
> +    if (!use_madv_populate_write || !tc) {
> +        async = false;
> +    }
> +
> +    context->num_threads =
> +        get_memset_num_threads(hpagesize, numpages, max_threads);
> +
>      if (g_once_init_enter(&initialized)) {
>          qemu_mutex_init(&page_mutex);
>          qemu_cond_init(&page_cond);
> @@ -432,8 +462,11 @@ static int touch_all_pages(char *area, size_t hpagesize, 
> size_t numpages,
>      }
>  
>      if (use_madv_populate_write) {
> -        /* Avoid creating a single thread for MADV_POPULATE_WRITE */
> -        if (context.num_threads == 1) {
> +        /*
> +         * Avoid creating a single thread for MADV_POPULATE_WRITE when
> +         * preallocating synchronously.
> +         */
> +        if (context->num_threads == 1 && !async) {
>              if (qemu_madvise(area, hpagesize * numpages,
>                               QEMU_MADV_POPULATE_WRITE)) {
>                  return -errno;
> @@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t 
> hpagesize, size_t numpages,
>          touch_fn = do_touch_pages;
>      }
>  
> -    context.threads = g_new0(MemsetThread, context.num_threads);
> -    numpages_per_thread = numpages / context.num_threads;
> -    leftover = numpages % context.num_threads;
> -    for (i = 0; i < context.num_threads; i++) {
> -        context.threads[i].addr = addr;
> -        context.threads[i].numpages = numpages_per_thread + (i < leftover);
> -        context.threads[i].hpagesize = hpagesize;
> -        context.threads[i].context = &context;
> +    context->threads = g_new0(MemsetThread, context->num_threads);
> +    numpages_per_thread = numpages / context->num_threads;
> +    leftover = numpages % context->num_threads;
> +    for (i = 0; i < context->num_threads; i++) {
> +        context->threads[i].addr = addr;
> +        context->threads[i].numpages = numpages_per_thread + (i < leftover);
> +        context->threads[i].hpagesize = hpagesize;
> +        context->threads[i].context = context;
>          if (tc) {
> -            thread_context_create_thread(tc, &context.threads[i].pgthread,
> +            thread_context_create_thread(tc, &context->threads[i].pgthread,
>                                           "touch_pages",
> -                                         touch_fn, &context.threads[i],
> +                                         touch_fn, &context->threads[i],
>                                           QEMU_THREAD_JOINABLE);
>          } else {
> -            qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
> -                               touch_fn, &context.threads[i],
> +            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
> +                               touch_fn, &context->threads[i],
>                                 QEMU_THREAD_JOINABLE);
>          }
> -        addr += context.threads[i].numpages * hpagesize;
> +        addr += context->threads[i].numpages * hpagesize;
> +    }
> +
> +    if (async) {
> +        /*
> +         * async requests currently require the BQL. Add it to the list and 
> kick
> +         * preallocation off during qemu_finish_async_prealloc_mem().
> +         */
> +        assert(bql_locked());
> +        QLIST_INSERT_HEAD(&memset_contexts, context, next);
> +        return 0;
>      }
>  
>      if (!use_madv_populate_write) {
> -        sigbus_memset_context = &context;
> +        sigbus_memset_context = context;
>      }
>  
>      qemu_mutex_lock(&page_mutex);
> -    context.all_threads_created = true;
> +    context->all_threads_created = true;
>      qemu_cond_broadcast(&page_cond);
>      qemu_mutex_unlock(&page_mutex);
>  
> -    for (i = 0; i < context.num_threads; i++) {
> -        int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
> +    ret = wait_and_free_mem_prealloc_context(context);
>  
> +    if (!use_madv_populate_write) {
> +        sigbus_memset_context = NULL;
> +    }
> +    return ret;
> +}
> +
> +bool qemu_finish_async_prealloc_mem(Error **errp)
> +{
> +    int ret, tmp;

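Should 'ret' above be initialized (e.g. to 0)? It is only assigned inside the loop when a context reports a failure, yet it is read unconditionally by the 'if (ret)' check afterwards.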

I did a build test and encountered:

In file included from ../util/oslib-posix.c:36:
../util/oslib-posix.c: In function ‘qemu_finish_async_prealloc_mem’:
/home/libvirt/vm/software/qemu/include/qapi/error.h:334:5: error: ‘ret’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
  334 |     error_setg_errno_internal((errp), __FILE__, __LINE__, __func__,     \
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~
../util/oslib-posix.c:531:9: note: ‘ret’ was declared here
  531 |     int ret, tmp;
      |         ^~~
cc1: all warnings being treated as errors
ninja: build stopped: subcommand failed.
make: *** [Makefile:162: run-ninja] Error 1

Thank you very much!

Dongli Zhang

> +    MemsetContext *context, *next_context;
> +
> +    /* Waiting for preallocation requires the BQL. */
> +    assert(bql_locked());
> +    if (QLIST_EMPTY(&memset_contexts)) {
> +        return true;
> +    }
> +
> +    qemu_mutex_lock(&page_mutex);
> +    QLIST_FOREACH(context, &memset_contexts, next) {
> +        context->all_threads_created = true;
> +    }
> +    qemu_cond_broadcast(&page_cond);
> +    qemu_mutex_unlock(&page_mutex);
> +
> +    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
> +        QLIST_REMOVE(context, next);
> +        tmp = wait_and_free_mem_prealloc_context(context);
>          if (tmp) {
>              ret = tmp;
>          }
>      }
>  
> -    if (!use_madv_populate_write) {
> -        sigbus_memset_context = NULL;
> +    if (ret) {
> +        error_setg_errno(errp, -ret,
> +                         "qemu_prealloc_mem: preallocating memory failed");
> +        return false;
>      }
> -    g_free(context.threads);
> -
> -    return ret;
> +    return true;
>  }
>  
>  static bool madv_populate_write_possible(char *area, size_t pagesize)
> @@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area, 
> size_t pagesize)
>  }
>  
>  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp)
> +                       ThreadContext *tc, bool async, Error **errp)
>  {
>      static gsize initialized;
>      int ret;
> @@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int 
> max_threads,
>      }
>  
>      /* touch pages simultaneously */
> -    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
> +    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
>                            use_madv_populate_write);
>      if (ret) {
>          error_setg_errno(errp, -ret,
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index c4a5f05a49..b623830d62 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -265,7 +265,7 @@ int getpagesize(void)
>  }
>  
>  bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
> -                       ThreadContext *tc, Error **errp)
> +                       ThreadContext *tc, bool async, Error **errp)
>  {
>      int i;
>      size_t pagesize = qemu_real_host_page_size();
> @@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, 
> int max_threads,
>      return true;
>  }
>  
> +bool qemu_finish_async_prealloc_mem(Error **errp)
> +{
> +    /* async prealloc not supported, there is nothing to finish */
> +    return true;
> +}
> +
>  char *qemu_get_pid_name(pid_t pid)
>  {
>      /* XXX Implement me */
Re: [PATCH v4 1/1] oslib-posix: initialize backend memory objects in parallel

Reply via email to