Let's sense support and use it for preallocation. MADV_POPULATE_WRITE does not require a SIGBUS handler, doesn't actually touch page content, and avoids context switches; it is, therefore, faster and easier to handle than our current approach.
This resolves the TODO in do_touch_pages(). Signed-off-by: David Hildenbrand <da...@redhat.com> --- include/qemu/osdep.h | 7 ++++ util/oslib-posix.c | 84 +++++++++++++++++++++++++++++++++----------- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 60718fc342..d1660d67fa 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -471,6 +471,11 @@ static inline void qemu_cleanup_generic_vfree(void *p) #else #define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED #endif +#ifdef MADV_POPULATE_WRITE +#define QEMU_MADV_POPULATE_WRITE MADV_POPULATE_WRITE +#else +#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID +#endif #elif defined(CONFIG_POSIX_MADVISE) @@ -484,6 +489,7 @@ static inline void qemu_cleanup_generic_vfree(void *p) #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED +#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID #else /* no-op */ @@ -497,6 +503,7 @@ static inline void qemu_cleanup_generic_vfree(void *p) #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_REMOVE QEMU_MADV_INVALID +#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID #endif diff --git a/util/oslib-posix.c b/util/oslib-posix.c index e8bdb02e1d..679796ac1f 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg) * * 'volatile' to stop compiler optimizing this away * to a no-op - * - * TODO: get a better solution from kernel so we - * don't need to write at all so we don't cause - * wear on the storage backing the region... */ *(volatile char *)addr = *addr; addr += hpagesize; @@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg) return NULL; } +static void *do_madv_populate_write_pages(void *arg) +{ + MemsetThread *memset_args = (MemsetThread *)arg; + const size_t size = memset_args->numpages * memset_args->hpagesize; + char * const addr = memset_args->addr; + int ret; + + /* See do_touch_pages(). */ + qemu_mutex_lock(&page_mutex); + while (!threads_created_flag) { + qemu_cond_wait(&page_cond, &page_mutex); + } + qemu_mutex_unlock(&page_mutex); + + ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE); + if (ret) { + memset_thread_failed = true; + } + return NULL; +} + static inline int get_memset_num_threads(int smp_cpus) { long host_procs = sysconf(_SC_NPROCESSORS_ONLN); @@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus) } static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, - int smp_cpus) + int smp_cpus, bool use_madv_populate_write) { static gsize initialized = 0; size_t numpages_per_thread, leftover; + void *(*touch_fn)(void *); char *addr = area; int i = 0; @@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, g_once_init_leave(&initialized, 1); } + if (use_madv_populate_write) { + touch_fn = do_madv_populate_write_pages; + } else { + touch_fn = do_touch_pages; + } + memset_thread_failed = false; threads_created_flag = false; memset_num_threads = get_memset_num_threads(smp_cpus); @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, memset_thread[i].numpages = numpages_per_thread + (i < leftover); memset_thread[i].hpagesize = hpagesize; qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", - do_touch_pages, &memset_thread[i], + touch_fn, &memset_thread[i], QEMU_THREAD_JOINABLE); addr += memset_thread[i].numpages * hpagesize; } @@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, return memset_thread_failed; } +static bool madv_populate_write_possible(char *area, size_t pagesize) +{ + return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) || + errno != EINVAL; +} + void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, Error **errp) { @@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, struct sigaction act, oldact; size_t hpagesize = qemu_fd_getpagesize(fd); size_t numpages = DIV_ROUND_UP(memory, hpagesize); + bool use_madv_populate_write; + + /* + * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for + * some special mappings, such as mapping /dev/mem. + */ + if (madv_populate_write_possible(area, hpagesize)) { + use_madv_populate_write = true; + } - memset(&act, 0, sizeof(act)); - act.sa_handler = &sigbus_handler; - act.sa_flags = 0; + if (!use_madv_populate_write) { + memset(&act, 0, sizeof(act)); + act.sa_handler = &sigbus_handler; + act.sa_flags = 0; - ret = sigaction(SIGBUS, &act, &oldact); - if (ret) { - error_setg_errno(errp, errno, - "os_mem_prealloc: failed to install signal handler"); - return; + ret = sigaction(SIGBUS, &act, &oldact); + if (ret) { + error_setg_errno(errp, errno, + "os_mem_prealloc: failed to install signal handler"); + return; + } } /* touch pages simultaneously */ - if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) { + if (touch_all_pages(area, hpagesize, numpages, smp_cpus, + use_madv_populate_write)) { error_setg(errp, "os_mem_prealloc: Insufficient free host memory " "pages available to allocate guest RAM"); } - ret = sigaction(SIGBUS, &oldact, NULL); - if (ret) { - /* Terminate QEMU since it can't recover from error */ - perror("os_mem_prealloc: failed to reinstall signal handler"); - exit(1); + if (!use_madv_populate_write) { + ret = sigaction(SIGBUS, &oldact, NULL); + if (ret) { + /* Terminate QEMU since it can't recover from error */ + perror("os_mem_prealloc: failed to reinstall signal handler"); + exit(1); + } } } -- 2.31.1