On Wed, Jul 14, 2021 at 01:23:04PM +0200, David Hildenbrand wrote: > Let's sense support and use it for preallocation. MADV_POPULATE_WRITE > does not require a SIGBUS handler, doesn't actually touch page content, > and avoids context switches; it is, therefore, faster and easier to handle > than our current approach. > > This resolves the TODO in do_touch_pages(). > > Signed-off-by: David Hildenbrand <da...@redhat.com> > --- > include/qemu/osdep.h | 7 ++++ > util/oslib-posix.c | 84 +++++++++++++++++++++++++++++++++----------- > 2 files changed, 71 insertions(+), 20 deletions(-) >
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c > index e8bdb02e1d..679796ac1f 100644 > --- a/util/oslib-posix.c > +++ b/util/oslib-posix.c > @@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg) > * > * 'volatile' to stop compiler optimizing this away > * to a no-op > - * > - * TODO: get a better solution from kernel so we > - * don't need to write at all so we don't cause > - * wear on the storage backing the region... > */ > *(volatile char *)addr = *addr; > addr += hpagesize; > @@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg) > return NULL; > } > > +static void *do_madv_populate_write_pages(void *arg) > +{ > + MemsetThread *memset_args = (MemsetThread *)arg; > + const size_t size = memset_args->numpages * memset_args->hpagesize; > + char * const addr = memset_args->addr; > + int ret; > + > + /* See do_touch_pages(). */ > + qemu_mutex_lock(&page_mutex); > + while (!threads_created_flag) { > + qemu_cond_wait(&page_cond, &page_mutex); > + } > + qemu_mutex_unlock(&page_mutex); > + > + ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE); > + if (ret) { > + memset_thread_failed = true; > + } > + return NULL; > +} > + > static inline int get_memset_num_threads(int smp_cpus) > { > long host_procs = sysconf(_SC_NPROCESSORS_ONLN); > @@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus) > } > > static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, > - int smp_cpus) > + int smp_cpus, bool use_madv_populate_write) > { > static gsize initialized = 0; > size_t numpages_per_thread, leftover; > + void *(*touch_fn)(void *); > char *addr = area; > int i = 0; > > @@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t > hpagesize, size_t numpages, > g_once_init_leave(&initialized, 1); > } > > + if (use_madv_populate_write) { > + touch_fn = do_madv_populate_write_pages; > + } else { > + touch_fn = do_touch_pages; > + } > + > memset_thread_failed = false; > threads_created_flag = false; > 
memset_num_threads = get_memset_num_threads(smp_cpus); > @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, > size_t numpages, > memset_thread[i].numpages = numpages_per_thread + (i < leftover); > memset_thread[i].hpagesize = hpagesize; > qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", > - do_touch_pages, &memset_thread[i], > + touch_fn, &memset_thread[i], > QEMU_THREAD_JOINABLE); > addr += memset_thread[i].numpages * hpagesize; > } Do you have an indication of what the speed differential is for the old read/write dance vs the kernel madvise? We needed to use threads previously because the read/write dance is pretty terribly slow. Is that still a problem with the madvise approach? I would (perhaps naively) expect that the kernel would be able to do this efficiently for arbitrarily large memory regions, such that QEMU would not need to play games with threads. > @@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t > hpagesize, size_t numpages, > return memset_thread_failed; > } > > +static bool madv_populate_write_possible(char *area, size_t pagesize) > +{ > + return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) || > + errno != EINVAL; > +} > + > void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, > Error **errp) > { > @@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, > int smp_cpus, > struct sigaction act, oldact; > size_t hpagesize = qemu_fd_getpagesize(fd); > size_t numpages = DIV_ROUND_UP(memory, hpagesize); > + bool use_madv_populate_write; Initialized with random garbage from the stack. > + > + /* > + * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for > + * some special mappings, such as mapping /dev/mem. > + */ > + if (madv_populate_write_possible(area, hpagesize)) { > + use_madv_populate_write = true; > + } But this implicitly assumes it was initialized to false.
> > - memset(&act, 0, sizeof(act)); > - act.sa_handler = &sigbus_handler; > - act.sa_flags = 0; > + if (!use_madv_populate_write) { > + memset(&act, 0, sizeof(act)); > + act.sa_handler = &sigbus_handler; > + act.sa_flags = 0; > > - ret = sigaction(SIGBUS, &act, &oldact); > - if (ret) { > - error_setg_errno(errp, errno, > - "os_mem_prealloc: failed to install signal handler"); > - return; > + ret = sigaction(SIGBUS, &act, &oldact); > + if (ret) { > + error_setg_errno(errp, errno, > + "os_mem_prealloc: failed to install signal handler"); > + return; > + } > } > > /* touch pages simultaneously */ > - if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) { > + if (touch_all_pages(area, hpagesize, numpages, smp_cpus, > + use_madv_populate_write)) { > error_setg(errp, "os_mem_prealloc: Insufficient free host memory " > "pages available to allocate guest RAM"); > } > > - ret = sigaction(SIGBUS, &oldact, NULL); > - if (ret) { > - /* Terminate QEMU since it can't recover from error */ > - perror("os_mem_prealloc: failed to reinstall signal handler"); > - exit(1); > + if (!use_madv_populate_write) { > + ret = sigaction(SIGBUS, &oldact, NULL); > + if (ret) { > + /* Terminate QEMU since it can't recover from error */ > + perror("os_mem_prealloc: failed to reinstall signal handler"); > + exit(1); > + } > } > } > > -- > 2.31.1 > > Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|