From: "Dr. David Alan Gilbert" <dgilb...@redhat.com>

userfaultfd is a Linux syscall that returns an fd on which the program
receives a stream of notifications of accesses to pages marked as
MADV_USERFAULT; it also allows the program to acknowledge those stalls
and tell the accessing thread to carry on.

Signed-off-by: Dr. David Alan Gilbert <dgilb...@redhat.com>
---
 postcopy-ram.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 6 deletions(-)

diff --git a/postcopy-ram.c b/postcopy-ram.c
index 8d0a225..466c42b 100644
--- a/postcopy-ram.c
+++ b/postcopy-ram.c
@@ -68,6 +68,14 @@
 #define __NR_remap_anon_pages 317
 #endif
 
+#ifndef __NR_userfaultfd
+#define __NR_userfaultfd 318
+#endif
+
+#ifndef USERFAULTFD_PROTOCOL
+#define USERFAULTFD_PROTOCOL (uint64_t)0xaa
+#endif
+
 /* ---------------------------------------------------------------------- */
 /* Postcopy pagemap-inbound (pmi) - data structures that record the       */
 /* state of each page used by the inbound postcopy                        */
@@ -192,6 +200,7 @@ int postcopy_ram_hosttest(void)
      */
     void *testarea, *testarea2;
     long pagesize = getpagesize();
+    int ufd;
 
     testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
@@ -201,15 +210,24 @@ int postcopy_ram_hosttest(void)
     }
     g_assert(((size_t)testarea & (pagesize-1)) == 0);
 
+    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    if (ufd == -1) {
+        perror("postcopy_ram_hosttest: userfaultfd not available");
+        munmap(testarea, pagesize);
+        return -1;
+    }
+
     if (madvise(testarea, pagesize, MADV_USERFAULT)) {
         perror("postcopy_ram_hosttest: MADV_USERFAULT not available");
         munmap(testarea, pagesize);
+        close(ufd);
         return -1;
     }
 
     if (madvise(testarea, pagesize, MADV_NOUSERFAULT)) {
         perror("postcopy_ram_hosttest: MADV_NOUSERFAULT not available");
         munmap(testarea, pagesize);
+        close(ufd);
         return -1;
     }
 
@@ -226,11 +244,13 @@ int postcopy_ram_hosttest(void)
         perror("postcopy_ram_hosttest: remap_anon_pages not available");
         munmap(testarea, pagesize);
         munmap(testarea2, pagesize);
+        close(ufd);
         return -1;
     }
 
     munmap(testarea, pagesize);
     munmap(testarea2, pagesize);
+    close(ufd);
     return 0;
 }
 
@@ -361,6 +381,40 @@ static int postcopy_ram_sensitise_area(const char *block_name, void *host_addr,
 }
 
 /*
+ * Tell the kernel that we've now got some memory it previously asked for.
+ * Note: We're not allowed to ack a page which wasn't requested.
+ * Returns 0 on success (or if the kernel wasn't waiting), -errno on failure.
+ */
+static int ack_userfault(MigrationIncomingState *mis, void *start, size_t len)
+{
+    uint64_t tmp[2];
+
+    /* Kernel wants the range that's now safe to access */
+    tmp[0] = (uint64_t)start;
+    tmp[1] = (uint64_t)start + (uint64_t)(len-1);
+
+    if (write(mis->userfault_fd, tmp, 16) != 16) {
+        int e = errno;
+
+        if (e == ENOENT) {
+            /* Kernel said it wasn't waiting - one case where this can
+             * happen is where two threads triggered the userfault
+             * and we receive the page and ack it just after we received
+             * the 2nd request and that ends up deciding it should ack it
+             * We could optimise it out, but it's rare.
+             */
+            /*fprintf(stderr, "ack_userfault: %p/%zx ENOENT\n", start, len); */
+            return 0;
+        }
+        error_report("postcopy_ram: Failed to notify kernel for %p/%zx (%d)",
+                     start, len, e);
+        return -e;
+    }
+
+    return 0;
+}
+
+/*
  * Handle faults detected by the USERFAULT markings
  */
 static void *postcopy_ram_fault_thread(void *opaque)
@@ -420,10 +474,9 @@ static void *postcopy_ram_fault_thread(void *opaque)
             /* Already arrived - no state change, just kick the kernel */
             DPRINTF("postcopy_ram_fault_thread: notify pre of %p",
                     hostaddr);
-            /* TODO! Send ack
             if (ack_userfault(mis, hostaddr, hostpagesize)) {
                 assert(0);
-            } */
+            }
             break;
 
         case POSTCOPY_PMI_MISSING:
@@ -464,8 +517,34 @@ static void *postcopy_ram_fault_thread(void *opaque)
 
 int postcopy_ram_enable_notify(MigrationIncomingState *mis)
 {
-    /* Create the fault handler thread and wait for it to be ready */
-    mis->userfault_fd = -1; /* TODO */
+    uint64_t tmp64;
+
+    /* Open the fd for the kernel to give us userfaults */
+    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    if (mis->userfault_fd == -1) {
+        perror("Failed to open userfault fd");
+        return -1;
+    }
+
+    /*
+     * Version handshake, we send it the version we want and expect to get the
+     * same back.
+     */
+    tmp64 = USERFAULTFD_PROTOCOL;
+    if (write(mis->userfault_fd, &tmp64, sizeof(tmp64)) != sizeof(tmp64)) {
+        perror("Writing userfaultfd version");
+        return -1;
+    }
+    if (read(mis->userfault_fd, &tmp64, sizeof(tmp64)) != sizeof(tmp64)) {
+        perror("Reading userfaultfd version");
+        return -1;
+    }
+    if (tmp64 != USERFAULTFD_PROTOCOL) {
+        error_report("Mismatched userfaultfd version, expected %zx, got %zx",
+                     (size_t)USERFAULTFD_PROTOCOL, (size_t)tmp64);
+        return -1;
+    }
+
     qemu_sem_init(&mis->fault_thread_sem, 0);
     qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                        postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
@@ -476,6 +555,8 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis)
         return -1;
     }
 
+    DPRINTF("postcopy_ram_enable_notify: Sensitised");
+
     return 0;
 }
 
@@ -509,11 +590,12 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
 
     if (syscall(__NR_remap_anon_pages, host, from, getpagesize(), 0) !=
             getpagesize()) {
+        int e = errno;
         perror("remap_anon_pages in postcopy_place_page");
         fprintf(stderr, "host: %p from: %p pmi=%d\n", host, from,
                 postcopy_pmi_get_state(mis, bitmap_offset));
 
-        return -errno;
+        return -e;
     }
 
     tmp_state = postcopy_pmi_get_state(mis, bitmap_offset);
@@ -526,7 +608,10 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
 
 
     if (old_state == POSTCOPY_PMI_REQUESTED) {
-        /* TODO: Notify kernel */
+        /* Send the kernel the host address that should now be accessible */
+        DPRINTF("%s: Notifying kernel bitmap_offset=0x%lx host=%p",
+                __func__, bitmap_offset, host);
+        return ack_userfault(mis, host, getpagesize());
     }
 
     /* TODO: hostpagesize!=targetpagesize case */
-- 
1.9.3