This introduces the MREMAP_RETAIN flag for preserving the source mapping
when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to
the source mapping will fault and map in fresh zeroed pages.

It is currently limited to writable MAP_PRIVATE|MAP_ANONYMOUS mappings
and will return EFAULT when used on anything else. This covers the
intended use case in general purpose allocators.

For consistency, the old_len >= new_len case could decommit the pages
instead of unmapping. However, userspace can accomplish the same thing
via madvise and the flag is coherent without the additional complexity.

Motivation:

TCMalloc and jemalloc avoid releasing virtual memory in order to reduce
virtual memory fragmentation. A call to munmap or mremap would leave a
hole in the address space. Instead, unused pages are lazily returned to
the operating system via MADV_DONTNEED.

Since mremap unmaps the source range and would leave exactly such a hole,
these allocators cannot use it to elide copies. As a result, TCMalloc and
jemalloc end up being significantly slower for patterns like repeated
vector / hash table reallocations. Consider the typical vector building
pattern:

    #include <string.h>
    #include <stdlib.h>

    int main(void) {
        for (size_t i = 0; i < 100; i++) {
            void *ptr = NULL;
            size_t old_size = 0;
            for (size_t size = 4; size < (1 << 30); size *= 2) {
                ptr = realloc(ptr, size);
                if (!ptr) return 1;
                memset(ptr + old_size, 0xff, size - old_size);
                old_size = size;
            }
            free(ptr);
        }
    }

Transparent huge pages disabled:

glibc (baseline, uses mremap already): 15.051s
jemalloc without MREMAP_RETAIN: 38.540s
jemalloc with MREMAP_RETAIN: 15.086s

Transparent huge pages enabled:

glibc (baseline, uses mremap already): 8.464s
jemalloc without MREMAP_RETAIN: 18.230s
jemalloc with MREMAP_RETAIN: 6.696s

In practice, in-place growth never occurs for huge allocations because
the heap grows in the downwards direction for all 3 allocators. TCMalloc
and jemalloc pay for enormous copies while glibc is only spending time
writing new elements to the vector. Even if the heap were grown in the
other direction, real-world applications would end up blocking in-place
growth with new allocations.

The allocators could attempt to map the source location again after an
mremap call, but there is no guarantee of success in a multi-threaded
program and fragmenting memory over time is considered unacceptable.

Signed-off-by: Daniel Micay <danielmi...@gmail.com>
---
 include/linux/huge_mm.h   |  2 +-
 include/linux/mm.h        |  6 ++++++
 include/uapi/linux/mman.h |  1 +
 mm/huge_memory.c          |  4 ++--
 mm/memory.c               |  2 +-
 mm/mmap.c                 | 12 +++++++++++
 mm/mremap.c               | 52 +++++++++++++++++++++++++++++++----------------
 7 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb..3c13b20 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -143,7 +143,7 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long end,
                                         long adjust_next)
 {
-       if (!vma->anon_vma || vma->vm_ops)
+       if (!vma->anon_vma || (vma->vm_ops && !vma->vm_ops->allow_huge_pages))
                return;
        __vma_adjust_trans_huge(vma, start, end, adjust_next);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8981cc8..1e61036 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -273,6 +273,12 @@ struct vm_operations_struct {
        /* called by sys_remap_file_pages() to populate non-linear mapping */
        int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
                           unsigned long size, pgoff_t pgoff);
+
+       /* Check if the mapping may be duplicated by MREMAP_RETAIN */
+       bool (*may_duplicate)(struct vm_area_struct *vma);
+
+       /* if there is no vm_ops table, this is considered true */
+       bool allow_huge_pages;
 };
 
 struct mmu_gather;
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd..4e9a546 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,6 +5,7 @@
 
 #define MREMAP_MAYMOVE 1
 #define MREMAP_FIXED   2
+#define MREMAP_RETAIN  4
 
 #define OVERCOMMIT_GUESS               0
 #define OVERCOMMIT_ALWAYS              1
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06..350b478 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2077,7 +2077,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
                 * page fault if needed.
                 */
                return 0;
-       if (vma->vm_ops)
+       if ((vma->vm_ops && !vma->vm_ops->allow_huge_pages))
                /* khugepaged not yet working on file or special mappings */
                return 0;
        VM_BUG_ON(vma->vm_flags & VM_NO_THP);
@@ -2405,7 +2405,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
 
-       if (!vma->anon_vma || vma->vm_ops)
+       if (!vma->anon_vma || (vma->vm_ops && !vma->vm_ops->allow_huge_pages))
                return false;
        if (is_vma_temporary_stack(vma))
                return false;
diff --git a/mm/memory.c b/mm/memory.c
index e229970..c181401 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3275,7 +3275,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_OOM;
        if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
                int ret = VM_FAULT_FALLBACK;
-               if (!vma->vm_ops)
+               if (!vma->vm_ops || vma->vm_ops->allow_huge_pages)
                        ret = do_huge_pmd_anonymous_page(mm, vma, address,
                                        pmd, flags);
                if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/mmap.c b/mm/mmap.c
index c0a3637..6b644fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1500,6 +1500,16 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
        return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
+static bool anon_may_duplicate(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_WRITE && !(vma->vm_flags & VM_SHARED);
+}
+
+static const struct vm_operations_struct anon_vmops = {
+       .may_duplicate = anon_may_duplicate,
+       .allow_huge_pages = true
+};
+
 unsigned long mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
 {
@@ -1569,6 +1579,8 @@ munmap_back:
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = vm_get_page_prot(vm_flags);
        vma->vm_pgoff = pgoff;
+       if (!file)
+               vma->vm_ops = &anon_vmops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
 
        if (file) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180..ca7a662 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -235,7 +235,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
-               unsigned long new_len, unsigned long new_addr, bool *locked)
+               unsigned long new_len, unsigned long new_addr, bool retain,
+               bool *locked)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
@@ -287,15 +288,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                old_len = new_len;
                old_addr = new_addr;
                new_addr = -ENOMEM;
-       }
-
-       /* Conceal VM_ACCOUNT so old reservation is not undone */
-       if (vm_flags & VM_ACCOUNT) {
-               vma->vm_flags &= ~VM_ACCOUNT;
-               excess = vma->vm_end - vma->vm_start - old_len;
-               if (old_addr > vma->vm_start &&
-                   old_addr + old_len < vma->vm_end)
-                       split = 1;
+               retain = false;
        }
 
        /*
@@ -310,6 +303,19 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        hiwater_vm = mm->hiwater_vm;
        vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
+       /* Leave the old mapping in place for MREMAP_RETAIN */
+       if (retain)
+               goto out;
+
+       /* Conceal VM_ACCOUNT so old reservation is not undone */
+       if (vm_flags & VM_ACCOUNT) {
+               vma->vm_flags &= ~VM_ACCOUNT;
+               excess = vma->vm_end - vma->vm_start - old_len;
+               if (old_addr > vma->vm_start &&
+                   old_addr + old_len < vma->vm_end)
+                       split = 1;
+       }
+
        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -324,6 +330,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }
 
+out:
        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                *locked = true;
@@ -333,7 +340,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-       unsigned long old_len, unsigned long new_len, unsigned long *p)
+       unsigned long old_len, unsigned long new_len, bool retain,
+       unsigned long *p)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = find_vma(mm, addr);
@@ -348,6 +356,11 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
        if (old_len > vma->vm_end - addr)
                goto Efault;
 
+       /* Forbid MREMAP_RETAIN if not explicitly permitted by the mapping */
+       if (retain && !(vma->vm_ops && vma->vm_ops->may_duplicate &&
+           vma->vm_ops->may_duplicate(vma)))
+               goto Efault;
+
        /* Need to be careful about a growing mapping */
        if (new_len > old_len) {
                unsigned long pgoff;
@@ -392,7 +405,8 @@ Eagain:
 }
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
-               unsigned long new_addr, unsigned long new_len, bool *locked)
+               unsigned long new_addr, unsigned long new_len, bool retain,
+               bool *locked)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
@@ -426,7 +440,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
                old_len = new_len;
        }
 
-       vma = vma_to_resize(addr, old_len, new_len, &charged);
+       vma = vma_to_resize(addr, old_len, new_len, retain, &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
@@ -442,7 +456,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        if (ret & ~PAGE_MASK)
                goto out1;
 
-       ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+       ret = move_vma(vma, addr, old_len, new_len, new_addr, retain, locked);
        if (!(ret & ~PAGE_MASK))
                goto out;
 out1:
@@ -482,7 +496,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        unsigned long charged = 0;
        bool locked = false;
 
-       if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+       if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_RETAIN))
                return ret;
 
        if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
@@ -506,7 +520,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
        if (flags & MREMAP_FIXED) {
                ret = mremap_to(addr, old_len, new_addr, new_len,
-                               &locked);
+                               flags & MREMAP_RETAIN, &locked);
                goto out;
        }
 
@@ -526,7 +540,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        /*
         * Ok, we need to grow..
         */
-       vma = vma_to_resize(addr, old_len, new_len, &charged);
+       vma = vma_to_resize(addr, old_len, new_len, flags & MREMAP_RETAIN,
+                           &charged);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
@@ -575,7 +590,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                        goto out;
                }
 
-               ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+               ret = move_vma(vma, addr, old_len, new_len, new_addr,
+                              flags & MREMAP_RETAIN, &locked);
        }
 out:
        if (ret & ~PAGE_MASK)
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to