Refactor the madvise syscall to allow for parts of it to be reused by a
prctl syscall that affects vmas.

Move the code that walks vmas in a virtual address range into a function
that takes a function pointer as a parameter.  The only caller for now is
sys_madvise, which uses it to call madvise_vma_behavior on each vma, but
the next patch will add an additional caller.

Move the handling of all vma behaviors into madvise_behavior, and rename
it to madvise_vma_behavior.

Move the code that updates the flags on a vma, including splitting or
merging the vma as necessary, into a new function called
madvise_update_vma.  The next patch will add support for updating a new
anon_name field as well.

Signed-off-by: Colin Cross <ccr...@android.com>
Cc: Pekka Enberg <penb...@kernel.org>
Cc: Dave Hansen <dave.han...@intel.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Ingo Molnar <mi...@kernel.org>
Cc: Oleg Nesterov <o...@redhat.com>
Cc: "Eric W. Biederman" <ebied...@xmission.com>
Cc: Jan Glauber <jan.glau...@gmail.com>
Cc: John Stultz <john.stu...@linaro.org>
Cc: Rob Landley <r...@landley.net>
Cc: Cyrill Gorcunov <gorcu...@openvz.org>
Cc: Kees Cook <keesc...@chromium.org>
Cc: "Serge E. Hallyn" <serge.hal...@ubuntu.com>
Cc: David Rientjes <rient...@google.com>
Cc: Al Viro <v...@zeniv.linux.org.uk>
Cc: Hugh Dickins <hu...@google.com>
Cc: Rik van Riel <r...@redhat.com>
Cc: Mel Gorman <mgor...@suse.de>
Cc: Michel Lespinasse <wal...@google.com>
Cc: Tang Chen <tangc...@cn.fujitsu.com>
Cc: Robin Holt <h...@sgi.com>
Cc: Shaohua Li <s...@fusionio.com>
Cc: Sasha Levin <sasha.le...@oracle.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Minchan Kim <minc...@kernel.org>
Signed-off-by: Andrew Morton <a...@linux-foundation.org>
---
 mm/madvise.c | 272 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 151 insertions(+), 121 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 539eeb96b323..aa346f87edbb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -39,65 +39,20 @@ static int madvise_need_mmap_write(int behavior)
 }
 
 /*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on a region of a vma, splitting it or merging it as
+ * necessary.  Must be called with mmap_sem held for writing.
  */
-static long madvise_behavior(struct vm_area_struct *vma,
-                    struct vm_area_struct **prev,
-                    unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+                    struct vm_area_struct **prev, unsigned long start,
+                    unsigned long end, unsigned long new_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
-       int error = 0;
        pgoff_t pgoff;
-       unsigned long new_flags = vma->vm_flags;
-
-       switch (behavior) {
-       case MADV_NORMAL:
-               new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
-               break;
-       case MADV_SEQUENTIAL:
-               new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
-               break;
-       case MADV_RANDOM:
-               new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
-               break;
-       case MADV_DONTFORK:
-               new_flags |= VM_DONTCOPY;
-               break;
-       case MADV_DOFORK:
-               if (vma->vm_flags & VM_IO) {
-                       error = -EINVAL;
-                       goto out;
-               }
-               new_flags &= ~VM_DONTCOPY;
-               break;
-       case MADV_DONTDUMP:
-               new_flags |= VM_DONTDUMP;
-               break;
-       case MADV_DODUMP:
-               if (new_flags & VM_SPECIAL) {
-                       error = -EINVAL;
-                       goto out;
-               }
-               new_flags &= ~VM_DONTDUMP;
-               break;
-       case MADV_MERGEABLE:
-       case MADV_UNMERGEABLE:
-               error = ksm_madvise(vma, start, end, behavior, &new_flags);
-               if (error)
-                       goto out;
-               break;
-       case MADV_HUGEPAGE:
-       case MADV_NOHUGEPAGE:
-               error = hugepage_madvise(vma, &new_flags, behavior);
-               if (error)
-                       goto out;
-               break;
-       }
+       int error;
 
        if (new_flags == vma->vm_flags) {
                *prev = vma;
-               goto out;
+               return 0;
        }
 
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -113,13 +68,13 @@ static long madvise_behavior(struct vm_area_struct *vma,
        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
-                       goto out;
+                       return error;
        }
 
        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
-                       goto out;
+                       return error;
        }
 
 success:
@@ -128,10 +83,7 @@ success:
         */
        vma->vm_flags = new_flags;
 
-out:
-       if (error == -ENOMEM)
-               error = -EAGAIN;
-       return error;
+       return 0;
 }
 
 #ifdef CONFIG_SWAP
@@ -337,6 +289,77 @@ static long madvise_remove(struct vm_area_struct *vma,
        return error;
 }
 
+/*
+ * Apply an madvise behavior to a region of a vma.  madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+                    struct vm_area_struct **prev,
+                    unsigned long start, unsigned long end,
+                    unsigned long behavior)
+{
+       int error = 0;
+       unsigned long new_flags = vma->vm_flags;
+
+       switch (behavior) {
+       case MADV_REMOVE:
+               return madvise_remove(vma, prev, start, end);
+       case MADV_WILLNEED:
+               return madvise_willneed(vma, prev, start, end);
+       case MADV_DONTNEED:
+               return madvise_dontneed(vma, prev, start, end);
+       case MADV_NORMAL:
+               new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+               break;
+       case MADV_SEQUENTIAL:
+               new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+               break;
+       case MADV_RANDOM:
+               new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+               break;
+       case MADV_DONTFORK:
+               new_flags |= VM_DONTCOPY;
+               break;
+       case MADV_DOFORK:
+               if (vma->vm_flags & VM_IO) {
+                       error = -EINVAL;
+                       goto out;
+               }
+               new_flags &= ~VM_DONTCOPY;
+               break;
+       case MADV_DONTDUMP:
+               new_flags |= VM_DONTDUMP;
+               break;
+       case MADV_DODUMP:
+               if (new_flags & VM_SPECIAL) {
+                       error = -EINVAL;
+                       goto out;
+               }
+               new_flags &= ~VM_DONTDUMP;
+               break;
+       case MADV_MERGEABLE:
+       case MADV_UNMERGEABLE:
+               error = ksm_madvise(vma, start, end, behavior, &new_flags);
+               if (error)
+                       goto out;
+               break;
+       case MADV_HUGEPAGE:
+       case MADV_NOHUGEPAGE:
+               error = hugepage_madvise(vma, &new_flags, behavior);
+               if (error)
+                       goto out;
+               break;
+       }
+
+       error = madvise_update_vma(vma, prev, start, end, new_flags);
+
+out:
+       if (error == -ENOMEM)
+               error = -EAGAIN;
+       return error;
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Error injection support for memory error handling.
@@ -375,22 +398,6 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 }
 #endif
 
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-               unsigned long start, unsigned long end, int behavior)
-{
-       switch (behavior) {
-       case MADV_REMOVE:
-               return madvise_remove(vma, prev, start, end);
-       case MADV_WILLNEED:
-               return madvise_willneed(vma, prev, start, end);
-       case MADV_DONTNEED:
-               return madvise_dontneed(vma, prev, start, end);
-       default:
-               return madvise_behavior(vma, prev, start, end, behavior);
-       }
-}
-
 static int
 madvise_behavior_valid(int behavior)
 {
@@ -421,6 +428,73 @@ madvise_behavior_valid(int behavior)
 }
 
 /*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range.  Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_sem held for reading or writing.
+ */
+static
+int madvise_walk_vmas(unsigned long start, unsigned long end,
+               unsigned long arg,
+               int (*visit)(struct vm_area_struct *vma,
+                       struct vm_area_struct **prev, unsigned long start,
+                       unsigned long end, unsigned long arg))
+{
+       struct vm_area_struct *vma;
+       struct vm_area_struct *prev;
+       unsigned long tmp;
+       int unmapped_error = 0;
+
+       /*
+        * If the interval [start,end) covers some unmapped address
+        * ranges, just ignore them, but return -ENOMEM at the end.
+        * - different from the way of handling in mlock etc.
+        */
+       vma = find_vma_prev(current->mm, start, &prev);
+       if (vma && start > vma->vm_start)
+               prev = vma;
+
+       for (;;) {
+               int error;
+
+               /* Still start < end. */
+               if (!vma)
+                       return -ENOMEM;
+
+               /* Here start < (end|vma->vm_end). */
+               if (start < vma->vm_start) {
+                       unmapped_error = -ENOMEM;
+                       start = vma->vm_start;
+                       if (start >= end)
+                               break;
+               }
+
+               /* Here vma->vm_start <= start < (end|vma->vm_end) */
+               tmp = vma->vm_end;
+               if (end < tmp)
+                       tmp = end;
+
+               /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+               error = visit(vma, &prev, start, tmp, arg);
+               if (error)
+                       return error;
+               start = tmp;
+               if (prev && start < prev->vm_end)
+                       start = prev->vm_end;
+               if (start >= end)
+                       break;
+               if (prev)
+                       vma = prev->vm_next;
+               else    /* madvise_remove dropped mmap_sem */
+                       vma = find_vma(current->mm, start);
+       }
+
+       return unmapped_error;
+}
+
+/*
  * The madvise(2) system call.
  *
  * Applications can use madvise() to advise the kernel how it should
@@ -464,9 +538,7 @@ madvise_behavior_valid(int behavior)
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-       unsigned long end, tmp;
-       struct vm_area_struct *vma, *prev;
-       int unmapped_error = 0;
+       unsigned long end;
        int error = -EINVAL;
        int write;
        size_t len;
@@ -501,52 +573,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        else
                down_read(&current->mm->mmap_sem);
 
-       /*
-        * If the interval [start,end) covers some unmapped address
-        * ranges, just ignore them, but return -ENOMEM at the end.
-        * - different from the way of handling in mlock etc.
-        */
-       vma = find_vma_prev(current->mm, start, &prev);
-       if (vma && start > vma->vm_start)
-               prev = vma;
-
        blk_start_plug(&plug);
-       for (;;) {
-               /* Still start < end. */
-               error = -ENOMEM;
-               if (!vma)
-                       goto out;
-
-               /* Here start < (end|vma->vm_end). */
-               if (start < vma->vm_start) {
-                       unmapped_error = -ENOMEM;
-                       start = vma->vm_start;
-                       if (start >= end)
-                               goto out;
-               }
-
-               /* Here vma->vm_start <= start < (end|vma->vm_end) */
-               tmp = vma->vm_end;
-               if (end < tmp)
-                       tmp = end;
-
-               /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-               error = madvise_vma(vma, &prev, start, tmp, behavior);
-               if (error)
-                       goto out;
-               start = tmp;
-               if (prev && start < prev->vm_end)
-                       start = prev->vm_end;
-               error = unmapped_error;
-               if (start >= end)
-                       goto out;
-               if (prev)
-                       vma = prev->vm_next;
-               else    /* madvise_remove dropped mmap_sem */
-                       vma = find_vma(current->mm, start);
-       }
-out:
+       error = madvise_walk_vmas(start, end, behavior, madvise_vma_behavior);
        blk_finish_plug(&plug);
+
        if (write)
                up_write(&current->mm->mmap_sem);
        else
-- 
1.8.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to