The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will 
appear at g...@bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.5
------>
commit c596f82fac95faf0af5f74d4ccd7c597d179f94e
Author: Anthony Yznaga <anthony.yzn...@oracle.com>
Date:   Wed Dec 7 09:50:29 2022 -0800

    oracle/mm: use padata for copying page ranges in vma_dup()
    
    When a VMA marked for preservation via MADV_DOEXEC is copied to a new mm
    during exec, its pagetable entries are copied using copy_page_range().
    The time to complete the copy increases linearly with size and becomes
    excessive when preserving memory for very large VMs. Use padata to speed
    up the copying by parallelizing the work.
    
    Performance results for this patch series:
    
        System:           X6-2
        CPU:              2 nodes * 10 cores/node * 2 threads/core = 40 CPUs
                          Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz
        Memory:           251G split evenly between nodes
    
        Test:             Time to exec measured in ms/GB.
    
                          Exec after mmap'ing and touching a 200GB range
                          of anon memory and then preserving it with
                          MADV_DOEXEC:
    
                           kernel     speedup    avg ms/GB
                         --------    --------    ---------
                         baseline                     17.4
                           padata        7.2x          2.4
    
                          Exec after mmap'ing and touching a 200GB range
                          of shared memory backed by shmem.
    
                           kernel     speedup    avg ms/GB
                         --------    --------    ---------
                         baseline                     21.7
                           padata          7x          3.1
    
    Orabug: 35054621
    Signed-off-by: Anthony Yznaga <anthony.yzn...@oracle.com>
    Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
    
    https://virtuozzo.atlassian.net/browse/VSTOR-96305
    
    (cherry picked from Oracle commit 800339ff06da9ffcc0e26fb13a43513792a6aa5e)
    Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com>
    
    Feature: oracle/mm: MADV_DOEXEC madvise() flag
---
 include/linux/mm.h |  4 ++++
 mm/memory.c        | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/mmap.c          |  5 ++++-
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2ce7bded6da..9b1ceb0db308 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1958,6 +1958,10 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+#ifdef CONFIG_PADATA /* prototype is visible only when padata is configured in */
+int copy_page_range_mt(struct vm_area_struct *dst_vma,
+                       struct vm_area_struct *src_vma);
+#endif /* CONFIG_PADATA */
 int follow_pte(struct mm_struct *mm, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 872de67ca2d7..b431a43b68fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1325,6 +1325,66 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
        return ret;
 }
 
+#ifdef CONFIG_PADATA
+
+struct copy_page_range_args { /* set as job.fn_arg; copy_page_range_chunk() casts its arg back to this */
+       struct vm_area_struct *dst_vma; /* destination VMA, forwarded to copy_p4d_range() */
+       struct vm_area_struct *src_vma; /* source VMA, forwarded to copy_p4d_range() */
+};
+
+static int copy_page_range_chunk(unsigned long addr, /* thread_fn of the padata job: copies [addr, end) */
+                                unsigned long end, void *arg)
+{
+       struct copy_page_range_args *args = arg; /* arg carries the dst/src VMA pair */
+       struct vm_area_struct *dst_vma = args->dst_vma;
+       struct vm_area_struct *src_vma = args->src_vma;
+       struct mm_struct *dst_mm = dst_vma->vm_mm;
+       struct mm_struct *src_mm = src_vma->vm_mm;
+       pgd_t *src_pgd, *dst_pgd;
+       unsigned long next;
+       int ret = 0; /* stays 0 unless a copy_p4d_range() call fails */
+       /* Find the PGD entry covering addr in each mm; both lookups use the same address. */
+       dst_pgd = pgd_offset(dst_mm, addr);
+       src_pgd = pgd_offset(src_mm, addr);
+       /* Walk [addr, end) one PGD-entry span at a time; next marks the end of the current span. */
+       do {
+               next = pgd_addr_end(addr, end);
+               if (pgd_none_or_clear_bad(src_pgd))
+                       continue; /* skip this source entry; the while() expression still advances */
+               if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+                                           addr, next))) {
+                       ret = -ENOMEM; /* any non-zero result is reported as -ENOMEM */
+                       break;
+               }
+       } while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+       return ret;
+}
+
+/*
+ * A stripped down version of copy_page_range() used by vma_dup() to copy a
+ * VMA that is preserved across exec. The whole src_vma range is submitted as
+ * one padata job so that very large VMAs are copied by several threads.
+ */
+int copy_page_range_mt(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma)
+{
+       struct copy_page_range_args args = { dst_vma, src_vma }; /* positional init: dst first, src second */
+       struct padata_mt_job job = {
+               .thread_fn   = copy_page_range_chunk,
+               .fn_arg      = &args,
+               .start       = src_vma->vm_start,
+               .size        = src_vma->vm_end - src_vma->vm_start, /* covers the entire source VMA */
+               .align       = PMD_SIZE,
+               .min_chunk   = max(1ul << 27, PMD_SIZE), /* 128MB (1ul << 27) unless PMD_SIZE is larger */
+               .max_threads = 16, /* fixed cap, not derived from the VMA size */
+       };
+
+       BUG_ON(!(src_vma->vm_flags & VM_EXEC_KEEP)); /* src_vma must be flagged VM_EXEC_KEEP */
+
+       return padata_do_multithreaded(&job); /* job result is returned unchanged */
+}
+#endif /* CONFIG_PADATA */
+
 /* Whether we should zap all COWed (private) pages too */
 static inline bool should_zap_cows(struct zap_details *details)
 {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9bb2382d9101..04b769eb27a4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3318,8 +3318,11 @@ int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm)
         */
        old_vma->vm_flags &= ~VM_ACCOUNT;
 
+#ifdef CONFIG_PADATA
+       ret = copy_page_range_mt(vma, old_vma);
+#else
        ret = copy_page_range(vma, old_vma);
-
+#endif
        vma->vm_flags &= ~VM_EXEC_KEEP;
 
        return ret;
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to