The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git@bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.5 ------> commit c596f82fac95faf0af5f74d4ccd7c597d179f94e Author: Anthony Yznaga <anthony.yzn...@oracle.com> Date: Wed Dec 7 09:50:29 2022 -0800
oracle/mm: use padata for copying page ranges in vma_dup() When a VMA marked for preservation via MADV_DOEXEC is copied to a new mm during exec, its pagetable entries are copied using copy_page_range(). The time to complete the copy increases linearly with size and becomes excessive when preserving memory for very large VMs. Use padata to speed up the copying by parallelizing the work. Performance results for this patch series: System: X6-2 CPU: 2 nodes * 10 cores/node * 2 threads/core = 40 CPUs Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz Memory: 251G split evenly between nodes Test: Time to exec measured in ms/GB. Exec after mmap'ing and touching a 200GB range of anon memory and then preserving it with MADV_DOEXEC: kernel speedup avg ms/GB -------- -------- --------- baseline 17.4 padata 7.2x 2.4 Exec after mmap'ing and touching a 200GB range of shared memory backed by shmem. kernel speedup avg ms/GB -------- -------- --------- baseline 21.7 padata 7x 3.1 Orabug: 35054621 Signed-off-by: Anthony Yznaga <anthony.yzn...@oracle.com> Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com> https://virtuozzo.atlassian.net/browse/VSTOR-96305 (cherry picked from Oracle commit 800339ff06da9ffcc0e26fb13a43513792a6aa5e) Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com> Feature: oracle/mm: MADV_DOEXEC madvise() flag --- include/linux/mm.h | 4 ++++ mm/memory.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 5 ++++- 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d2ce7bded6da..9b1ceb0db308 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1958,6 +1958,10 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +#ifdef CONFIG_PADATA +int copy_page_range_mt(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma); 
+#endif int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 872de67ca2d7..b431a43b68fd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1325,6 +1325,66 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } +#ifdef CONFIG_PADATA + +struct copy_page_range_args { + struct vm_area_struct *dst_vma; + struct vm_area_struct *src_vma; +}; + +static int copy_page_range_chunk(unsigned long addr, + unsigned long end, void *arg) +{ + struct copy_page_range_args *args = arg; + struct vm_area_struct *dst_vma = args->dst_vma; + struct vm_area_struct *src_vma = args->src_vma; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + int ret = 0; + + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { + ret = -ENOMEM; + break; + } + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + return ret; +} + +/* + * A stripped down version of copy_page_range() used to copy a VMA as part + * of preserving it across exec. Multithreading via padata is used to speed + * up the copying of very large VMAs. 
+ */ +int copy_page_range_mt(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) +{ + struct copy_page_range_args args = { dst_vma, src_vma }; + struct padata_mt_job job = { + .thread_fn = copy_page_range_chunk, + .fn_arg = &args, + .start = src_vma->vm_start, + .size = src_vma->vm_end - src_vma->vm_start, + .align = PMD_SIZE, + .min_chunk = max(1ul << 27, PMD_SIZE), + .max_threads = 16, + }; + + BUG_ON(!(src_vma->vm_flags & VM_EXEC_KEEP)); + + return padata_do_multithreaded(&job); +} +#endif /* CONFIG_PADATA */ + /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { diff --git a/mm/mmap.c b/mm/mmap.c index 9bb2382d9101..04b769eb27a4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3318,8 +3318,11 @@ int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm) */ old_vma->vm_flags &= ~VM_ACCOUNT; +#ifdef CONFIG_PADATA + ret = copy_page_range_mt(vma, old_vma); +#else ret = copy_page_range(vma, old_vma); - +#endif vma->vm_flags &= ~VM_EXEC_KEEP; return ret; _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel