From: Andrey Ryabinin <aryabi...@virtuozzo.com>

Make the vdso mapping per-ve. This allows per-container modification of
the Linux version in the vdso's .note section, and of monotonic time.
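For illustration only (a sketch, not part of this patch): the version in
question is the LINUX_VERSION_CODE that the vdso carries in a "Linux"
ELF note (note type 0, 4-byte payload). Userspace can read it straight
out of the mapped vdso via the AT_SYSINFO_EHDR auxv entry:

  /* Sketch: print the kernel version advertised by the vdso's
   * "Linux" ELF note. */
  #include <elf.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/auxv.h>

  int main(void)
  {
      const char *vdso = (const char *)getauxval(AT_SYSINFO_EHDR);
      const Elf64_Ehdr *eh = (const Elf64_Ehdr *)vdso;
      const Elf64_Phdr *ph = (const Elf64_Phdr *)(vdso + eh->e_phoff);

      for (int i = 0; i < eh->e_phnum; i++) {
          const char *p, *end;

          if (ph[i].p_type != PT_NOTE)
              continue;

          /* The vdso is one contiguous mapping, so p_offset is
           * usable directly relative to the mapping start. */
          p = vdso + ph[i].p_offset;
          end = p + ph[i].p_filesz;

          while (p + sizeof(Elf64_Nhdr) <= end) {
              const Elf64_Nhdr *n = (const Elf64_Nhdr *)p;
              const char *name = p + sizeof(*n);
              const char *desc = name + ((n->n_namesz + 3) & ~3);
              uint32_t v;

              if (n->n_type == 0 && n->n_namesz == 6 &&
                  !memcmp(name, "Linux", 6) && n->n_descsz >= 4) {
                  memcpy(&v, desc, 4);
                  printf("vdso: Linux %u.%u.%u\n",
                         v >> 16, (v >> 8) & 0xff, v & 0xff);
                  return 0;
              }
              p = desc + ((n->n_descsz + 3) & ~3);
          }
      }
      return 1;
  }

With a shared vdso this necessarily matches the host kernel; once each
ve has its own vdso copy, the note (and the vdso time data) can be
tuned per container.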
https://jira.sw.ru/browse/PSBM-121668
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>

+++
ve: fix copy_vdso error handling

Otherwise we would return a NULL pointer (e.g. to cgroup_init_subsys),
IS_ERR() would say that it's not an error, and the caller would wrongly
consider ve_start successful while it's not.

https://jira.sw.ru/browse/PSBM-131158
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>

+++
vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap

vdso_data is mapped to userspace, which means that we can't use
kmalloc() to allocate it. kmalloc() doesn't even guarantee that we will
get page-aligned memory:

 kernel BUG at include/linux/mm.h:693!
 RIP: 0010:unmap_page_range+0x15f2/0x2630
 Call Trace:
  unmap_vmas+0x11e/0x1d0
  exit_mmap+0x215/0x420
  mmput+0x10a/0x400
  do_exit+0x98f/0x2d00
  do_group_exit+0xec/0x2b0
  __x64_sys_exit_group+0x3a/0x50
  do_syscall_64+0xa5/0x4d0
  entry_SYSCALL_64_after_hwframe+0x6a/0xdf

Use alloc_pages_exact() to allocate it instead. We can't use
alloc_pages() or __get_free_pages() here, since vdso_fault() needs to
perform get_page() on individual sub-pages, and alloc_pages() doesn't
initialize sub-pages.

https://jira.sw.ru/browse/PSBM-123551
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>

(cherry-picked from vz8 commit 12c3967a0009 ("ve, x86_64: add per-ve
vdso mapping."))
Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com>
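A note on the copy_vdso error-handling fixup folded in above (a sketch
of a hypothetical caller, not the actual cgroup code): IS_ERR() only
recognizes pointers that encode an errno in the topmost page of the
address space, so a NULL return from the create callback sails past the
error check:

  struct cgroup_subsys_state *css;

  css = ve_create(parent_css);
  /*
   * IS_ERR(NULL) is false: had ve_create() returned NULL on a
   * failed copy_vdso(), this check would pass and the NULL css
   * would be used as if the ve had started fine.  Failure must
   * therefore be reported as ERR_PTR(err), never as NULL.
   */
  if (IS_ERR(css))
      return PTR_ERR(css);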
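And on the allocator choice: the fault handler serving these mappings
takes a page reference per faulting sub-page, along the lines of x86's
vdso_fault() (a simplified sketch; with this series the image lookup is
unchanged, only image->data now points at the ve's private copy):

  static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
                               struct vm_area_struct *vma,
                               struct vm_fault *vmf)
  {
      const struct vdso_image *image = vma->vm_mm->context.vdso_image;

      if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
          return VM_FAULT_SIGBUS;

      /*
       * get_page() on an arbitrary sub-page only works if every
       * page backing image->data carries its own initialized
       * refcount: true after alloc_pages_exact() (which
       * split_page()s the high-order block), false for kmalloc()
       * slab memory and for the tail pages of a plain
       * alloc_pages() allocation.
       */
      vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
      get_page(vmf->page);
      return 0;
  }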
---
 arch/x86/entry/vdso/vma.c    |  4 +++-
 arch/x86/kernel/process_64.c |  2 +-
 include/linux/ve.h           |  3 +++
 kernel/ve/ve.c               | 44 ++++++++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 235a5794296a..e58417321af2 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -15,6 +15,7 @@
 #include <linux/cpu.h>
 #include <linux/ptrace.h>
 #include <linux/time_namespace.h>
+#include <linux/ve.h>
 
 #include <asm/pvclock.h>
 #include <asm/vgtod.h>
@@ -391,7 +392,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	if (!vdso64_enabled)
 		return 0;
-	return map_vdso_randomized(&vdso_image_64);
+
+	return map_vdso_randomized(get_exec_env()->vdso_64);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4811c8669f92..206cdb4793f5 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -829,7 +829,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 		return prctl_map_vdso(&vdso_image_32, arg2);
 # endif
 	case ARCH_MAP_VDSO_64:
-		return prctl_map_vdso(&vdso_image_64, arg2);
+		return prctl_map_vdso(get_exec_env()->vdso_64, arg2);
 #endif
 
 	default:
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 95dcd99267df..741867427f57 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -16,6 +16,7 @@
 #include <linux/kmapset.h>
 #include <linux/kthread.h>
 #include <linux/vzstat.h>
+#include <asm/vdso.h>
 
 struct nsproxy;
 struct veip_struct;
@@ -71,6 +72,8 @@ struct ve_struct {
 
 	struct kthread_worker	umh_worker;
 	struct task_struct	*umh_task;
+
+	struct vdso_image	*vdso_64;
 };
 
 #define VE_MEMINFO_DEFAULT	1	/* default behaviour */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 178aa658b50b..6a3248efaf07 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -55,6 +55,7 @@ struct ve_struct ve0 = {
 					2,
 #endif
 	.meminfo_val		= VE_MEMINFO_SYSTEM,
+	.vdso_64		= (struct vdso_image *)&vdso_image_64,
 };
 
 EXPORT_SYMBOL(ve0);
@@ -562,6 +563,33 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	up_write(&ve->op_sem);
 }
 
+static int copy_vdso(struct ve_struct *ve)
+{
+	const struct vdso_image *vdso_src = &vdso_image_64;
+	struct vdso_image *vdso;
+	void *vdso_data;
+
+	if (ve->vdso_64)
+		return 0;
+
+	vdso = kmemdup(vdso_src, sizeof(*vdso), GFP_KERNEL);
+	if (!vdso)
+		return -ENOMEM;
+
+	vdso_data = alloc_pages_exact(vdso_src->size, GFP_KERNEL);
+	if (!vdso_data) {
+		kfree(vdso);
+		return -ENOMEM;
+	}
+
+	memcpy(vdso_data, vdso_src->data, vdso_src->size);
+
+	vdso->data = vdso_data;
+
+	ve->vdso_64 = vdso;
+	return 0;
+}
+
 static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_css)
 {
 	struct ve_struct *ve = &ve0;
@@ -595,12 +623,18 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	if (err)
 		goto err_log;
 
+	err = copy_vdso(ve);
+	if (err)
+		goto err_vdso;
+
 do_init:
 	init_rwsem(&ve->op_sem);
 	INIT_LIST_HEAD(&ve->ve_list);
 	kmapset_init_key(&ve->sysfs_perms_key);
 	return &ve->css;
 
+err_vdso:
+	ve_log_destroy(ve);
 err_log:
 	free_percpu(ve->sched_lat_ve.cur);
 err_lat:
@@ -639,12 +673,22 @@ static void ve_offline(struct cgroup_subsys_state *css)
 	ve->ve_name = NULL;
 }
 
+static void ve_free_vdso(struct ve_struct *ve)
+{
+	if (ve->vdso_64 == &vdso_image_64)
+		return;
+
+	free_pages_exact(ve->vdso_64->data, ve->vdso_64->size);
+	kfree(ve->vdso_64);
+}
+
 static void ve_destroy(struct cgroup_subsys_state *css)
 {
 	struct ve_struct *ve = css_to_ve(css);
 
 	kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
 	ve_log_destroy(ve);
+	ve_free_vdso(ve);
 	free_percpu(ve->sched_lat_ve.cur);
 	kmem_cache_free(ve_cachep, ve);
 }
-- 
2.30.2