From: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> Introduced 'real_start_time_ct' field in task_struct.
The value is READ: 1. When the process lives inside of a ve group and any process inside of the same ve group wants to know its start time by reading its /proc/[pid]/stat file. 2. At container suspend operation to store this value to a dump image. The value is WRITTEN: 1. At creation time (copy_process function) 1.1. If a process is being created outside of ve group / on host, then this value is initialized to 0 1.2. If a process is being created by a process already living in ve group, this value is calculated as host_uptime - ve_uptime. 2. During attach to ve. (ve_attach function). The process can be created on a host and later attached to ve. Its container's start_time value has been already initialized to 0 at creation time. After the process enters the domain of a ve, the value should be initialized. Note that the process can be attached to a non-running container, in which case its start_time value should not be calculated and should be left initialized to 0. 3. At container restore via prctl (prctl_set_task_ct_fields function). In this case the value is only settable outside of a container. During restore the processes would be created from the dump image. At restore step each process will execute prctl to set its start_time value, read from the dump. This would only be permitted during pseudosuper ve mode. The value is set as is (read from the dump), without any calculations. 
https://jira.sw.ru/browse/PSBM-64123 Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> (cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0) Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com> Reviewed-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> Changes vz9: - separate from unrelated sys_times hunks - switch to time namespaces - rename real_start_time -> start_boottime (cherry picked from vz8 commit 222870c58a3b4a284698e8cf7a692f7fea577b13) Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> --- fs/proc/array.c | 5 +++++ include/linux/sched.h | 6 ++++++ include/linux/ve.h | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 8 ++++++++ kernel/fork.c | 11 +++++++++++ kernel/sys.c | 23 +++++++++++++++++++++++ kernel/ve/ve.c | 1 + 7 files changed, 84 insertions(+) diff --git a/fs/proc/array.c b/fs/proc/array.c index 24e5c06c2ed0..a5e02af46f31 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -588,6 +588,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, start_time = nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime)); +#ifdef CONFIG_VE + if (!is_super) + start_time = nsec_to_clock_t(task->start_boottime_ct); +#endif + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); proc_task_name(m, task, false); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7b4ef0e90c05..332c36a8f4c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -957,6 +957,12 @@ struct task_struct { /* Boot based time in nsecs: */ u64 start_boottime; + /* + * This is a Container-side copy of 'start_boottime' field + * shown from inside of a Container and modified by host. 
+ */ + u64 start_boottime_ct; + /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; diff --git a/include/linux/ve.h b/include/linux/ve.h index 4de91c86a084..c2ff0602cb25 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/vzstat.h> #include <asm/vdso.h> +#include <linux/time_namespace.h> struct nsproxy; struct veip_struct; @@ -110,6 +111,35 @@ static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css) extern struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id); +static u64 ve_get_uptime(struct ve_struct *ve) +{ + struct timespec64 tp = ns_to_timespec64(0); + struct time_namespace *time_ns; + struct nsproxy *ve_ns; + + rcu_read_lock(); + ve_ns = rcu_dereference(ve->ve_ns); + if (!ve_ns) { + rcu_read_unlock(); + goto out; + } + + time_ns = get_time_ns(ve_ns->time_ns); + rcu_read_unlock(); + + ktime_get_boottime_ts64(&tp); + tp = timespec64_add(tp, time_ns->offsets.boottime); + put_time_ns(time_ns); +out: + return timespec64_to_ns(&tp); +} + +static inline void ve_set_task_start_time(struct ve_struct *ve, + struct task_struct *t) +{ + t->start_boottime_ct = ve_get_uptime(ve); +} + #define ve_feature_set(ve, f) \ !!((ve)->features & VE_FEATURE_##f) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 967d9c55323d..709fd88ede27 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -267,4 +267,12 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ # define PR_SCHED_CORE_MAX 4 +/* Set task container related fields */ +#define PR_SET_TASK_CT_FIELDS 1000 +#define PR_TASK_CT_FIELDS_START_BOOTTIME (1UL << 0) + +struct prctl_task_ct_fields { + __s64 start_boottime; +}; + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index fa42bf77ddef..61adb6409f0d 100644 --- 
a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> +#include <linux/ve.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -1867,6 +1868,9 @@ static __latent_entropy struct task_struct *copy_process( struct file *pidfile = NULL; u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; +#ifdef CONFIG_VE + struct ve_struct *ve = get_exec_env(); +#endif /* * Don't allow sharing the root directory with processes in a different @@ -2233,6 +2237,13 @@ static __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); + p->start_boottime_ct = 0; + +#ifdef CONFIG_VE + if (!ve_is_super(ve)) + ve_set_task_start_time(ve, p); +#endif + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! diff --git a/kernel/sys.c b/kernel/sys.c index 8378cb0f5434..ae566d26ab6e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2268,6 +2268,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti } #endif +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg, + unsigned long flags) +{ + struct prctl_task_ct_fields params; +#ifdef CONFIG_VE + struct ve_struct *ve = t->task_ve; + + if (!ve_is_super(ve) && !ve->is_pseudosuper) + return -EPERM; +#endif + + if (copy_from_user(&params, (const void __user *)arg, sizeof(params))) + return -EFAULT; + + if (flags & PR_TASK_CT_FIELDS_START_BOOTTIME) + t->start_boottime_ct = (u64)params.start_boottime; + + return 0; +} + static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* @@ -2568,6 +2588,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_TASK_CT_FIELDS: + error = prctl_set_task_ct_fields(me, arg2, arg3); + break; default: error = -EINVAL; 
break; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index d45a10c02493..c93518fe4a33 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -809,6 +809,7 @@ static void ve_attach(struct cgroup_taskset *tset) /* Leave parent exec domain */ task->parent_exec_id--; + ve_set_task_start_time(ve, task); task->task_ve = ve; } } -- 2.31.1 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel