A task can exit and become a zombie, and outlive its ve. Because exited
task is moved into init_css_set, so it does not hold any reference back
to ve.

More specifically that can happen in two cases:

1) If task is not inside pid namespace of ve and zap_pid_ns_processes
   will not reap/collect the task;
2) If task is an init of root pid namespace of ve, nothing guarantees
   that it can't become a zombie;

And if someone reads e.g.: /proc/pid/stat for such a task it may lead to
a crash on accessing freed ->task_ve memory.

https://virtuozzo.atlassian.net/browse/PSBM-157285

Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
---
v2: s/current/task/ in ve_exit
---
 kernel/ve/ve.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 70e588b7df4c6..ff81fce4adcd0 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -839,6 +839,14 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
        if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
                return;
 
+       /*
+        * To prevent use-after-free (e.g. reading /proc/pid/stat) in case
+        * zombie container init outlives the ve_struct.
+        *
+        * Note: synchronize_rcu in ve_drop_context makes it rcu safe.
+        */
+       rcu_assign_pointer(current->task_ve, &ve0);
+
        cgroup_unmark_ve_roots(ve);
 
        ve_workqueue_stop(ve);
@@ -1115,6 +1123,32 @@ static void ve_attach(struct cgroup *cg, struct 
cgroup_taskset *tset)
        on_each_cpu(ve_update_cpuid_faulting, NULL, 1);
 }
 
+/*
+ * When a container task exits it is moved into init_css_set, see cgroup_exit.
+ * So the task effectively puts its reference to ve_struct of the container,
+ * and now nothing prevents ve_struct from being freed while the task is still
+ * there.
+ *
+ * We need to make sure that this task's ->task_ve is not pointing to the
+ * container ve before putting the reference, else the use of ->task_ve from
+ * the task can lead to use-after-free (e.g. reading /proc/pid/stat).
+ *
+ * One exception to that rule is the init process of the container, it needs
+ * ->task_ve in ve_stop_ns / ve_exit_ns, and holds a reference to ve_struct.
+ */
+static void ve_exit(struct cgroup *cg, struct cgroup *oldcg, struct 
task_struct *task)
+{
+       struct ve_struct *ve = task->task_ve;
+
+       /*
+        * Pairs with synchronize_rcu in ve_grab_context and ve_drop_context
+        */
+       rcu_read_lock();
+       if (!ve->ve_ns || ve->init_task != task)
+               rcu_assign_pointer(task->task_ve, &ve0);
+       rcu_read_unlock();
+}
+
 static int ve_state_read(struct cgroup *cg, struct cftype *cft,
                         struct seq_file *m)
 {
@@ -1898,6 +1932,7 @@ struct cgroup_subsys ve_subsys = {
        .css_free       = ve_destroy,
        .can_attach     = ve_can_attach,
        .attach         = ve_attach,
+       .exit           = ve_exit,
        .base_cftypes   = ve_cftypes,
 };
 EXPORT_SYMBOL(ve_subsys);
-- 
2.45.2

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to