From: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>

Each VE should execute release agent notifications within its own
workqueue. This way we achieve more fine-grained control over
release_agent work flushing at VE destruction.
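In short, release notifications become a per-VE producer/consumer
scheme: cgroup1_check_for_release() links a cgroup onto
ve->release_list under ve->release_list_lock (via
ve_add_to_release_list()), and a single per-VE work item, running on
the VE's workqueue, drains the list. The following minimal sketch
shows only the drain pattern; the demo_* names are hypothetical
stand-ins for struct ve_struct and struct cgroup, not code from this
patch:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Hypothetical stand-ins for struct ve_struct / struct cgroup. */
struct demo_ve {
	spinlock_t		lock;		/* protects .pending */
	struct list_head	pending;	/* cgroups awaiting notification */
	struct work_struct	work;		/* queued on the VE's workqueue */
};

struct demo_cgrp {
	struct list_head	link;		/* hangs off demo_ve.pending */
};

/*
 * Drain the per-VE list. The lock is dropped around the (possibly
 * sleeping) notification so producers can keep queueing concurrently;
 * list_del_init() keeps list_empty() usable as the producers'
 * double-add check.
 */
static void demo_release_work(struct work_struct *work)
{
	struct demo_ve *ve = container_of(work, struct demo_ve, work);
	unsigned long flags;

	spin_lock_irqsave(&ve->lock, flags);
	while (!list_empty(&ve->pending)) {
		struct demo_cgrp *cgrp = list_first_entry(&ve->pending,
						struct demo_cgrp, link);

		list_del_init(&cgrp->link);
		spin_unlock_irqrestore(&ve->lock, flags);

		/* ... invoke the release agent for cgrp here (may sleep) ... */

		spin_lock_irqsave(&ve->lock, flags);
	}
	spin_unlock_irqrestore(&ve->lock, flags);
}

Dropping the spinlock around the notification matters because the
usermode helper may sleep, while producers such as cgroup_exit() call
in with css_set_lock (a spinlock) held.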
Short history of the patch: https://jira.sw.ru/browse/PSBM-83887

Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktk...@virtuozzo.com>

(cherry picked from vz7 commit 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c)

https://jira.sw.ru/browse/PSBM-108270
(cherry picked from vz8 commit 0eda8e4d0a9f6660a72f7584e8504a363c25a392)

+++ cgroup/ve: Do not run release_agent on non-running ve
+++ cgroup/ve: Move ve workqueue stop to ve_stop_ns()
(cherry picked from vz8 commit 1481d130afa32eecffe28b34fd00b57e11d2666f)

vz9 changes:
- merge fixup hunks from 1481d130afa3 related to workqueue locking
- do less code movement around cgroup->release_list and
  ve->release_list init
- add a comment to ve_rm_from_release_list about why we always take
  the right ve->release_list_lock there
- rework cgroup1_release_agent: the vz8 release-agent patches are
  badly broken by counterintuitive reordering, e.g.
  cgroup_path_ve_relative used in this patch was only defined in
  later patches...
- fix missing cgroup->release_list initialization

https://jira.sw.ru/browse/PSBM-134002

Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
---
 include/linux/cgroup-defs.h     |  8 +++-
 include/linux/cgroup.h          |  2 +
 include/linux/ve.h              | 10 +++++
 kernel/cgroup/cgroup-internal.h |  1 +
 kernel/cgroup/cgroup-v1.c       | 78 +++++++++++++++++++++++----------
 kernel/cgroup/cgroup.c          |  4 +-
 kernel/ve/ve.c                  | 73 ++++++++++++++++++++++++++++++
 7 files changed, 148 insertions(+), 28 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 315c399c5d3b..2035909f7a0a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -482,8 +482,12 @@ struct cgroup {
 	/* used to wait for offlining of csses */
 	wait_queue_head_t offline_waitq;
 
-	/* used to schedule release agent */
-	struct work_struct release_agent_work;
+	/*
+	 * Linked list running through all cgroups that can
+	 * potentially be reaped by the release agent. Protected by
+	 * release_list_lock.
+	 */
+	struct list_head release_list;
 
 	/* used to track pressure stalls */
 	struct psi_group psi;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f4d9b0f44a43..d2586a8397f3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -886,6 +886,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 		   struct cgroup_namespace *ns);
 
+void cgroup1_release_agent(struct work_struct *work);
+
 #ifdef CONFIG_VE
 extern int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 474863746a48..140919fcf4ce 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -104,8 +104,16 @@ struct ve_struct {
 	unsigned long aio_nr;
 	unsigned long aio_max_nr;
 #endif
+	/*
+	 * cgroups that want to notify about becoming
+	 * empty are linked to this release_list.
+	 */
+	struct list_head release_list;
+	spinlock_t release_list_lock;
+	/* Should take rcu_read_lock and check ve->is_running before queue */
 	struct workqueue_struct *wq;
+	struct work_struct release_agent_work;
 
 	struct vfsmount *devtmpfs_mnt;
 };
@@ -131,6 +139,8 @@ extern int nr_ve;
 extern unsigned int sysctl_ve_mount_nr;
 
 #ifdef CONFIG_VE
+void ve_add_to_release_list(struct cgroup *cgrp);
+void ve_rm_from_release_list(struct cgroup *cgrp);
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..81bf97510198 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -151,6 +151,7 @@ extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
 extern struct file_system_type cgroup_fs_type;
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp);
 
 /* iterate across the hierarchies */
 #define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 494b4bdd3265..5b3e8e79cc32 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -774,7 +774,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp)
 {
 	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
-		schedule_work(&cgrp->release_agent_work);
+		ve_add_to_release_list(cgrp);
 }
 
 /*
@@ -802,42 +802,72 @@
  */
 void cgroup1_release_agent(struct work_struct *work)
 {
-	struct cgroup *cgrp =
-		container_of(work, struct cgroup, release_agent_work);
+	struct ve_struct *ve =
+		container_of(work, struct ve_struct, release_agent_work);
+	/*
+	 * Release agent work can be queued only to a running ve (see
+	 * ve_add_to_release_list), and the ve waits for all queued works
+	 * to finish (see ve_workqueue_stop) before stopping, so we can
+	 * safely access ve_ns->cgroup_ns here without rcu_read_lock - it
+	 * won't disappear under us.
+	 */
+	struct cgroup_namespace *ve_cgroup_ns =
+		rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns;
 	char *pathbuf, *agentbuf;
 	char *argv[3], *envp[3];
+	unsigned long flags;
 	int ret;
 
-	/* snoop agent path and exit early if empty */
-	if (!cgrp->root->release_agent_path[0])
-		return;
-
 	/* prepare argument buffers */
 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!pathbuf || !agentbuf)
 		goto out_free;
 
-	spin_lock(&release_agent_path_lock);
-	strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
-	spin_unlock(&release_agent_path_lock);
-	if (!agentbuf[0])
-		goto out_free;
+	spin_lock_irqsave(&ve->release_list_lock, flags);
+	while (!list_empty(&ve->release_list)) {
+		struct cgroup *cgrp;
 
-	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	if (ret < 0 || ret >= PATH_MAX)
-		goto out_free;
-
-	argv[0] = agentbuf;
-	argv[1] = pathbuf;
-	argv[2] = NULL;
+		cgrp = list_entry(ve->release_list.next,
+				  struct cgroup,
+				  release_list);
+		list_del_init(&cgrp->release_list);
+		spin_unlock_irqrestore(&ve->release_list_lock, flags);
 
-	/* minimal command environment */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
+		/* snoop agent path and exit early if empty */
+		if (!cgrp->root->release_agent_path[0])
+			goto continue_locked;
 
-	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		spin_lock(&release_agent_path_lock);
+		strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+		spin_unlock(&release_agent_path_lock);
+		if (!agentbuf[0])
+			goto continue_locked;
+
+		ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, ve_cgroup_ns);
+		if (ret < 0 || ret >= PATH_MAX)
+			goto continue_locked;
+
+		argv[0] = agentbuf;
+		argv[1] = pathbuf;
+		argv[2] = NULL;
+
+		/* minimal command environment */
+		envp[0] = "HOME=/";
+		envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+		envp[2] = NULL;
+
+		ret = call_usermodehelper_ve(ve, argv[0], argv, envp,
+					     UMH_WAIT_EXEC);
+
+		if (ret < 0 && ve == &ve0)
+			pr_warn_ratelimited("cgroup1_release_agent "
+					    "%s %s failed: %d\n",
+					    agentbuf, pathbuf, ret);
+continue_locked:
+		spin_lock_irqsave(&ve->release_list_lock, flags);
+	}
+	spin_unlock_irqrestore(&ve->release_list_lock, flags);
 out_free:
 	kfree(agentbuf);
 	kfree(pathbuf);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ff9f5b9dc219..c0c92a4cf791 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2167,6 +2167,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->self.children);
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
+	INIT_LIST_HEAD(&cgrp->release_list);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->self.cgroup = cgrp;
 	cgrp->self.flags |= CSS_ONLINE;
@@ -2180,7 +2181,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
 void init_cgroup_root(struct cgroup_fs_context *ctx)
@@ -5327,7 +5327,6 @@ static void css_free_rwork_fn(struct work_struct *work)
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
 		cgroup1_pidlist_destroy_all(cgrp);
-		cancel_work_sync(&cgrp->release_agent_work);
 
 		if (cgroup_parent(cgrp)) {
 			/*
@@ -5879,6 +5878,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * the migration path.
 	 */
 	cgrp->self.flags &= ~CSS_ONLINE;
+	ve_rm_from_release_list(cgrp);
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 30ace4a07568..b7b66e835a8a 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -72,6 +72,11 @@ struct ve_struct ve0 = {
 	.meminfo_val		= VE_MEMINFO_SYSTEM,
 	.vdso_64		= (struct vdso_image*)&vdso_image_64,
 	.vdso_32		= (struct vdso_image*)&vdso_image_32,
+	.release_list_lock	= __SPIN_LOCK_UNLOCKED(
+					ve0.release_list_lock),
+	.release_list		= LIST_HEAD_INIT(ve0.release_list),
+	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
+					cgroup1_release_agent),
 };
 EXPORT_SYMBOL(ve0);
@@ -476,6 +481,69 @@ static void ve_workqueue_stop(struct ve_struct *ve)
 	ve->wq = NULL;
 }
 
+/*
+ * ve_add_to_release_list - called from cgroup1_check_for_release to put a
+ * cgroup into a release workqueue. There are two codepaths that lead to this
+ * function. One starts from cgroup_exit() which holds css_set_lock, another
+ * one from cgroup_destroy_locked which does not hold css_set_lock. So we
+ * should not use any reschedulable (sleeping) locks here.
+ */
+void ve_add_to_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	unsigned long flags;
+	int need_schedule_work = 0;
+
+	rcu_read_lock();
+	ve = cgroup_ve_owner(cgrp);
+	if (!ve)
+		ve = &ve0;
+
+	if (!ve->is_running) {
+		rcu_read_unlock();
+		return;
+	}
+
+	spin_lock_irqsave(&ve->release_list_lock, flags);
+	if (!cgroup_is_dead(cgrp) &&
+	    list_empty(&cgrp->release_list)) {
+		list_add(&cgrp->release_list, &ve->release_list);
+		need_schedule_work = 1;
+	}
+	spin_unlock_irqrestore(&ve->release_list_lock, flags);
+
+	if (need_schedule_work)
+		queue_work(ve->wq, &ve->release_agent_work);
+
+	rcu_read_unlock();
+}
+
+/*
+ * As workqueue destruction happens before we unset CGRP_VE_ROOT and
+ * cgroup->ve_owner, and as destruction waits for all current works
+ * to finish, we can rely on the fact that if a cgroup is in the
+ * release_list of some ve, then cgroup_ve_owner will return exactly
+ * that ve, and we will take the right ve->release_list_lock for
+ * operating on our cgroup->release_list.
+ */
+void ve_rm_from_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	unsigned long flags;
+
+	rcu_read_lock();
+	ve = cgroup_ve_owner(cgrp);
+	if (!ve)
+		ve = &ve0;
+
+	spin_lock_irqsave(&ve->release_list_lock, flags);
+	if (!list_empty(&cgrp->release_list))
+		list_del_init(&cgrp->release_list);
+	spin_unlock_irqrestore(&ve->release_list_lock, flags);
+	rcu_read_unlock();
+}
+
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
 {
@@ -718,6 +786,11 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 		goto err_lat;
 
 	ve->features = VE_FEATURES_DEF;
+
+	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
+	spin_lock_init(&ve->release_list_lock);
+	INIT_LIST_HEAD(&ve->release_list);
+
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
-- 
2.31.1
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel