From: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>

Each VE should execute release agent notifications within its own
workqueue. This way we achieve more fine-grained control over
release_agent work flushing at VE destruction.
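In short, release notifications become a per-VE producer/consumer
scheme: cgroup1_check_for_release() links a cgroup onto
ve->release_list under ve->release_list_lock (via
ve_add_to_release_list()), and a single per-VE work item, running on
the VE's workqueue, drains the list. The following minimal sketch
shows only the drain pattern; the demo_* names are hypothetical
stand-ins for struct ve_struct and struct cgroup, not code from this
patch:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Hypothetical stand-ins for struct ve_struct / struct cgroup. */
struct demo_ve {
	spinlock_t		lock;		/* protects .pending */
	struct list_head	pending;	/* cgroups awaiting notification */
	struct work_struct	work;		/* queued on the VE's workqueue */
};

struct demo_cgrp {
	struct list_head	link;		/* hangs off demo_ve.pending */
};

/*
 * Drain the per-VE list. The lock is dropped around the (possibly
 * sleeping) notification so producers can keep queueing concurrently;
 * list_del_init() keeps list_empty() usable as the producers'
 * double-add check.
 */
static void demo_release_work(struct work_struct *work)
{
	struct demo_ve *ve = container_of(work, struct demo_ve, work);
	unsigned long flags;

	spin_lock_irqsave(&ve->lock, flags);
	while (!list_empty(&ve->pending)) {
		struct demo_cgrp *cgrp = list_first_entry(&ve->pending,
						struct demo_cgrp, link);

		list_del_init(&cgrp->link);
		spin_unlock_irqrestore(&ve->lock, flags);

		/* ... invoke the release agent for cgrp here (may sleep) ... */

		spin_lock_irqsave(&ve->lock, flags);
	}
	spin_unlock_irqrestore(&ve->lock, flags);
}

Dropping the spinlock around the notification matters because the
usermode helper may sleep, while producers such as cgroup_exit() call
in with css_set_lock (a spinlock) held.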
Short history of the patch: https://jira.sw.ru/browse/PSBM-83887

Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktk...@virtuozzo.com>

(cherry picked from vz7 commit 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c)

https://jira.sw.ru/browse/PSBM-108270
(cherry picked from vz8 commit 0eda8e4d0a9f6660a72f7584e8504a363c25a392)

+++ cgroup/ve: Do not run release_agent on non-running ve
+++ cgroup/ve: Move ve workqueue stop to ve_stop_ns()
(cherry picked from vz8 commit 1481d130afa32eecffe28b34fd00b57e11d2666f)

vz9 changes:
- merge fixup hunks from 1481d130afa3 related to workqueue locking
- do less code movement around cgroup->release_list and
  ve->release_list init
- add a comment to ve_rm_from_release_list about why we always take
  the right ve->release_list_lock there
- rework cgroup1_release_agent: the vz8 release-agent patches are
  badly broken by counterintuitive reordering, e.g.
  cgroup_path_ve_relative used in this patch was only defined in
  later patches...
- fix missing cgroup->release_list initialization

https://jira.sw.ru/browse/PSBM-134002

Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
---
 include/linux/cgroup-defs.h     |  8 +++-
 include/linux/cgroup.h          |  2 +
 include/linux/ve.h              | 10 +++++
 kernel/cgroup/cgroup-internal.h |  1 +
 kernel/cgroup/cgroup-v1.c       | 78 +++++++++++++++++++++++----------
 kernel/cgroup/cgroup.c          |  4 +-
 kernel/ve/ve.c                  | 73 ++++++++++++++++++++++++++++++
 7 files changed, 148 insertions(+), 28 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 315c399c5d3b..2035909f7a0a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -482,8 +482,12 @@ struct cgroup {
 	/* used to wait for offlining of csses */
 	wait_queue_head_t offline_waitq;
 
-	/* used to schedule release agent */
-	struct work_struct release_agent_work;
+	/*
+	 * Linked list running through all cgroups that can
+	 * potentially be reaped by the release agent. Protected by
+	 * release_list_lock.
+	 */
+	struct list_head release_list;
 
 	/* used to track pressure stalls */
 	struct psi_group psi;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f4d9b0f44a43..d2586a8397f3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -886,6 +886,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 		   struct cgroup_namespace *ns);
 
+void cgroup1_release_agent(struct work_struct *work);
+
 #ifdef CONFIG_VE
 extern int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 474863746a48..140919fcf4ce 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -104,8 +104,16 @@ struct ve_struct {
 	unsigned long aio_nr;
 	unsigned long aio_max_nr;
 #endif
+	/*
+	 * cgroups that want to notify about becoming
+	 * empty are linked to this release_list.
+	 */
+	struct list_head release_list;
+	spinlock_t release_list_lock;
+	/* Should take rcu_read_lock and check ve->is_running before queue */
 	struct workqueue_struct *wq;
+	struct work_struct release_agent_work;
 
 	struct vfsmount *devtmpfs_mnt;
 };
@@ -131,6 +139,8 @@ extern int nr_ve;
 extern unsigned int sysctl_ve_mount_nr;
 
 #ifdef CONFIG_VE
+void ve_add_to_release_list(struct cgroup *cgrp);
+void ve_rm_from_release_list(struct cgroup *cgrp);
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..81bf97510198 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -151,6 +151,7 @@ extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
 extern struct file_system_type cgroup_fs_type;
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp);
 
 /* iterate across the hierarchies */
 #define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 494b4bdd3265..5b3e8e79cc32 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -774,7 +774,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp)
 {
 	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
-		schedule_work(&cgrp->release_agent_work);
+		ve_add_to_release_list(cgrp);
 }
 
 /*
@@ -802,42 +802,72 @@
  */
 void cgroup1_release_agent(struct work_struct *work)
 {
-	struct cgroup *cgrp =
-		container_of(work, struct cgroup, release_agent_work);
+	struct ve_struct *ve =
+		container_of(work, struct ve_struct, release_agent_work);
+	/*
+	 * Release agent work can be queued only to a running ve (see
+	 * ve_add_to_release_list), and the ve waits for all queued works
+	 * to finish (see ve_workqueue_stop) before stopping, so we can
+	 * safely access ve_ns->cgroup_ns here without rcu_read_lock - it
+	 * won't disappear under us.
+	 */
+	struct cgroup_namespace *ve_cgroup_ns =
+		rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns;
 	char *pathbuf, *agentbuf;
 	char *argv[3], *envp[3];
+	unsigned long flags;
 	int ret;
 
-	/* snoop agent path and exit early if empty */
-	if (!cgrp->root->release_agent_path[0])
-		return;
-
 	/* prepare argument buffers */
 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!pathbuf || !agentbuf)
 		goto out_free;
 
-	spin_lock(&release_agent_path_lock);
-	strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
-	spin_unlock(&release_agent_path_lock);
-	if (!agentbuf[0])
-		goto out_free;
+	spin_lock_irqsave(&ve->release_list_lock, flags);
+	while (!list_empty(&ve->release_list)) {
+		struct cgroup *cgrp;
 
-	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	if (ret < 0 || ret >= PATH_MAX)
-		goto out_free;
-
-	argv[0] = agentbuf;
-	argv[1] = pathbuf;
-	argv[2] = NULL;
+		cgrp = list_entry(ve->release_list.next,
+				  struct cgroup,
+				  release_list);
+		list_del_init(&cgrp->release_list);
+		spin_unlock_irqrestore(&ve->release_list_lock, flags);
 
-	/* minimal command environment */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
+		/* snoop agent path and exit early if empty */
+		if (!cgrp->root->release_agent_path[0])
+			goto continue_locked;
 
-	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		spin_lock(&release_agent_path_lock);
+		strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+		spin_unlock(&release_agent_path_lock);
+		if (!agentbuf[0])
+			goto continue_locked;
+
+		ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, ve_cgroup_ns);
+		if (ret < 0 || ret >= PATH_MAX)
+			goto continue_locked;
+
+		argv[0] = agentbuf;
+		argv[1] = pathbuf;
+		argv[2] = NULL;
+
+		/* minimal command environment */
+		envp[0] = "HOME=/";
+		envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+		envp[2] = NULL;
+
+		ret = call_usermodehelper_ve(ve, argv[0], argv, envp,
+					     UMH_WAIT_EXEC);
+
+		if (ret < 0 && ve == &ve0)
+			pr_warn_ratelimited("cgroup1_release_agent "
+					    "%s %s failed: %d\n",
+					    agentbuf, pathbuf, ret);
+continue_locked:
+		spin_lock_irqsave(&ve->release_list_lock, flags);
+	}
+	spin_unlock_irqrestore(&ve->release_list_lock, flags);
 out_free:
 	kfree(agentbuf);
 	kfree(pathbuf);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ff9f5b9dc219..c0c92a4cf791 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2167,6 +2167,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->self.children);
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
+	INIT_LIST_HEAD(&cgrp->release_list);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->self.cgroup = cgrp;
 	cgrp->self.flags |= CSS_ONLINE;
@@ -2180,7 +2181,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
 void init_cgroup_root(struct cgroup_fs_context *ctx)
@@ -5327,7 +5327,6 @@ static void css_free_rwork_fn(struct work_struct *work)
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
 		cgroup1_pidlist_destroy_all(cgrp);
-		cancel_work_sync(&cgrp->release_agent_work);
 
 		if (cgroup_parent(cgrp)) {
 			/*
@@ -5879,6 +5878,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * the migration path.
 	 */
 	cgrp->self.flags &= ~CSS_ONLINE;
+	ve_rm_from_release_list(cgrp);
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 30ace4a07568..b7b66e835a8a 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -72,6 +72,11 @@ struct ve_struct ve0 = {
 	.meminfo_val		= VE_MEMINFO_SYSTEM,
 	.vdso_64		= (struct vdso_image*)&vdso_image_64,
 	.vdso_32		= (struct vdso_image*)&vdso_image_32,
+	.release_list_lock	= __SPIN_LOCK_UNLOCKED(
+					ve0.release_list_lock),
+	.release_list		= LIST_HEAD_INIT(ve0.release_list),
+	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
+					cgroup1_release_agent),
 };
 EXPORT_SYMBOL(ve0);
@@ -476,6 +481,69 @@ static void ve_workqueue_stop(struct ve_struct *ve)
 	ve->wq = NULL;
 }
 
+/*
+ * ve_add_to_release_list - called from cgroup1_check_for_release to put a
+ * cgroup into a release workqueue. There are two codepaths that lead to this
+ * function. One starts from cgroup_exit() which holds css_set_lock, another
+ * one from cgroup_destroy_locked which does not hold css_set_lock. So we
+ * should not use any reschedulable (sleeping) locks here.
+ */
+void ve_add_to_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	unsigned long flags;
+	int need_schedule_work = 0;
+
+	rcu_read_lock();
+	ve = cgroup_ve_owner(cgrp);
+	if (!ve)
+		ve = &ve0;
+
+	if (!ve->is_running) {
+		rcu_read_unlock();
+		return;
+	}
+
+	spin_lock_irqsave(&ve->release_list_lock, flags);
+	if (!cgroup_is_dead(cgrp) &&
+	    list_empty(&cgrp->release_list)) {
+		list_add(&cgrp->release_list, &ve->release_list);
+		need_schedule_work = 1;
+	}
+	spin_unlock_irqrestore(&ve->release_list_lock, flags);
+
+	if (need_schedule_work)
+		queue_work(ve->wq, &ve->release_agent_work);
+
+	rcu_read_unlock();
+}
+
+/*
+ * As workqueue destruction happens before we unset CGRP_VE_ROOT and
+ * cgroup->ve_owner, and as destruction waits for all current works
+ * to finish, we can rely on the fact that if a cgroup is in the
+ * release_list of some ve, then cgroup_ve_owner will return exactly
+ * that ve, and we will take the right ve->release_list_lock for
+ * operating on our cgroup->release_list.
+ */
+void ve_rm_from_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	unsigned long flags;
+
+	rcu_read_lock();
+	ve = cgroup_ve_owner(cgrp);
+	if (!ve)
+		ve = &ve0;
+
+	spin_lock_irqsave(&ve->release_list_lock, flags);
+	if (!list_empty(&cgrp->release_list))
+		list_del_init(&cgrp->release_list);
+	spin_unlock_irqrestore(&ve->release_list_lock, flags);
+	rcu_read_unlock();
+}
+
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
 {
@@ -718,6 +786,11 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 		goto err_lat;
 
 	ve->features = VE_FEATURES_DEF;
+
+	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
+	spin_lock_init(&ve->release_list_lock);
+	INIT_LIST_HEAD(&ve->release_list);
+
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
-- 
2.31.1
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel