The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh9-5.14.0-4.vz9.10.12 ------> commit ee84e05864f9acf498132dbf166e0462d0d5665e Author: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> Date: Wed Oct 20 11:43:57 2021 +0300
ve/cgroup: Set release_agent_path for root cgroups separately Previous patches already switched release agent binary calling mechanism to call the binary in ve. Now we need to switch the binary path to per-ve path. So we store release agent path in ve_ra_data structures in per-ve ve->ra_data_list. Short history of the patch: Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> https://jira.sw.ru/browse/PSBM-83887 (cherry-picked from vz7 commit f1199bd9589b7c0914343dcc72f49ddaa9b98496) https://jira.sw.ru/browse/PSBM-108270 (cherry picked from vz8 commit 40381d9afaff5618fbe93428f125a7acc2bde56d) +++ ve/cgroup: Add release_agent to each container root cgroup (cherry picked from vz8 commit 00bd9c411dba17692408fcdd1769fa90b275313b) vz9 changes: - rework and cleanup patch description - don't remove cgroup_kn_lock_live error checking - switch to new ve_ra_data_set / ve_ra_data_get_path_locked - add BUG_ON if cft->file_offset is set as that can lead to cgroup_add_file(NULL,...) crash - lock cgroup_mutex as we need it for cgroup_add_file / cgroup_rm_file and cgroup_is_dead consistency - fixup hunks from 00bd9c411dba are merged here and there but without excess cft_q_node related stuff - if for some reason ve sees host root cgroup don't allow writing to it and showing it Note: ve_ra_data_set does allocation so we do not want it under rcu_read_lock, so we get_ve and release rcu_read_lock. We can get_ve (not tryget) as we just saw ve_owner so ve is not fully stopped yet and should be populated. https://jira.sw.ru/browse/PSBM-134002 Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> Reviewed-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com> Reviewed-by: Kirill Tkhai <ktk...@virtuozzo.com> --- include/linux/cgroup-defs.h | 3 -- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 105 +++++++++++++++++++++++++++------------- kernel/cgroup/cgroup.c | 40 ++++++++++++++- 4 files changed, 111 insertions(+), 38 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 2035909f7a0a..ef8ebd8a1363 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -539,9 +539,6 @@ struct cgroup_root { /* Hierarchy-specific flags */ unsigned int flags; - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 81bf97510198..c23ea4aa8e19 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -267,6 +267,7 @@ extern const struct proc_ns_operations cgroupns_operations; extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; extern const struct fs_parameter_spec cgroup1_fs_parameters[]; +struct cftype *get_cftype_by_name(const char *name); int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 5b3e8e79cc32..fe781bda5962 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -39,9 +39,6 @@ static bool cgroup_no_v1_named; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* protects cgroup_subsys->release_agent_path */ -static DEFINE_SPINLOCK(release_agent_path_lock); - bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); @@ -542,28 +539,62 @@ static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup *root_cgrp; + struct ve_struct *ve = NULL; struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + int ret; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); + + rcu_read_lock(); + root_cgrp = cgroup_ve_root1(cgrp); + if (!root_cgrp) { + if (ve_is_super(get_exec_env()) && cgrp == &cgrp->root->cgrp) + ve = &ve0; + } else { + if (root_cgrp == cgrp) + ve = rcu_dereference(root_cgrp->ve_owner); + } + get_ve(ve); + rcu_read_unlock(); + + if (ve) { + ret = ve_ra_data_set(ve, cgrp->root, strstrip(buf)); + } else { + /* The ve is stopped or we have a bad cgrp */ + ret = -ENODEV; + } + + put_ve(ve); cgroup_kn_unlock(of->kn); - return nbytes; + return ret ? : nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { + const char *release_agent = NULL; + struct cgroup *root_cgrp; struct cgroup *cgrp = seq_css(seq)->cgroup; + struct ve_struct *ve = NULL; + + rcu_read_lock(); + root_cgrp = cgroup_ve_root1(cgrp); + if (!root_cgrp) { + if (ve_is_super(get_exec_env()) && cgrp == &cgrp->root->cgrp) + ve = &ve0; + } else { + if (root_cgrp == cgrp) + ve = rcu_dereference(root_cgrp->ve_owner); + } + + if (ve) + release_agent = ve_ra_data_get_path_locked(ve, cgrp->root); + if (release_agent) + seq_puts(seq, release_agent); + rcu_read_unlock(); - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } @@ -661,7 +692,7 @@ struct cftype cgroup1_base_files[] = { }, { .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, @@ -674,6 +705,17 @@ struct cftype cgroup1_base_files[] = { { } /* terminate */ }; +struct cftype *get_cftype_by_name(const char *name) +{ + struct cftype *cft; + + for (cft = cgroup1_base_files; cft->name[0] != '\0'; cft++) { + if (!strcmp(cft->name, name)) + return cft; + } + return NULL; +} + #define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1) /* Display information about each subsystem and each hierarchy */ @@ -827,6 +869,7 @@ void cgroup1_release_agent(struct work_struct *work) spin_lock_irqsave(&ve->release_list_lock, flags); while (!list_empty(&ve->release_list)) { struct cgroup *cgrp; + const char *release_agent; cgrp = list_entry(ve->release_list.next, struct cgroup, @@ -834,15 +877,14 @@ void cgroup1_release_agent(struct work_struct *work) list_del_init(&cgrp->release_list); spin_unlock_irqrestore(&ve->release_list_lock, flags); - /* snoop agent path and exit early if empty */ - if (!cgrp->root->release_agent_path[0]) - goto continue_locked; - - spin_lock(&release_agent_path_lock); - strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); - spin_unlock(&release_agent_path_lock); - if (!agentbuf[0]) + rcu_read_lock(); + release_agent = ve_ra_data_get_path_locked(ve, cgrp->root); + if (!release_agent || !release_agent[0]) { + rcu_read_unlock(); goto continue_locked; + } + strlcpy(agentbuf, release_agent, PATH_MAX); + rcu_read_unlock(); ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, ve_cgroup_ns); if (ret < 0 || ret >= PATH_MAX) @@ -914,6 +956,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { + const char *release_agent; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; @@ -928,12 +971,11 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - + rcu_read_lock(); + release_agent = ve_ra_data_get_path_locked(get_exec_env(), root); + if (release_agent && release_agent[0]) + seq_show_option(seq, "release_agent", release_agent); + rcu_read_unlock(); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) @@ -1144,11 +1186,8 @@ int cgroup1_reconfigure(struct fs_context *fc) WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - if (ctx->release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); - spin_unlock(&release_agent_path_lock); - } + if (ctx->release_agent) + ret = ve_ra_data_set(get_exec_env(), root, ctx->release_agent); trace_cgroup_remount(root); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fcb3041abb7..396c0dc98b64 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -68,6 +68,8 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) +#define CGROUP_FILENAME_RELEASE_AGENT "release_agent" + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -2050,12 +2052,21 @@ static inline bool ve_check_root_cgroups(struct css_set *cset) return false; } +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, + struct cftype *cft, bool activate); + int cgroup_mark_ve_roots(struct ve_struct *ve) { struct cgrp_cset_link *link; struct css_set *cset; struct cgroup *cgrp; + struct cftype *cft; + int err = 0; + + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT); + BUG_ON(!cft || cft->file_offset); + mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); /* @@ -2068,6 +2079,7 @@ int cgroup_mark_ve_roots(struct ve_struct *ve) if (ve_check_root_cgroups(cset)) { spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); return -EINVAL; } @@ -2079,12 +2091,30 @@ int cgroup_mark_ve_roots(struct ve_struct *ve) rcu_assign_pointer(cgrp->ve_owner, ve); set_bit(CGRP_VE_ROOT, &cgrp->flags); + + /* + * The cgroupns can hold reference on cgroups which are already + * offline with already removed directory on the filesystem. We + * should not add files to them. + */ + if (!cgroup_is_dead(cgrp)) { + err = cgroup_add_file(NULL, cgrp, cft, true); + if (err) { + pr_warn("failed to add file to VE_ROOT cgroup," + " err:%d\n", err); + break; + } + } } link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]); spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); synchronize_rcu(); - return 0; + + if (err) + cgroup_unmark_ve_roots(ve); + return err; } void cgroup_unmark_ve_roots(struct ve_struct *ve) @@ -2092,7 +2122,11 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) struct cgrp_cset_link *link; struct css_set *cset; struct cgroup *cgrp; + struct cftype *cft; + + cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT); + mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); /* @@ -2109,11 +2143,13 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) if (!is_virtualized_cgroup(cgrp)) continue; + cgroup_rm_file(cgrp, cft); rcu_assign_pointer(cgrp->ve_owner, NULL); clear_bit(CGRP_VE_ROOT, &cgrp->flags); } spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); /* ve_owner == NULL will be visible */ synchronize_rcu(); } @@ -2196,7 +2232,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) root->flags = ctx->flags; if (ctx->release_agent) - strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); + ve_ra_data_set(get_exec_env(), root, ctx->release_agent); if (ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel