The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh9-5.14.0-4.vz9.10.17 ------> commit b5caa8e3e42efeba5d4e57ce79dcb1562047396b Author: Nikita Yushchenko <nikita.yushche...@virtuozzo.com> Date: Mon Oct 25 16:57:10 2021 +0300
binfmt_misc: fix mount after umount in CT The assumption that bm_fill_super() is not called for the second time for CT is wrong: umount operation clears sb->s_root, which causes vfs_get_super() to call fill_super again on the next mount. Make bm_fill_super() handle multiple calls correctly: - initialize bm_data and set ve->binfmt_misc only if it is not done before, - delay its destruction until CT destruction. https://jira.sw.ru/browse/PSBM-133968 Fixes: edb6893b99b2 ("ve/fs/binfmt: virtualization") Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com> --- fs/binfmt_misc.c | 58 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 628d4fc2db94..a7ec4daff163 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -805,39 +805,32 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc) /* last one */ {""} }; +#ifdef CONFIG_VE struct ve_struct *ve = get_exec_env(); - struct binfmt_misc *bm_data; + struct binfmt_misc *bm_data = ve->binfmt_misc; +#else + static struct binfmt_misc *bm_data = NULL; +#endif - /* - * bm_get_tree() - * get_tree_keyed(fc, bm_fill_super, get_ve(ve)) - * fc->s_fs_info = current VE - * vfs_get_super(fc, vfs_get_keyed_super, bm_fill_super) - * sb = sget_fc(fc, test, set_anon_super_fc) - * if (!sb->s_root) { - * err = bm_fill_super(sb, fc); - * - * => we should never get here with initialized ve->binfmt_misc. 
- */ - if (WARN_ON_ONCE(ve->binfmt_misc)) - return -EEXIST; + if (!bm_data) { + bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); + if (!bm_data) + return -ENOMEM; - bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); - if (!bm_data) - return -ENOMEM; + INIT_LIST_HEAD(&bm_data->entries); + rwlock_init(&bm_data->entries_lock); - INIT_LIST_HEAD(&bm_data->entries); - rwlock_init(&bm_data->entries_lock); +#ifdef CONFIG_VE + ve->binfmt_misc = bm_data; + /* this will be cleared by ve_destroy() */ +#endif + } err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); - if (err) { - kfree(bm_data); + if (err) return err; - } sb->s_op = &s_ops; - - ve->binfmt_misc = bm_data; bm_data->enabled = 1; return 0; @@ -909,6 +902,7 @@ static struct file_system_type bm_fs_type = { }; MODULE_ALIAS_FS("binfmt_misc"); +#ifdef CONFIG_VE static void ve_binfmt_fini(void *data) { struct ve_struct *ve = data; @@ -918,8 +912,17 @@ static void ve_binfmt_fini(void *data) return; /* - * XXX: Note we don't take any locks here. This is safe as long as - * nobody uses binfmt_misc outside the owner ve. + * This is called when VE is being destructed, no more processes are + * in VE and thus use of bm_data is unexpected. + * + * Still, there is a possibility for a race, if a host process + * explicitly enters VE's mount namespace and accesses files on + * binfmt_misc mount, while VE is being destructed. + * + * This is extremely unlikely so ignore it for now. + * + * To fix, need to move this to ve_destroy() path that is executed when + * no more references to VE are left. */ while (!list_empty(&bm_data->entries)) kill_node(bm_data, list_first_entry( @@ -931,6 +934,9 @@ static struct ve_hook ve_binfmt_hook = { .priority = HOOK_PRIO_DEFAULT, .owner = THIS_MODULE, }; +#else +#define ve_binfmt_hook 0 +#endif static int __init init_misc_binfmt(void) { _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel