Make it possible for fsopen() to create a superblock in a specified
container, using the namespaces associated with that container to cover UID
translation, networking and filesystem content.  This involves adding a new
fsconfig command, FSCONFIG_SET_CONTAINER, to specify the container by fd.  The
command must be issued before any other parameter is supplied; after that it
fails with EBUSY.

For example:

        cfd = container_create("fred", CONTAINER_NEW_FS_NS);

        fsfd = fsopen("ext4", 0);
        fsconfig(fsfd, FSCONFIG_SET_CONTAINER, NULL, NULL, cfd);
        fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda3", 0);
        fsconfig(fsfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
        fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
        mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY);
        move_mount(mfd, "", cfd, "/",
                   MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_CONTAINER_ROOT);

Signed-off-by: David Howells <dhowe...@redhat.com>
---

 fs/fs_context.c            |   19 +++++++++++++++
 fs/fsopen.c                |   54 +++++++++++++++++++++++++++++++++++++-------
 fs/namespace.c             |   19 +++++++++++----
 fs/proc/root.c             |   11 +++++++--
 include/linux/container.h  |    1 +
 include/linux/fs_context.h |    3 ++
 include/linux/pid.h        |    5 +++-
 include/linux/proc_ns.h    |    6 +++--
 include/uapi/linux/mount.h |    1 +
 kernel/container.c         |    4 +++
 kernel/fork.c              |    2 +-
 kernel/pid.c               |    4 ++-
 12 files changed, 108 insertions(+), 21 deletions(-)

diff --git a/fs/fs_context.c b/fs/fs_context.c
index a47ccd5a4a78..fc76ac02d618 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/magic.h>
 #include <linux/security.h>
+#include <linux/container.h>
 #include <linux/mnt_namespace.h>
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
@@ -169,6 +170,21 @@ int vfs_parse_fs_param(struct fs_context *fc, struct 
fs_parameter *param)
 }
 EXPORT_SYMBOL(vfs_parse_fs_param);
 
+/*
+ * Specify a container in which a superblock will exist.
+ *
+ * If @container is non-NULL, the context takes its own references on the
+ * container, on the user namespace of the container's credentials and on the
+ * network namespace of the container's nsproxy, and drops the user and network
+ * namespace references it held before.  A NULL @container leaves the context
+ * untouched.
+ *
+ * This can be called more than once on the same context: userspace may issue
+ * FSCONFIG_SET_CONTAINER repeatedly while the context is still in the
+ * FS_CONTEXT_CREATE_NS phase.  The reference on a previously set container is
+ * therefore dropped as well, otherwise it would be leaked.
+ */
+void vfs_set_container(struct fs_context *fc, struct container *container)
+{
+       if (container) {
+               struct container *old = fc->container;
+
+               put_user_ns(fc->user_ns);
+               put_net(fc->net_ns);
+
+               fc->container = get_container(container);
+               fc->user_ns = get_user_ns(container->cred->user_ns);
+               fc->net_ns = get_net(container->ns->net_ns);
+
+               /*
+                * Release the previous container only after the new one has
+                * been pinned, in case they are the same object.
+                * put_container() accepts NULL.
+                */
+               put_container(old);
+       }
+}
+
 /**
  * vfs_parse_fs_string - Convenience function to just parse a string.
  */
@@ -364,6 +380,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context 
*src_fc)
        fc->source      = NULL;
        fc->security    = NULL;
        get_filesystem(fc->fs_type);
+       if (fc->container)
+               get_container(fc->container);
        get_net(fc->net_ns);
        get_user_ns(fc->user_ns);
        get_cred(fc->cred);
@@ -510,6 +528,7 @@ void put_fs_context(struct fs_context *fc)
        put_net(fc->net_ns);
        put_user_ns(fc->user_ns);
        put_cred(fc->cred);
+       put_container(fc->container);
        kfree(fc->subtype);
        put_fc_log(fc);
        put_filesystem(fc->fs_type);
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 3bb9c0c8cbcc..d0fe9e563ebb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -17,11 +17,33 @@
 #include <linux/security.h>
 #include <linux/anon_inodes.h>
 #include <linux/namei.h>
+#include <linux/container.h>
 #include <linux/file.h>
 #include <uapi/linux/mount.h>
 #include "internal.h"
 #include "mount.h"
 
+/*
+ * Set the destination container of a filesystem context from a container fd.
+ *
+ * fsconfig() looks the fd up from its auxiliary argument and hands the
+ * resulting file over in param->file.  The command is only accepted while the
+ * context is still in the FS_CONTEXT_CREATE_NS phase, i.e. before any other
+ * parameter has been supplied.
+ *
+ * For example:
+ *
+ *     fsconfig(fsfd, FSCONFIG_SET_CONTAINER, NULL, NULL, container_fd);
+ *
+ * Returns 0 on success or -EINVAL if the file does not refer to a container.
+ */
+static int fsconfig_set_container(struct fs_context *fc,
+                                  struct fs_parameter *param)
+{
+       if (is_container_file(param->file)) {
+               vfs_set_container(fc, param->file->private_data);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
 /*
  * Allow the user to read back any error, warning or informational messages.
  */
@@ -111,10 +133,6 @@ static int fscontext_alloc_log(struct fs_context *fc)
 
 /*
  * Open a filesystem by name so that it can be configured for mounting.
- *
- * We are allowed to specify a container in which the filesystem will be
- * opened, thereby indicating which namespaces will be used (notably, which
- * network namespace will be used for network filesystems).
  */
 SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
 {
@@ -143,7 +161,7 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, 
unsigned int, flags)
        if (IS_ERR(fc))
                return PTR_ERR(fc);
 
-       fc->phase = FS_CONTEXT_CREATE_PARAMS;
+       fc->phase = FS_CONTEXT_CREATE_NS;
 
        ret = fscontext_alloc_log(fc);
        if (ret < 0)
@@ -228,7 +246,8 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int 
cmd,
                return ret;
        switch (cmd) {
        case FSCONFIG_CMD_CREATE:
-               if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+               if (fc->phase != FS_CONTEXT_CREATE_NS &&
+                   fc->phase != FS_CONTEXT_CREATE_PARAMS)
                        return -EBUSY;
                fc->phase = FS_CONTEXT_CREATING;
                ret = vfs_get_tree(fc);
@@ -259,9 +278,17 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int 
cmd,
                        break;
                vfs_clean_context(fc);
                return 0;
+
+       case FSCONFIG_SET_CONTAINER:
+               if (fc->phase != FS_CONTEXT_CREATE_NS)
+                       return -EBUSY;
+               return fsconfig_set_container(fc, param);
+
        default:
-               if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
-                   fc->phase != FS_CONTEXT_RECONF_PARAMS)
+               if (fc->phase == FS_CONTEXT_CREATE_NS)
+                       fc->phase = FS_CONTEXT_CREATE_PARAMS;
+               else if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
+                        fc->phase != FS_CONTEXT_RECONF_PARAMS)
                        return -EBUSY;
 
                return vfs_parse_fs_param(fc, param);
@@ -353,6 +380,10 @@ SYSCALL_DEFINE5(fsconfig,
                if (!_key || _value || aux < 0)
                        return -EINVAL;
                break;
+       case FSCONFIG_SET_CONTAINER:
+               if (_key || _value || aux < 0)
+                       return -EINVAL;
+               break;
        case FSCONFIG_CMD_CREATE:
        case FSCONFIG_CMD_RECONFIGURE:
                if (_key || _value || aux)
@@ -438,6 +469,12 @@ SYSCALL_DEFINE5(fsconfig,
                if (!param.file)
                        goto out_key;
                break;
+       case FSCONFIG_SET_CONTAINER:
+               ret = -EBADF;
+               param.file = fget(aux);
+               if (!param.file)
+                       goto out_key;
+               break;
        default:
                break;
        }
@@ -463,6 +500,7 @@ SYSCALL_DEFINE5(fsconfig,
                        putname(param.name);
                break;
        case FSCONFIG_SET_FD:
+       case FSCONFIG_SET_CONTAINER:
                if (param.file)
                        fput(param.file);
                break;
diff --git a/fs/namespace.c b/fs/namespace.c
index ea005f55ec4c..cc5d56f7ae29 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -781,9 +781,16 @@ static void put_mountpoint(struct mountpoint *mp)
        }
 }
 
+/*
+ * Test whether a mount belongs to the given mount namespace.  A NULL @mnt_ns
+ * selects the caller's own mount namespace (current->nsproxy->mnt_ns).
+ * Returns non-zero if the mount is in that namespace, zero otherwise.
+ */
+static inline int __check_mnt(struct mount *mnt, struct mnt_namespace *mnt_ns)
+{
+       struct mnt_namespace *target;
+
+       target = mnt_ns ? mnt_ns : current->nsproxy->mnt_ns;
+       return mnt->mnt_ns == target;
+}
+
 static inline int check_mnt(struct mount *mnt)
 {
-       return mnt->mnt_ns == current->nsproxy->mnt_ns;
+       return __check_mnt(mnt, NULL);
 }
 
 /*
@@ -2696,7 +2703,8 @@ static int do_move_mount_old(struct path *path, const 
char *old_name)
 /*
  * add a mount into a namespace's mount tree
  */
-static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags,
+                       struct mnt_namespace *mnt_ns)
 {
        struct mountpoint *mp;
        struct mount *parent;
@@ -2710,7 +2718,7 @@ static int do_add_mount(struct mount *newmnt, struct path 
*path, int mnt_flags)
 
        parent = real_mount(path->mnt);
        err = -EINVAL;
-       if (unlikely(!check_mnt(parent))) {
+       if (unlikely(!__check_mnt(parent, mnt_ns))) {
                /* that's acceptable only for automounts done in private ns */
                if (!(mnt_flags & MNT_SHRINKABLE))
                        goto unlock;
@@ -2765,7 +2773,8 @@ static int do_new_mount_fc(struct fs_context *fc, struct 
path *mountpoint,
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
 
-       error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+       error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags,
+                            fc->container ? fc->container->ns->mnt_ns : NULL);
        if (error < 0)
                mntput(mnt);
        return error;
@@ -2839,7 +2848,7 @@ int finish_automount(struct vfsmount *m, struct path 
*path)
                goto fail;
        }
 
-       err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+       err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, 
NULL);
        if (!err)
                return 0;
 fail:
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 6927b29ece76..aa802006d855 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
 #include <linux/sched/stat.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
+#include <linux/container.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_context.h>
 #include <linux/mount.h>
@@ -186,8 +187,12 @@ static int proc_init_fs_context(struct fs_context *fc)
        ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;
+       
+       if (fc->container)
+               ctx->pid_ns = get_pid_ns(fc->container->pid_ns);
+       else
+               ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
 
-       ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
        fc->fs_private = ctx;
        fc->ops = &proc_fs_context_ops;
        return 0;
@@ -300,7 +305,7 @@ struct proc_dir_entry proc_root = {
        .name           = "/proc",
 };
 
-int pid_ns_prepare_proc(struct pid_namespace *ns)
+int pid_ns_prepare_proc(struct pid_namespace *ns, struct container *container)
 {
        struct proc_fs_context *ctx;
        struct fs_context *fc;
@@ -315,6 +320,8 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
                fc->user_ns = get_user_ns(ns->user_ns);
        }
 
+       vfs_set_container(fc, container);
+       
        ctx = fc->fs_private;
        if (ctx->pid_ns != ns) {
                put_pid_ns(ctx->pid_ns);
diff --git a/include/linux/container.h b/include/linux/container.h
index 0a8918435097..087aa1885ef7 100644
--- a/include/linux/container.h
+++ b/include/linux/container.h
@@ -37,6 +37,7 @@ struct container {
        struct path             root;           /* The root of the container's 
fs namespace */
        struct task_struct      *init;          /* The 'init' task for this 
container */
        struct container        *parent;        /* Parent of this container. */
+       struct pid_namespace    *pid_ns;        /* The process ID namespace for 
this container */
        void                    *security;      /* LSM data */
        struct list_head        members;        /* Member processes, guarded 
with ->lock */
        struct list_head        child_link;     /* Link in parent->children */
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index dc8c9fcba341..45486080eb84 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -40,6 +40,7 @@ enum fs_context_purpose {
  * Userspace usage phase for fsopen/fspick.
  */
 enum fs_context_phase {
+       FS_CONTEXT_CREATE_NS,           /* Set namespaces for sb creation */
        FS_CONTEXT_CREATE_PARAMS,       /* Loading params for sb creation */
        FS_CONTEXT_CREATING,            /* A superblock is being created */
        FS_CONTEXT_AWAITING_MOUNT,      /* Superblock created, awaiting 
fsmount() */
@@ -93,6 +94,7 @@ struct fs_context {
        struct file_system_type *fs_type;
        void                    *fs_private;    /* The filesystem's context */
        struct dentry           *root;          /* The root and superblock */
+       struct container        *container;     /* The container in which the 
mount will exist */
        struct user_namespace   *user_ns;       /* The user namespace for this 
mount */
        struct net              *net_ns;        /* The network namespace for 
this mount */
        const struct cred       *cred;          /* The mounter's credentials */
@@ -136,6 +138,7 @@ extern int vfs_parse_fs_param(struct fs_context *fc, struct 
fs_parameter *param)
 extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
                               const char *value, size_t v_size);
 extern int generic_parse_monolithic(struct fs_context *fc, void *data);
+extern void vfs_set_container(struct fs_context *fc, struct container 
*container);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..16dc152ceef1 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -73,6 +73,8 @@ static inline struct pid *get_pid(struct pid *pid)
        return pid;
 }
 
+struct container;
+
 extern void put_pid(struct pid *pid);
 extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
 extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
@@ -111,7 +113,8 @@ extern struct pid *find_get_pid(int nr);
 extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
 
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns,
+                            struct container *container);
 extern void free_pid(struct pid *pid);
 extern void disable_pid_allocation(struct pid_namespace *ns);
 
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index d31cb6215905..dee0881eca5c 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -47,14 +47,16 @@ enum {
 
 #ifdef CONFIG_PROC_FS
 
-extern int pid_ns_prepare_proc(struct pid_namespace *ns);
+extern int pid_ns_prepare_proc(struct pid_namespace *ns,
+                              struct container *container);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
 extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
 
 #else /* CONFIG_PROC_FS */
 
-static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
+static inline int pid_ns_prepare_proc(struct pid_namespace *ns, struct 
container *container)
+{ return 0; }
 static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
 
 static inline int proc_alloc_inum(unsigned int *inum)
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 96a0240f23fe..f60bbe6f4099 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -97,6 +97,7 @@ enum fsconfig_command {
        FSCONFIG_SET_FD         = 5,    /* Set parameter, supplying an object 
by fd */
        FSCONFIG_CMD_CREATE     = 6,    /* Invoke superblock creation */
        FSCONFIG_CMD_RECONFIGURE = 7,   /* Invoke superblock reconfiguration */
+       FSCONFIG_SET_CONTAINER  = 8,    /* Set a container, supplied by fd */
 };
 
 /*
diff --git a/kernel/container.c b/kernel/container.c
index 1d2cb1c1e9b1..fd3b2a6849a1 100644
--- a/kernel/container.c
+++ b/kernel/container.c
@@ -30,6 +30,7 @@ struct container init_container = {
        .cred           = &init_cred,
        .ns             = &init_nsproxy,
        .init           = &init_task,
+       .pid_ns         = &init_pid_ns,
        .members.next   = &init_task.container_link,
        .members.prev   = &init_task.container_link,
        .children       = LIST_HEAD_INIT(init_container.children),
@@ -51,6 +52,8 @@ void put_container(struct container *c)
 
        while (c && refcount_dec_and_test(&c->usage)) {
                BUG_ON(!list_empty(&c->members));
+               if (c->pid_ns)
+                       put_pid_ns(c->pid_ns);
                if (c->ns)
                        put_nsproxy(c->ns);
                path_put(&c->root);
@@ -391,6 +394,7 @@ static struct container *create_container(const char __user 
*name, unsigned int
        }
 
        c->ns = ns;
+       c->pid_ns = get_pid_ns(c->ns->pid_ns_for_children);
        c->root = fs->root;
        c->seq = fs->seq;
        fs->root.mnt = NULL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 71401deb4434..09de5f35d312 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1958,7 +1958,7 @@ static __latent_entropy struct task_struct *copy_process(
        stackleak_task_init(p);
 
        if (pid != &init_struct_pid) {
-               pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+               pid = alloc_pid(p->nsproxy->pid_ns_for_children, 
dest_container);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..6528a75e6c0d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -156,7 +156,7 @@ void free_pid(struct pid *pid)
        call_rcu(&pid->rcu, delayed_put_pid);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, struct container *container)
 {
        struct pid *pid;
        enum pid_type type;
@@ -205,7 +205,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
        }
 
        if (unlikely(is_child_reaper(pid))) {
-               if (pid_ns_prepare_proc(ns))
+               if (pid_ns_prepare_proc(ns, container))
                        goto out_free;
        }
 

Reply via email to