Allow a single process to be forked directly into a container using a new
syscall, thereby 'booting' the container:

        pid_t pid = fork_into_container(container_fd);

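This process will be the 'init' process of the container.  It takes its
credentials, namespaces and root directory (which also becomes its working
directory) from the container rather than from its parent.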

Further attempts to fork into the container will be rejected.

Signed-off-by: David Howells <dhowe...@redhat.com>
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 arch/x86/ia32/sys_ia32.c               |    2 -
 include/linux/cred.h                   |    3 +
 include/linux/nsproxy.h                |    7 ++
 include/linux/sched/task.h             |    3 +
 include/linux/syscalls.h               |    1 
 kernel/cred.c                          |   45 +++++++++++++
 kernel/fork.c                          |  110 ++++++++++++++++++++++++++------
 kernel/nsproxy.c                       |   11 +++
 kernel/sys_ni.c                        |    1 
 11 files changed, 157 insertions(+), 28 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3564814a5d21..8666693510f9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -408,3 +408,4 @@
 394    i386    mount_notify            sys_mount_notify                __ia32_sys_mount_notify
 395    i386    sb_notify               sys_sb_notify                   __ia32_sys_sb_notify
 396    i386    container_create        sys_container_create            __ia32_sys_container_create
+397    i386    fork_into_container     sys_fork_into_container         __ia32_sys_fork_into_container
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index aa6cccbe5271..d40d4790fcb2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -353,6 +353,7 @@
 342    common  mount_notify            __x64_sys_mount_notify
 343    common  sb_notify               __x64_sys_sb_notify
 344    common  container_create        __x64_sys_container_create
+345    common  fork_into_container     __x64_sys_fork_into_container
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..080d9e21b697 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,5 +238,5 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
                       unsigned long, tls_val, int __user *, child_tidptr)
 {
        return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
-                       tls_val);
+                       tls_val, NULL);
 }
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4907c9df86b3..357e743d5d4a 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -23,6 +23,7 @@
 
 struct cred;
 struct inode;
+struct container;
 
 /*
  * COW Supplementary groups list
@@ -155,7 +156,7 @@ struct cred {
 
 extern void __put_cred(struct cred *);
 extern void exit_creds(struct task_struct *);
-extern int copy_creds(struct task_struct *, unsigned long);
+extern int copy_creds(struct task_struct *, unsigned long, struct container *);
 extern const struct cred *get_task_cred(struct task_struct *);
 extern struct cred *cred_alloc_blank(void);
 extern struct cred *prepare_creds(void);
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 2ae1b1a4d84d..81838ae24a92 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -11,6 +11,7 @@ struct ipc_namespace;
 struct pid_namespace;
 struct cgroup_namespace;
 struct fs_struct;
+struct container;
 
 /*
  * A structure to contain pointers to all per-process
@@ -63,9 +64,13 @@ extern struct nsproxy init_nsproxy;
  *         * /
  *     task_unlock(task);
  *
+ *  4. Container namespaces are set at container creation and cannot be
+ *     changed.
+ *
  */
 
-int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+                   struct container *dest_container);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 void free_nsproxy(struct nsproxy *ns);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 44c6f15800ff..bdff71b0fb66 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -73,7 +73,8 @@ extern void do_group_exit(int);
 extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
+                    int __user *, unsigned long, struct container *);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index dac42098c2dd..15e5cc704df3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -946,6 +946,7 @@ asmlinkage long sys_sb_notify(int dfd, const char __user *path,
 asmlinkage long sys_container_create(const char __user *name, unsigned int flags,
                                     unsigned long spare3, unsigned long spare4,
                                     unsigned long spare5);
+asmlinkage long sys_fork_into_container(int containerfd);
 
 /*
  * Architecture-specific system calls
diff --git a/kernel/cred.c b/kernel/cred.c
index 21f4a97085b4..f0ee5cec533d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -313,6 +313,63 @@ struct cred *prepare_exec_creds(void)
        return new;
 }
 
+/*
+ * Prepare the credentials for a process being forked into a container.
+ *
+ * The new creds are copied from @dest_container->cred instead of from the
+ * parent.  memcpy() only duplicates pointers, so a reference has to be taken
+ * on every refcounted object the copy points at - keyrings included, just as
+ * prepare_creds() does.  copy_creds() and the final release of the creds put
+ * those references again, so a missing get underflows the object's refcount.
+ *
+ * NOTE(review): this lets the container's init inherit the keyrings held by
+ * the container's creds.  If that is not the intended policy, clear the
+ * keyring pointers here instead of pinning them - confirm with the keyrings
+ * design for containers.
+ *
+ * Returns the new creds, or NULL if allocation or the LSM hook fails.
+ */
+static struct cred *copy_container_creds(struct container *dest_container)
+{
+       struct cred *new;
+
+       validate_process_creds();
+
+       new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+       if (!new)
+               return NULL;
+
+       kdebug("copy_container_creds() alloc %p", new);
+
+       memcpy(new, dest_container->cred, sizeof(struct cred));
+
+       atomic_set(&new->usage, 1);
+       set_cred_subscribers(new, 0);
+       get_group_info(new->group_info);
+       get_uid(new->user);
+       get_user_ns(new->user_ns);
+
+#ifdef CONFIG_KEYS
+       key_get(new->session_keyring);
+       key_get(new->process_keyring);
+       key_get(new->thread_keyring);
+       key_get(new->request_key_auth);
+#endif
+
+#ifdef CONFIG_SECURITY
+       new->security = NULL;
+#endif
+
+       if (security_prepare_creds(new, dest_container->cred, GFP_KERNEL) < 0)
+               goto error;
+       validate_creds(new);
+       return new;
+
+error:
+       abort_creds(new);
+       return NULL;
+}
+
 /*
  * Copy credentials for the new process created by fork()
  *
@@ -322,7 +359,8 @@ struct cred *prepare_exec_creds(void)
  * The new process gets the current process's subjective credentials as its
  * objective and subjective credentials
  */
-int copy_creds(struct task_struct *p, unsigned long clone_flags)
+int copy_creds(struct task_struct *p, unsigned long clone_flags,
+              struct container *dest_container)
 {
        struct cred *new;
        int ret;
@@ -343,7 +381,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
                return 0;
        }
 
-       new = prepare_creds();
+       if (dest_container)
+               new = copy_container_creds(dest_container);
+       else
+               new = prepare_creds();
        if (!new)
                return -ENOMEM;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 009cf7e63894..71401deb4434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1385,9 +1385,33 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
        return retval;
 }
 
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(unsigned long clone_flags, struct task_struct *tsk,
+                  struct container *dest_container)
 {
        struct fs_struct *fs = current->fs;
+
+#ifdef CONFIG_CONTAINERS
+       if (dest_container) {
+               fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+               if (!fs)
+                       return -ENOMEM;
+
+               fs->users = 1;
+               fs->in_exec = 0;
+               spin_lock_init(&fs->lock);
+               seqcount_init(&fs->seq);
+               fs->umask = 0022;
+
+               spin_lock(&dest_container->lock);
+               fs->pwd = fs->root = dest_container->root;
+               path_get(&fs->root);
+               path_get(&fs->pwd);
+               spin_unlock(&dest_container->lock);
+               tsk->fs = fs;
+               return 0;
+       }
+#endif
+
        if (clone_flags & CLONE_FS) {
                /* tsk->fs is already what we want */
                spin_lock(&fs->lock);
@@ -1679,7 +1703,8 @@ static __latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        unsigned long tls,
-                                       int node)
+                                       int node,
+                                       struct container *dest_container)
 {
        int retval;
        struct task_struct *p;
@@ -1783,7 +1808,7 @@ static __latent_entropy struct task_struct *copy_process(
        }
        current->flags &= ~PF_NPROC_EXCEEDED;
 
-       retval = copy_creds(p, clone_flags);
+       retval = copy_creds(p, clone_flags, dest_container);
        if (retval < 0)
                goto bad_fork_free;
 
@@ -1905,7 +1930,7 @@ static __latent_entropy struct task_struct *copy_process(
        retval = copy_files(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_semundo;
-       retval = copy_fs(clone_flags, p);
+       retval = copy_fs(clone_flags, p, dest_container);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
@@ -1917,15 +1942,15 @@ static __latent_entropy struct task_struct *copy_process(
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
-       retval = copy_namespaces(clone_flags, p);
+       retval = copy_container(clone_flags, p, dest_container);
        if (retval)
                goto bad_fork_cleanup_mm;
-       retval = copy_container(clone_flags, p, NULL);
+       retval = copy_namespaces(clone_flags, p, dest_container);
        if (retval)
-               goto bad_fork_cleanup_namespaces;
+               goto bad_fork_cleanup_container;
        retval = copy_io(clone_flags, p);
        if (retval)
-               goto bad_fork_cleanup_container;
+               goto bad_fork_cleanup_namespaces;
        retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
        if (retval)
                goto bad_fork_cleanup_io;
@@ -2124,10 +2149,10 @@ static __latent_entropy struct task_struct *copy_process(
 bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
-bad_fork_cleanup_container:
-       exit_container(p);
 bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
+bad_fork_cleanup_container:
+       exit_container(p);
 bad_fork_cleanup_mm:
        if (p->mm)
                mmput(p->mm);
@@ -2183,7 +2208,7 @@ struct task_struct *fork_idle(int cpu)
 {
        struct task_struct *task;
        task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
-                           cpu_to_node(cpu));
+                           cpu_to_node(cpu), NULL);
        if (!IS_ERR(task)) {
                init_idle_pids(task);
                init_idle(task, cpu);
@@ -2195,15 +2220,16 @@ struct task_struct *fork_idle(int cpu)
 /*
  *  Ok, this is the main fork-routine.
  *
- * It copies the process, and if successful kick-starts
- * it and waits for it to finish using the VM if required.
+ * It copies the process (into dest_container, if given), and if successful
+ * kick-starts it and waits for it to finish using the VM if required.
  */
 long _do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr,
-             unsigned long tls)
+             unsigned long tls,
+             struct container *dest_container)
 {
        struct completion vfork;
        struct pid *pid;
@@ -2229,8 +2255,32 @@ long _do_fork(unsigned long clone_flags,
                        trace = 0;
        }
 
+       if (dest_container) {
+               /* A process spawned into a container doesn't share anything
+                * with the parent other than namespaces.
+                */
+               if (clone_flags & (CLONE_CHILD_CLEARTID |
+                                  CLONE_CHILD_SETTID |
+                                  CLONE_FILES |
+                                  CLONE_FS |
+                                  CLONE_IO |
+                                  CLONE_PARENT |
+                                  CLONE_PARENT_SETTID |
+                                  CLONE_PTRACE |
+                                  CLONE_SETTLS |
+                                  CLONE_SIGHAND |
+                                  CLONE_SYSVSEM |
+                                  CLONE_THREAD))
+                       return -EINVAL;
+
+               /* However, we do have to let kernel threads borrow a VM. */
+               if ((clone_flags & CLONE_VM) && current->mm)
+                       return -EINVAL;
+       }
+       
        p = copy_process(clone_flags, stack_start, stack_size,
-                        child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+                        child_tidptr, NULL, trace, tls, NUMA_NO_NODE,
+                        dest_container);
        add_latent_entropy();
 
        if (IS_ERR(p))
@@ -2279,7 +2329,7 @@ long do_fork(unsigned long clone_flags,
              int __user *child_tidptr)
 {
        return _do_fork(clone_flags, stack_start, stack_size,
-                       parent_tidptr, child_tidptr, 0);
+                       parent_tidptr, child_tidptr, 0, NULL);
 }
 #endif
 
@@ -2289,14 +2339,14 @@ long do_fork(unsigned long clone_flags,
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
        return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-               (unsigned long)arg, NULL, NULL, 0);
+                       (unsigned long)arg, NULL, NULL, 0, NULL);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, NULL);
 #else
        /* can not support in nommu mode */
        return -EINVAL;
@@ -2308,7 +2358,26 @@ SYSCALL_DEFINE0(fork)
 SYSCALL_DEFINE0(vfork)
 {
        return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-                       0, NULL, NULL, 0);
+                       0, NULL, NULL, 0, NULL);
+}
+#endif
+
+#ifdef CONFIG_CONTAINERS
+/* Fork the caller into the container referred to by @containerfd.  Returns
+ * _do_fork()'s result, or -EBADF for a bad fd, -EINVAL for a non-container fd.
+ */
+SYSCALL_DEFINE1(fork_into_container, int, containerfd)
+{
+       struct fd f = fdget(containerfd);
+       long ret = -EINVAL;
+
+       if (!f.file)
+               return -EBADF;
+       if (is_container_file(f.file))
+               ret = _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0,
+                              f.file->private_data);
+       fdput(f);
+       return ret;
 }
 #endif
 
@@ -2336,7 +2405,8 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 unsigned long, tls)
 #endif
 {
-       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls,
+                       NULL);
 }
 #endif
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 4bb5184b3a80..4031075300a4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -136,12 +136,19 @@ struct nsproxy *create_new_namespaces(unsigned long flags,
  * called from clone.  This now handles copy for nsproxy and all
  * namespaces therein.
  */
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+                   struct container *dest_container)
 {
        struct nsproxy *old_ns = tsk->nsproxy;
        struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
        struct nsproxy *new_ns;
 
+       if (dest_container) {
+               get_nsproxy(dest_container->ns);
+               tsk->nsproxy = dest_container->ns;
+               return 0;
+       }
+
        if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                              CLONE_NEWPID | CLONE_NEWNET |
                              CLONE_NEWCGROUP)))) {
@@ -163,7 +170,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
                (CLONE_NEWIPC | CLONE_SYSVSEM)) 
                return -EINVAL;
 
-       new_ns = create_new_namespaces(flags, tsk->nsproxy, user_ns, tsk->fs);
+       new_ns = create_new_namespaces(flags, old_ns, user_ns, tsk->fs);
        if (IS_ERR(new_ns))
                return  PTR_ERR(new_ns);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f0455cbb91cf..a23ad529d548 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -144,6 +144,7 @@ COND_SYSCALL(container_create);
 /* kernel/exit.c */
 
 /* kernel/fork.c */
+COND_SYSCALL(fork_into_container);
 
 /* kernel/futex.c */
 COND_SYSCALL(futex);

Reply via email to