Quoting Christian Seiler (christ...@iwakd.de):
> When you clone a new user_ns, the child cannot write to the fds
> opened by the parent.  Handle this by doing an extra fork.  The
> grandparent hangs around and waits for its child to tell it the
> pid of the grandchild, which will be the one attached to the
> container.  The grandparent then moves the grandchild into the
> right cgroup, then waits for the child who in turn is waiting on
> the grandchild to complete.
> 
> Secondly, when attaching to a new user namespace, your old uid is
> not valid, so you are uid -1.  This patch simply does setgid+setuid
> to 0 if that is the case.  We probably want to be smarter, but
> for now this allows lxc-attach to work.
> 
> Signed-off-by: Christian Seiler <christ...@iwakd.de>

Acked-by: Serge E. Hallyn <serge.hal...@ubuntu.com>

Thanks, Christian, this looks good.

> ---
>  src/lxc/lxc_attach.c |  178 
> ++++++++++++++++++++++++++++++++++++++++++--------
>  1 files changed, 150 insertions(+), 28 deletions(-)
> 
> diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c
> index e1511ef..1f60266 100644
> --- a/src/lxc/lxc_attach.c
> +++ b/src/lxc/lxc_attach.c
> @@ -28,6 +28,7 @@
>  #include <stdlib.h>
>  #include <sys/param.h>
>  #include <sys/types.h>
> +#include <sys/socket.h>
>  #include <sys/wait.h>
>  
>  #include "attach.h"
> @@ -128,9 +129,9 @@ int main(int argc, char *argv[])
>       struct passwd *passwd;
>       struct lxc_proc_context_info *init_ctx;
>       struct lxc_handler *handler;
> -     void *cgroup_data = NULL;
>       uid_t uid;
>       char *curdir;
> +     int cgroup_ipc_sockets[2];
>  
>       ret = lxc_caps_init();
>       if (ret)
> @@ -157,18 +158,6 @@ int main(int argc, char *argv[])
>               return -1;
>       }
>  
> -     if (!elevated_privileges) {
> -             /* we have to do this now since /sys/fs/cgroup may not
> -              * be available inside the container or we may not have
> -              * the required permissions anymore
> -              */
> -             ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data);
> -             if (ret < 0) {
> -                     ERROR("failed to prepare attaching to cgroup");
> -                     return -1;
> -             }
> -     }
> -
>       curdir = getcwd(NULL, 0);
>  
>       /* determine which namespaces the container was created with
> @@ -184,6 +173,106 @@ int main(int argc, char *argv[])
>               }
>       }
>  
> +     /* For the cgroup attaching logic to work in conjunction with pid and 
> user namespaces,
> +      * we need to have the following hierarchy:
> +      *
> +      *     lxc-attach [process executed externally]
> +      *         | socketpair(cgroup_ipc_sockets)
> +      *         | fork()           -> child
> +      *         |                       | setns()
> +      *         |                       | fork()    -> grandchild
> +      *         |                       |                   | initialize
> +      *         |                       |                   | signal parent
> +      *         |                       |<------------------|----+
> +      *         |                       | signal parent     |
> +      *         |<----------------------|-----+             |
> +      *         | add to cgroups        |                   |
> +      *         | signal child -------->|                   |
> +      *         |                       | signal child ---->|
> +      *         | waitpid()             | waitpid()         | exec()
> +      *         |                       |<------------------| exit()
> +      *         |<----------------------| exit()
> +      *         | exit()
> +      *
> +      * The rationale is the following: The first parent is needed because 
> after
> +      * setns() (mount + user namespace) we can't access the cgroup 
> filesystem
> +      * to add the pid to the corresponding cgroup. Therefore, we need to do 
> that
> +      * in a process executed on the host, so that's why we need to fork and 
> wait
> +      * for it to have done some initialization (cgroups may restrict certain
> +      * operations so we have to do that in the end) and use IPC for 
> signaling.
> +      *
> +      * Then in the child process we do the setns(). However, a process is 
> never
> +      * really attached to a pid namespace (never changes its pid, doesn't 
> appear
> +      * in the pid namespace /proc), only child processes of that process are
> +      * truly inside the new pid namespace. That's why we need to fork() 
> again
> +      * after setns() before performing final initializations, then signal 
> our
> +      * parent, which signals the primary process, which does cgroup adding,
> +      * which then signals to the grandchild that it can exec().
> +      */
> +     ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
> +     if (ret < 0) {
> +             SYSERROR("could not set up required IPC mechanism for 
> attaching");
> +             return -1;
> +     }
> +
> +     pid = fork();
> +     if (pid < 0) {
> +             SYSERROR("failed to create first subprocess");
> +             return -1;
> +     }
> +
> +     if (pid) {
> +             int status;
> +             pid_t grandchild;
> +
> +             close(cgroup_ipc_sockets[1]);
> +
> +     gparent_reread:
> +             ret = read(cgroup_ipc_sockets[0], &grandchild, 
> sizeof(grandchild));
> +             if (ret <= 0) {
> +                     if (ret < 0 && (errno == EAGAIN || errno == EINTR))
> +                             goto gparent_reread;
> +                     ERROR("failed to get pid of attached process to add to 
> cgroup");
> +                     return -1;
> +             }
> +
> +             if (!elevated_privileges) {
> +                     ret = lxc_cgroup_attach(my_args.name, grandchild);
> +                     if (ret < 0) {
> +                             ERROR("failed to attach process to cgroup");
> +                             return -1;
> +                     }
> +             }
> +
> +             status = 0;
> +             ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
> +             if (ret <= 0) {
> +                     ERROR("failed to signal child that cgroup logic has 
> finished");
> +                     return -1;
> +             }
> +
> +             close(cgroup_ipc_sockets[0]);
> +
> +     gparent_again:
> +             ret = waitpid(pid, &status, 0);
> +             if (ret < 0) {
> +                     if (errno == EINTR)
> +                             goto gparent_again;
> +                     SYSERROR("failed to wait for process '%d'", pid);
> +                     return -1;
> +             }
> +
> +             if (WIFEXITED(status))
> +                     return WEXITSTATUS(status);
> +
> +             return -1;
> +     }
> +
> +     /* at this point we are in the 'parent' process so we need to close the
> +      * socket reserved for the 'grandparent' process
> +      */
> +     close(cgroup_ipc_sockets[0]);
> +
>       /* we need to attach before we fork since certain namespaces
>        * (such as pid namespaces) only really affect children of the
>        * current process and not the process itself
> @@ -199,7 +288,10 @@ int main(int argc, char *argv[])
>  
>       free(curdir);
>  
> -     /* hack: we need sync.h infrastructure - and that needs a handler */
> +     /* hack: we need sync.h infrastructure - and that needs a handler
> +      * FIXME: perhaps we should also just use a very simple socketpair()
> +      * here? - like with the grandparent <-> parent communication?
> +      */
>       handler = calloc(1, sizeof(*handler));
>  
>       if (lxc_sync_init(handler)) {
> @@ -225,23 +317,40 @@ int main(int argc, char *argv[])
>               if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
>                       return -1;
>  
> -             /* now that we are done with all privileged operations,
> -              * we can add ourselves to the cgroup. Since we smuggled in
> -              * the fds earlier, we still have write permission
> +             /* ask grandparent to add child to cgroups, the grandparent will
> +              * itself check whether that's actually necessary
>                */
> -             if (!elevated_privileges) {
> -                     /* since setns() for pid namespaces only really
> -                      * affects child processes, the pid we have is
> -                      * still valid outside the container, so this is
> -                      * fine
> +             ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
> +             if (ret != sizeof(pid)) {
> +                     ERROR("error using IPC to notify main process of pid to 
> add to the cgroups of the container");
> +                     return -1;
> +             }
> +
> +     parent_reread:
> +             /* we need some mechanism to check whether the grandparent could
> +              * add us to the cgroups or not - so we await a dummy integer
> +              * on the same socket (that's why we don't use a pipe - we need
> +              * two-way communication). So if the grandparent fails and exits, 
> that
> +              * will close the socket, which will cause a read of 0 bytes for
> +              * us, so we just terminate. If we read at least a byte, we 
> don't
> +              * care about the contents...
> +              */
> +             ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
> +             if (ret <= 0) {
> +                     if (ret < 0 && (errno == EAGAIN || errno == EINTR))
> +                             goto parent_reread;
> +                     /* only print something if we can't assume the parent 
> already
> +                      * gave an error message, that will reduce confusion 
> for the
> +                      * user
>                        */
> -                     ret = lxc_cgroup_finish_attach(cgroup_data, pid);
> -                     if (ret < 0) {
> -                             ERROR("failed to attach process to cgroup");
> -                             return -1;
> -                     }
> +                     if (ret != 0)
> +                             ERROR("failed to get notification that the 
> child process was added to the container's cgroups");
> +                     return -1;
>               }
>  
> +             /* we don't need that IPC interface anymore */
> +             close(cgroup_ipc_sockets[1]);
> +
>               /* tell the child we are done initializing */
>               if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
>                       return -1;
> @@ -264,7 +373,7 @@ int main(int argc, char *argv[])
>  
>       if (!pid) {
>               lxc_sync_fini_parent(handler);
> -             lxc_cgroup_dispose_attach(cgroup_data);
> +             close(cgroup_ipc_sockets[1]);
>  
>               if (attach_apparmor(init_ctx->aa_profile) < 0) {
>                       ERROR("failed switching apparmor profiles");
> @@ -307,6 +416,19 @@ int main(int argc, char *argv[])
>  
>               lxc_sync_fini(handler);
>  
> +             if (namespace_flags & CLONE_NEWUSER) {
> +                     /* XXX FIXME this should get the uid of the container 
> init and setuid to that */
> +                     /* XXX FIXME or perhaps try to map in the lxc-attach 
> caller's uid? */
> +                     if (setgid(0)) {
> +                             SYSERROR("switching to container gid");
> +                             return -1;
> +                     }
> +                     if (setuid(0)) {
> +                             SYSERROR("switching to container uid");
> +                             return -1;
> +                     }
> +             }
> +
>               if (my_args.argc) {
>                       execvp(my_args.argv[0], my_args.argv);
>                       SYSERROR("failed to exec '%s'", my_args.argv[0]);
> -- 
> 1.7.8.6
> 
> 
> ------------------------------------------------------------------------------
> Everyone hates slow websites. So do we.
> Make your web apps faster with AppDynamics
> Download AppDynamics Lite for free today:
> http://p.sf.net/sfu/appdyn_d2d_feb
> _______________________________________________
> Lxc-devel mailing list
> Lxc-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lxc-devel

------------------------------------------------------------------------------
Everyone hates slow websites. So do we.
Make your web apps faster with AppDynamics
Download AppDynamics Lite for free today:
http://p.sf.net/sfu/appdyn_d2d_feb
_______________________________________________
Lxc-devel mailing list
Lxc-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/lxc-devel

Reply via email to