Jann Horn <ja...@google.com> writes: > This allows the admin of a user namespace to mark the namespace as > transparent. All other namespaces, by default, are opaque. > > While the current behavior of user namespaces is appropriate for use in > containers, there are many programs that only use user namespaces because > doing so enables them to do other things (e.g. unsharing the mount or > network namespace) that require namespaced capabilities. For them, the > inability to see the real UIDs and GIDs of things from inside the user > namespace can be very annoying. > > In a transparent namespace, all UIDs and GIDs that are mapped into its > first opaque ancestor are visible and are not remapped. This means that if > a process e.g. stat()s the real root directory in a namespace, it will > still see it as owned by UID 0. > > Traditionally, any UID or GID that was visible in a user namespace was also > mapped into the namespace, giving the namespace admin full access to it. > This patch introduces a distinction: In a transparent namespace, UIDs and > GIDs can be visible without being mapped. Non-mapped, visible UIDs can be > passed from the kernel to userspace, but userspace can't send them back to > the kernel. In order to be able to fully use specific UIDs/GIDs and gain > privileges over them, mappings need to be set up in the usual way - > however, to avoid aliasing problems, only identity mappings are permitted. > > I have gone through all callers of from_kuid() and from_kgid(), and as far > as I can tell, kuid_has_mapping() and kgid_has_mapping() were the only > functions that used them for privilege checks. (The keys subsystem uses > them in an insecure way, and that issue has been known for a while, but my > patch doesn't make that any more vulnerable than it already is.
Perhaps it has been known for a while but no one has stopped and mentioned it to me. What questionable thing is the keys subsystem doing? >) This is a bigish change in semantics and I am going to have to digest this before I can give this an ok. Quite frankly at the base it scares me. If this is just about presentation and allowing some information from the parent user namespace I would be much happier if it was not from_kuid but that you modified, but if you instead you had a function say from_kuid_transparent, that performed the transformation you need and was only used in those places it is safe. I think I could reason about that. As your patchset sits I can not reason about the change in semantics, because without a large grep of the source I don't know what you are changing. And you are dramatically changing the semantics the semantics of from_kuid to the point I do believe we need to inspepect all of the call sites. As such I really don't think it makes sense to reuse the existing name for your new semantics. Eric > Signed-off-by: Jann Horn <ja...@google.com> > --- > fs/proc/base.c | 28 ++++++-- > include/linux/uidgid.h | 16 ++++- > include/linux/user_namespace.h | 4 ++ > kernel/user.c | 1 + > kernel/user_namespace.c | 152 > +++++++++++++++++++++++++++++++++++++++-- > 5 files changed, 191 insertions(+), 10 deletions(-) > > diff --git a/fs/proc/base.c b/fs/proc/base.c > index a11eb71..c521c51 100644 > --- a/fs/proc/base.c > +++ b/fs/proc/base.c > @@ -2744,7 +2744,8 @@ static const struct file_operations > proc_projid_map_operations = { > .release = proc_id_map_release, > }; > > -static int proc_setgroups_open(struct inode *inode, struct file *file) > +static int proc_nsadmin_open(struct inode *inode, struct file *file, > + int (*show)(struct seq_file *, void *)) > { > struct user_namespace *ns = NULL; > struct task_struct *task; > @@ -2767,7 +2768,7 @@ static int proc_setgroups_open(struct inode *inode, > struct file *file) > goto err_put_ns; > } > > - ret = single_open(file, &proc_setgroups_show, ns); > + ret = single_open(file, show, ns); > if (ret) > goto err_put_ns; > > @@ -2778,7 +2779,7 @@ err: > return ret; > } > > -static int proc_setgroups_release(struct inode *inode, struct file *file) > +static int proc_nsadmin_release(struct inode *inode, struct file *file) > { > struct seq_file *seq = file->private_data; > struct user_namespace *ns = seq->private; > @@ -2787,12 +2788,30 @@ static int proc_setgroups_release(struct inode > *inode, struct file *file) > return ret; > } > > +static int proc_setgroups_open(struct inode *inode, struct file *file) > +{ > + return proc_nsadmin_open(inode, file, &proc_setgroups_show); > +} > + > static const struct file_operations proc_setgroups_operations = { > .open = proc_setgroups_open, > .write = proc_setgroups_write, > .read = seq_read, > .llseek = seq_lseek, > - .release = proc_setgroups_release, > + .release = proc_nsadmin_release, > +}; > + > +static int proc_transparent_open(struct inode *inode, struct file *file) > +{ > + return proc_nsadmin_open(inode, file, &proc_transparent_show); > +} > + > +static const struct file_operations proc_transparent_operations = { > + .open = proc_transparent_open, > + .write = proc_transparent_write, > + .read = seq_read, > + .llseek = seq_lseek, > + .release = proc_nsadmin_release, > }; > #endif /* CONFIG_USER_NS */ > > @@ -2901,6 +2920,7 @@ static const struct pid_entry tgid_base_stuff[] = { > REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), > REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), > REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), > + REG("transparent", S_IRUGO|S_IWUSR, proc_transparent_operations), > #endif > #ifdef CONFIG_CHECKPOINT_RESTORE > REG("timers", S_IRUGO, proc_timers_operations), > diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h > index 0383552..2908d40 100644 > --- a/include/linux/uidgid.h > +++ b/include/linux/uidgid.h > @@ -124,17 +124,19 @@ extern kgid_t make_kgid(struct user_namespace *from, > gid_t gid); > > extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); > extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); > +extern uid_t from_kuid_opaque(struct user_namespace *to, kuid_t uid); > +extern gid_t from_kgid_opaque(struct user_namespace *to, kgid_t gid); > extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); > extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); > > static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) > { > - return from_kuid(ns, uid) != (uid_t) -1; > + return from_kuid_opaque(ns, uid) != (uid_t) -1; > } > > static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) > { > - return from_kgid(ns, gid) != (gid_t) -1; > + return from_kgid_opaque(ns, gid) != (gid_t) -1; > } > > #else > @@ -159,6 +161,16 @@ static inline gid_t from_kgid(struct user_namespace *to, > kgid_t kgid) > return __kgid_val(kgid); > } > > +static inline uid_t from_kuid_opaque(struct user_namespace *to, kuid_t kuid) > +{ > + return __kuid_val(kuid); > +} > + > +static inline gid_t from_kgid_opaque(struct user_namespace *to, kgid_t kgid) > +{ > + return __kgid_val(kgid); > +} > + > static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) > { > uid_t uid = from_kuid(to, kuid); > diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h > index 8297e5b..18291ac 100644 > --- a/include/linux/user_namespace.h > +++ b/include/linux/user_namespace.h > @@ -28,6 +28,8 @@ struct user_namespace { > struct uid_gid_map projid_map; > atomic_t count; > struct user_namespace *parent; > + /* self for normal ns; first opaque parent for transparent ns */ > + struct user_namespace *opaque; > int level; > kuid_t owner; > kgid_t group; > @@ -71,6 +73,8 @@ extern ssize_t proc_gid_map_write(struct file *, const char > __user *, size_t, lo > extern ssize_t proc_projid_map_write(struct file *, const char __user *, > size_t, loff_t *); > extern ssize_t proc_setgroups_write(struct file *, const char __user *, > size_t, loff_t *); > extern int proc_setgroups_show(struct seq_file *m, void *v); > +extern ssize_t proc_transparent_write(struct file *, const char __user *, > size_t, loff_t *); > +extern int proc_transparent_show(struct seq_file *m, void *v); > extern bool userns_may_setgroups(const struct user_namespace *ns); > #else > > diff --git a/kernel/user.c b/kernel/user.c > index b069ccb..e1fd9e5 100644 > --- a/kernel/user.c > +++ b/kernel/user.c > @@ -48,6 +48,7 @@ struct user_namespace init_user_ns = { > }, > }, > .count = ATOMIC_INIT(3), > + .opaque = &init_user_ns, > .owner = GLOBAL_ROOT_UID, > .group = GLOBAL_ROOT_GID, > .ns.inum = PROC_USER_INIT_INO, > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c > index 9bafc21..da329a1 100644 > --- a/kernel/user_namespace.c > +++ b/kernel/user_namespace.c > @@ -98,6 +98,7 @@ int create_user_ns(struct cred *new) > atomic_set(&ns->count, 1); > /* Leave the new->user_ns reference with the new user namespace. */ > ns->parent = parent_ns; > + ns->opaque = ns; > ns->level = parent_ns->level + 1; > ns->owner = owner; > ns->group = group; > @@ -251,18 +252,46 @@ EXPORT_SYMBOL(make_kuid); > * Map @kuid into the user-namespace specified by @targ and > * return the resulting uid. > * > + * This function is *not* appropriate for security checks because > + * if @targ is transparent, the mappings of an ancestor namespace > + * are used. If @targ isn't &init_user_ns and you intend to do > + * anything with the result apart from returning it to a process > + * in @targ, you might want to use from_kuid_opaque() instead. > + * > * There is always a mapping into the initial user_namespace. > * > - * If @kuid has no mapping in @targ (uid_t)-1 is returned. > + * If @kuid is not visible in @targ (uid_t)-1 is returned. > */ > uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) > { > /* Map the uid from a global kernel uid */ > - return map_id_up(&targ->uid_map, __kuid_val(kuid)); > + struct user_namespace *opaque = READ_ONCE(targ->opaque); > + > + return map_id_up(&opaque->uid_map, __kuid_val(kuid)); > } > EXPORT_SYMBOL(from_kuid); > > /** > + * from_kuid_opaque - Create a uid from a kuid user-namespace pair. > + * @targ: The user namespace we want a uid in. > + * @kuid: The kernel internal uid to start with. > + * > + * Map @kuid into the user-namespace specified by @targ and > + * return the resulting uid. This ignores transparent user > + * namespaces and is therefore appropriate for security checks. > + * > + * There is always a mapping into the initial user_namespace. > + * > + * If @kuid has no mapping in @targ (uid_t)-1 is returned. > + */ > +uid_t from_kuid_opaque(struct user_namespace *targ, kuid_t kuid) > +{ > + /* Map the uid from a global kernel uid */ > + return map_id_up(&targ->uid_map, __kuid_val(kuid)); > +} > +EXPORT_SYMBOL(from_kuid_opaque); > + > +/** > * from_kuid_munged - Create a uid from a kuid user-namespace pair. > * @targ: The user namespace we want a uid in. > * @kuid: The kernel internal uid to start with. > @@ -319,18 +348,46 @@ EXPORT_SYMBOL(make_kgid); > * Map @kgid into the user-namespace specified by @targ and > * return the resulting gid. > * > + * This function is *not* appropriate for security checks because > + * if @targ is transparent, the mappings of an ancestor namespace > + * are used. If @targ isn't &init_user_ns and you intend to do > + * anything with the result apart from returning it to a process > + * in @targ, you might want to use from_kgid_opaque() instead. > + * > * There is always a mapping into the initial user_namespace. > * > - * If @kgid has no mapping in @targ (gid_t)-1 is returned. > + * If @kgid is not visible in @targ (gid_t)-1 is returned. > */ > gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) > { > /* Map the gid from a global kernel gid */ > - return map_id_up(&targ->gid_map, __kgid_val(kgid)); > + struct user_namespace *opaque = READ_ONCE(targ->opaque); > + > + return map_id_up(&opaque->gid_map, __kgid_val(kgid)); > } > EXPORT_SYMBOL(from_kgid); > > /** > + * from_kgid_opaque - Create a gid from a kgid user-namespace pair. > + * @targ: The user namespace we want a gid in. > + * @kgid: The kernel internal gid to start with. > + * > + * Map @kgid into the user-namespace specified by @targ and > + * return the resulting gid. This ignores transparent user > + * namespaces and is therefore appropriate for security checks. > + * > + * There is always a mapping into the initial user_namespace. > + * > + * If @kgid has no mapping in @targ (gid_t)-1 is returned. > + */ > +gid_t from_kgid_opaque(struct user_namespace *targ, kgid_t kgid) > +{ > + /* Map the gid from a global kernel gid */ > + return map_id_up(&targ->gid_map, __kgid_val(kgid)); > +} > +EXPORT_SYMBOL(from_kgid_opaque); > + > +/** > * from_kgid_munged - Create a gid from a kgid user-namespace pair. > * @targ: The user namespace we want a gid in. > * @kgid: The kernel internal gid to start with. > @@ -811,6 +868,18 @@ static bool new_idmap_permitted(const struct file *file, > struct uid_gid_map *new_map) > { > const struct cred *cred = file->f_cred; > + unsigned int idx; > + > + /* Don't allow non-identity mappings in transparent namespaces. */ > + if (ns != ns->opaque) { > + for (idx = 0; idx < new_map->nr_extents; idx++) { > + struct uid_gid_extent *ext = &new_map->extent[idx]; > + > + if (ext->first != ext->lower_first) > + return false; > + } > + } > + > /* Don't allow mappings that would allow anything that wouldn't > * be allowed without the establishment of unprivileged mappings. > */ > @@ -922,6 +991,81 @@ out_unlock: > goto out; > } > > +int proc_transparent_show(struct seq_file *seq, void *v) > +{ > + struct user_namespace *ns = seq->private; > + struct user_namespace *opaque = READ_ONCE(ns->opaque); > + > + seq_printf(seq, "%d\n", (ns == opaque) ? 0 : 1); > + return 0; > +} > + > +ssize_t proc_transparent_write(struct file *file, const char __user *buf, > + size_t count, loff_t *ppos) > +{ > + struct seq_file *seq = file->private_data; > + struct user_namespace *ns = seq->private; > + char kbuf[8], *pos; > + bool transparent; > + ssize_t ret; > + > + /* Only allow a very narrow range of strings to be written */ > + ret = -EINVAL; > + if ((*ppos != 0) || (count >= sizeof(kbuf))) > + goto out; > + > + /* What was written? */ > + ret = -EFAULT; > + if (copy_from_user(kbuf, buf, count)) > + goto out; > + kbuf[count] = '\0'; > + pos = kbuf; > + > + /* What is being requested? */ > + ret = -EINVAL; > + if (pos[0] == '1') { > + pos += 1; > + transparent = true; > + } else if (pos[0] == '0') { > + pos += 1; > + transparent = false; > + } else > + goto out; > + > + /* Verify there is not trailing junk on the line */ > + pos = skip_spaces(pos); > + if (*pos != '\0') > + goto out; > + > + ret = -EPERM; > + mutex_lock(&userns_state_mutex); > + /* Is the requested state different from the current one? */ > + if (transparent != (ns->opaque != ns)) { > + /* You can't turn off transparent mode. */ > + if (!transparent) > + goto out_unlock; > + /* If there are existing mappings, they might be > + * non-identity mappings. Therefore, block transparent > + * mode. This also prevents making the init namespace > + * transparent (which wouldn't work). > + */ > + if (ns->uid_map.nr_extents != 0 || ns->gid_map.nr_extents != 0) > + goto out_unlock; > + /* Okay! Make the namespace transparent. */ > + ns->opaque = ns->parent->opaque; > + } > + mutex_unlock(&userns_state_mutex); > + > + /* Report a successful write */ > + *ppos = count; > + ret = count; > +out: > + return ret; > +out_unlock: > + mutex_unlock(&userns_state_mutex); > + goto out; > +} > + > bool userns_may_setgroups(const struct user_namespace *ns) > { > bool allowed;