On Wed, Sep 10, 2025 at 07:21:22PM +0200, Amir Goldstein wrote: > On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brau...@kernel.org> wrote: > > > > A while ago we added support for file handles to pidfs so pidfds can be > > encoded and decoded as file handles. Userspace has adopted this quickly > > and it's proven very useful. > > > Pidfd file handles are exhaustive meaning > > they don't require a handle on another pidfd to pass to > > open_by_handle_at() so it can derive the filesystem to decode in. > > > > Implement the exhaustive file handles for namespaces as well. > > I think you decide to split the "exhaustive" part to another patch, > so better drop this paragraph?
Yes, good point. I've dont that. > I am missing an explanation about the permissions for > opening these file handles. > > My understanding of the code is that the opener needs to meet one of > the conditions: > 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace > 2. current task is in the opened namespace Yes. > > But I do not fully understand the rationale behind the 2nd condition, > that is, when is it useful? A caller is always able to open a file descriptor to it's own set of namespaces. File handles will behave the same way. > And as far as I can tell, your selftest does not cover this condition > (only both true or both false)? I've added this now. > > I suggest to start with allowing only the useful and important > cases, so if cond #1 is useful enough, drop cond #2 and we can add > it later if needed and then your selftests already cover cond #1 true and > false. > > > > > Signed-off-by: Christian Brauner <brau...@kernel.org> > > After documenting the permissions, with ot without dropping cond #2 > feel free to add: > > Reviewed-by: Amir Goldstein <amir7...@gmail.com> Thanks! > > > --- > > fs/nsfs.c | 176 > > +++++++++++++++++++++++++++++++++++++++++++++++ > > include/linux/exportfs.h | 6 ++ > > 2 files changed, 182 insertions(+) > > > > diff --git a/fs/nsfs.c b/fs/nsfs.c > > index 6f8008177133..a1585a2f4f03 100644 > > --- a/fs/nsfs.c > > +++ b/fs/nsfs.c > > @@ -13,6 +13,12 @@ > > #include <linux/nsfs.h> > > #include <linux/uaccess.h> > > #include <linux/mnt_namespace.h> > > +#include <linux/ipc_namespace.h> > > +#include <linux/time_namespace.h> > > +#include <linux/utsname.h> > > +#include <linux/exportfs.h> > > +#include <linux/nstree.h> > > +#include <net/net_namespace.h> > > > > #include "mount.h" > > #include "internal.h" > > @@ -417,12 +423,182 @@ static const struct stashed_operations > > nsfs_stashed_ops = { > > .put_data = nsfs_put_data, > > }; > > > > +struct nsfs_fid { > > + u64 ns_id; > > + u32 ns_type; > > + u32 ns_inum; > > +} __attribute__ ((packed)); > > + > > +#define NSFS_FID_SIZE (sizeof(struct nsfs_fid) / sizeof(u32)) > > + > > +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, > > + struct inode *parent) > > +{ > > + struct nsfs_fid *fid = (struct nsfs_fid *)fh; > > + struct ns_common *ns = inode->i_private; > > + int len = *max_len; > > + > > + /* > > + * TODO: > > + * For hierarchical namespaces we should start to encode the > > + * parent namespace. Then userspace can walk a namespace > > + * hierarchy purely based on file handles. > > + */ > > + if (parent) > > + return FILEID_INVALID; > > + > > + if (len < NSFS_FID_SIZE) { > > + *max_len = NSFS_FID_SIZE; > > + return FILEID_INVALID; > > + } > > + > > + len = NSFS_FID_SIZE; > > + > > + fid->ns_id = ns->ns_id; > > + fid->ns_type = ns->ops->type; > > + fid->ns_inum = inode->i_ino; > > + *max_len = len; > > + return FILEID_NSFS; > > +} > > + > > +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid > > *fh, > > + int fh_len, int fh_type) > > +{ > > + struct path path __free(path_put) = {}; > > + struct nsfs_fid *fid = (struct nsfs_fid *)fh; > > + struct user_namespace *owning_ns = NULL; > > + struct ns_common *ns; > > + int ret; > > + > > + if (fh_len < NSFS_FID_SIZE) > > + return NULL; > > + > > + switch (fh_type) { > > + case FILEID_NSFS: > > + break; > > + default: > > + return NULL; > > + } > > + > > + scoped_guard(rcu) { > > + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); > > + if (!ns) > > + return NULL; > > + > > + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); > > + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); > > + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); > > + > > + if (!refcount_inc_not_zero(&ns->count)) > > + return NULL; > > + } > > + > > + switch (ns->ops->type) { > > +#ifdef CONFIG_CGROUPS > > + case CLONE_NEWCGROUP: > > + if (!current_in_namespace(to_cg_ns(ns))) > > + owning_ns = to_cg_ns(ns)->user_ns; > > + break; > > +#endif > > +#ifdef CONFIG_IPC_NS > > + case CLONE_NEWIPC: > > + if (!current_in_namespace(to_ipc_ns(ns))) > > + owning_ns = to_ipc_ns(ns)->user_ns; > > + break; > > +#endif > > + case CLONE_NEWNS: > > + if (!current_in_namespace(to_mnt_ns(ns))) > > + owning_ns = to_mnt_ns(ns)->user_ns; > > + break; > > +#ifdef CONFIG_NET_NS > > + case CLONE_NEWNET: > > + if (!current_in_namespace(to_net_ns(ns))) > > + owning_ns = to_net_ns(ns)->user_ns; > > + break; > > +#endif > > +#ifdef CONFIG_PID_NS > > + case CLONE_NEWPID: > > + if (!current_in_namespace(to_pid_ns(ns))) { > > + owning_ns = to_pid_ns(ns)->user_ns; > > + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { > > + ns->ops->put(ns); > > + return ERR_PTR(-EPERM); > > + } > > + break; > > +#endif > > +#ifdef CONFIG_TIME_NS > > + case CLONE_NEWTIME: > > + if (!current_in_namespace(to_time_ns(ns))) > > + owning_ns = to_time_ns(ns)->user_ns; > > + break; > > +#endif > > +#ifdef CONFIG_USER_NS > > + case CLONE_NEWUSER: > > + if (!current_in_namespace(to_user_ns(ns))) > > + owning_ns = to_user_ns(ns); > > + break; > > +#endif > > +#ifdef CONFIG_UTS_NS > > + case CLONE_NEWUTS: > > + if (!current_in_namespace(to_uts_ns(ns))) > > + owning_ns = to_uts_ns(ns)->user_ns; > > + break; > > +#endif > > + default: > > + return ERR_PTR(-EOPNOTSUPP); > > + } > > + > > + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { > > + ns->ops->put(ns); > > + return ERR_PTR(-EPERM); > > + } > > + > > + /* path_from_stashed() unconditionally consumes the reference. */ > > + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); > > + if (ret) > > + return ERR_PTR(ret); > > + > > + return no_free_ptr(path.dentry); > > +} > > + > > +/* > > + * Make sure that we reject any nonsensical flags that users pass via > > + * open_by_handle_at(). > > + */ > > +#define VALID_FILE_HANDLE_OPEN_FLAGS \ > > + (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) > > + > > +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, > > + unsigned int oflags) > > +{ > > + if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) > > + return -EINVAL; > > + > > + /* nsfs_fh_to_dentry() is performs further permission checks. */ > > + return 0; > > +} > > + > > +static struct file *nsfs_export_open(struct path *path, unsigned int > > oflags) > > +{ > > + /* Clear O_LARGEFILE as open_by_handle_at() forces it. */ > > + oflags &= ~O_LARGEFILE; > > + return file_open_root(path, "", oflags, 0); > > +} > > + > > +static const struct export_operations nsfs_export_operations = { > > + .encode_fh = nsfs_encode_fh, > > + .fh_to_dentry = nsfs_fh_to_dentry, > > + .open = nsfs_export_open, > > + .permission = nsfs_export_permission, > > +}; > > + > > static int nsfs_init_fs_context(struct fs_context *fc) > > { > > struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); > > if (!ctx) > > return -ENOMEM; > > ctx->ops = &nsfs_ops; > > + ctx->eops = &nsfs_export_operations; > > ctx->dops = &ns_dentry_operations; > > fc->s_fs_info = (void *)&nsfs_stashed_ops; > > return 0; > > diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h > > index cfb0dd1ea49c..3aac58a520c7 100644 > > --- a/include/linux/exportfs.h > > +++ b/include/linux/exportfs.h > > @@ -122,6 +122,12 @@ enum fid_type { > > FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, > > FILEID_BCACHEFS_WITH_PARENT = 0xb2, > > > > + /* > > + * > > + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode > > number. > > + */ > > + FILEID_NSFS = 0xf1, > > + > > /* > > * 64 bit unique kernfs id > > */ > > > > -- > > 2.47.3 > >