There a quite some use cases where users run into the current limit for {g,u}id mappings. Consider a user requesting us to map everything but 999, and 1001 for a given range of 1000000000 with a sub{g,u}id layout of:
some-user:100000:1000000000 some-user:999:1 some-user:1000:1 some-user:1001:1 some-user:1002:1 This translates to: MAPPING-TYPE | CONTAINER | HOST | RANGE | -------------|-----------|---------|-----------| uid | 999 | 999 | 1 | uid | 1001 | 1001 | 1 | uid | 0 | 1000000 | 999 | uid | 1000 | 1001000 | 1 | uid | 1002 | 1001002 | 999998998 | ------------------------------------------------ gid | 999 | 999 | 1 | gid | 1001 | 1001 | 1 | gid | 0 | 1000000 | 999 | gid | 1000 | 1001000 | 1 | gid | 1002 | 1001002 | 999998998 | which is already the current limit. As discussed at LPC simply bumping the number of limits is not going to work since this would mean that struct uid_gid_map won't fit into a single cache-line anymore thereby regressing performance for the base-cases. The same problem seems to arise when using a single pointer. So the idea is to use struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; For the base cases we will only use the struct uid_gid_extent extent member. If we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single kalloc() of 4k which means we can have a maximum of 340 mappings (340 * size(struct uid_gid_extent) = 4080). For the latter case we use two pointer *forward, *reverse. The forward pointer points to an array sorted by "first" and reverse points to an array sorted by "lower_first". We can then perform binary search on those arrays. Performance Testing: When Eric introduced the extent-based struct uid_gid_map approach he measured the performanc impact of his idmap changes: > My benchmark consisted of going to single user mode where nothing else was > running. On an ext4 filesystem opening 1,000,000 files and looping through all > of the files 1000 times and calling fstat on the individuals files. This was > to ensure I was benchmarking stat times where the inodes were in the kernels > cache, but the inode values were not in the processors cache. My results: > v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support > disabled) > v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user > namespace support disabled) > v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user > namespace support enabled) I used an identical approach on my laptop. Here's a thorough description of what I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patch applied. I booted into single user mode and used an ext4 filesystem to open/create 1,000,000 files. Then I looped through all of the files calling fstat() on each of them 1000 times and calculated the mean fstat() time for a single file. (The test program can be found below.) Here are the results. For fun, I compared the first version of my patch which scaled linearly with the new version of the patch: | # MAPPINGS | PATCH-V1 | PATCH-V2 | |--------------|------------|----------| | 0 mappings | 158 ns | 158 ns | | 1 mappings | 164 ns | 157 ns | | 2 mappings | 170 ns | 158 ns | | 3 mappings | 175 ns | 161 ns | | 5 mappings | 187 ns | 165 ns | | 10 mappings | 218 ns | 199 ns | | 50 mappings | 528 ns | 218 ns | | 100 mappings | 980 ns | 229 ns | | 200 mappings | 1880 ns | 239 ns | | 300 mappings | 2760 ns | 240 ns | | 340 mappings | not tested | 248 ns | Here's the test program I used. I asked Eric what he did and this is a more "advanced" implementation of the idea. It's pretty straight-forward: int main(int argc, char *argv[]) { int ret; size_t i, k; int fd[1000000]; int times[1000]; char pathname[4096]; struct stat st; struct timeval t1, t2; uint64_t time_in_mcs; uint64_t sum = 0; if (argc != 2) { fprintf(stderr, "Please specify a directory where to create " "the test files\n"); exit(EXIT_FAILURE); } for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { sprintf(pathname, "%s/idmap_test_%zu", argv[1], i); fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH); if (fd[i] < 0) { printf("asdf: %s\n", strerror(errno)); ssize_t j; for (j = i; j >= 0; j--) close(fd[j]); exit(EXIT_FAILURE); } } for (k = 0; k < 1000; k++) { ret = gettimeofday(&t1, NULL); if (ret < 0) goto close_all; for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { ret = fstat(fd[i], &st); if (ret < 0) goto close_all; } ret = gettimeofday(&t2, NULL); if (ret < 0) goto close_all; time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) - (1000000 * t1.tv_sec + t1.tv_usec); printf("Total time in micro seconds: %" PRIu64 "\n", time_in_mcs); printf("Total time in nanoseconds: %" PRIu64 "\n", time_in_mcs * 1000); printf("Time per file in nanoseconds: %" PRIu64 "\n", (time_in_mcs * 1000) / 1000000); times[k] = (time_in_mcs * 1000) / 1000000; } close_all: for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) close(fd[i]); if (ret < 0) exit(EXIT_FAILURE); for (k = 0; k < 1000; k++) { sum += times[k]; } printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000); exit(EXIT_SUCCESS);; } Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> CC: Eric Biederman Hallyn <se...@hallyn.com> CC: Serge Hallyn <ebied...@xmission.com> Signed-off-by: Christian Brauner <christian.brau...@ubuntu.com> --- Changelog 2017-10-16: * Use completely new approach which makes it pointless to go into specific differences. --- include/linux/user_namespace.h | 7 +- kernel/user_namespace.c | 359 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 333 insertions(+), 33 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7c83d7f6289b..1c1046a60fb4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -10,7 +10,8 @@ #include <linux/sysctl.h> #include <linux/err.h> -#define UID_GID_MAP_MAX_EXTENTS 5 +#define UID_GID_MAP_MAX_BASE_EXTENTS 5 +#define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; @@ -18,10 +19,10 @@ struct uid_gid_extent { u32 count; }; -struct uid_gid_map { /* 64 bytes -- 1 cache line */ +struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { - struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS]; + struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..eceb03ae597c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,8 @@ #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> +#include <linux/bsearch.h> +#include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -181,6 +183,22 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + + mutex_lock(&userns_state_mutex); + if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->gid_map.forward); + kfree(ns->gid_map.reverse); + } + if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->uid_map.forward); + kfree(ns->uid_map.reverse); + } + if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->projid_map.forward); + kfree(ns->projid_map.reverse); + } + mutex_unlock(&userns_state_mutex); + retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); @@ -198,7 +216,84 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +/** + * idmap_key struct holds the information necessary to find an idmapping in a + * sorted idmap array. It is passed to cmp_map_id() as first argument. + */ +struct idmap_key { + bool map_up; /* true -> id from kid; false -> kid from id */ + u32 id; /* id to find */ + u32 count; /* == 0 unless used with map_id_range_down() */ +}; + +/** + * cmp_map_id - Function to be passed to bsearch() to find the requested + * idmapping. Expects struct idmap_key to be passed via @k. + */ +static int cmp_map_id(const void *k, const void *e) +{ + u32 first, last, id2; + const struct idmap_key *key = k; + const struct uid_gid_extent *el = e; + + /* handle map_id_range_down() */ + if (key->count) + id2 = key->id + key->count - 1; + else + id2 = key->id; + + /* handle map_id_{down,up}() */ + if (key->map_up) + first = el->lower_first; + else + first = el->first; + + last = first + el->count - 1; + + if (key->id >= first && key->id <= last && + (id2 >= first && id2 <= last)) + return 0; + + if (key->id < first || id2 < first) + return -1; + + return 1; +} + +/** + * map_id_range_down_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = false; + key.count = count; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) { unsigned idx, extents; u32 first, last, id2; @@ -224,7 +319,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_range_down_base(map, id, count); + + return map_id_range_down_max(map, id, count); +} + +/** + * map_id_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_down_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -247,7 +358,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +static u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_down_base(map, id); + + return map_id_range_down_max(map, id, 0); +} + +/** + * map_id_up_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -270,6 +397,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +/** + * map_id_up_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = true; + key.count = 0; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->lower_first) + extent->first; + else + id = (u32) -1; + + return id; +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_up_base(map, id); + + return map_id_up_max(map, id); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in @@ -540,13 +706,15 @@ static int projid_m_show(struct seq_file *seq, void *v) static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { - struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; - if (pos < map->nr_extents) - extent = &map->extent[pos]; + if (pos >= map->nr_extents) + return NULL; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return &map->extent[pos]; - return extent; + return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) @@ -618,7 +786,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map, u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; - prev = &new_map->extent[idx]; + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + prev = &new_map->extent[idx]; + else + prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; @@ -638,6 +809,104 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } +/** + * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. + * Takes care to allocate a 4K block of memory if the number of mappings exceeds + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) +{ + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { + map->extent[map->nr_extents].first = extent->first; + map->extent[map->nr_extents].lower_first = extent->lower_first; + map->extent[map->nr_extents].count = extent->count; + return 0; + } + + if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { + struct uid_gid_extent *forward; + + /* Allocate memory for 340 mappings. */ + forward = kmalloc(sizeof(struct uid_gid_extent) * + UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + if (IS_ERR(forward)) + return PTR_ERR(forward); + + /* Copy over memory. Only set up memory for the forward pointer. + * Defer the memory setup for the reverse pointer. + */ + memcpy(forward, map->extent, + map->nr_extents * sizeof(map->extent[0])); + + map->forward = forward; + map->reverse = NULL; + } + + map->forward[map->nr_extents].first = extent->first; + map->forward[map->nr_extents].lower_first = extent->lower_first; + map->forward[map->nr_extents].count = extent->count; + return 0; +} + +/* cmp function to sort() forward mappings */ +static int cmp_extents_forward(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->first < e2->first) + return -1; + + if (e1->first > e2->first) + return 1; + + return 0; +} + +/* cmp function to sort() reverse mappings */ +static int cmp_extents_reverse(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->lower_first < e2->lower_first) + return -1; + + if (e1->lower_first > e2->lower_first) + return 1; + + return 0; +} + +/** + * sort_idmaps - Sorts an array of idmap entries. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int sort_idmaps(struct uid_gid_map *map) +{ + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return 0; + + /* Sort forward array. */ + sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_forward, NULL); + + /* Only copy the memory from forward we actually need. */ + map->reverse = kmemdup(map->forward, + map->nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL); + if (IS_ERR(map->reverse)) { + map->reverse = NULL; + return PTR_ERR(map->reverse); + } + + /* Sort reverse array. */ + sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_reverse, NULL); + + return 0; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -648,7 +917,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent = NULL; + struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; @@ -673,6 +942,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, */ mutex_lock(&userns_state_mutex); + memset(&new_map, 0, sizeof(struct uid_gid_map)); + ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) @@ -700,9 +971,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Parse the user data */ ret = -EINVAL; pos = kbuf; - new_map.nr_extents = 0; for (; pos; pos = next_line) { - extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); @@ -714,17 +983,17 @@ static ssize_t map_write(struct file *file, const char __user *buf, } pos = skip_spaces(pos); - extent->first = simple_strtoul(pos, &pos, 10); + extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->lower_first = simple_strtoul(pos, &pos, 10); + extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->count = simple_strtoul(pos, &pos, 10); + extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; @@ -734,29 +1003,33 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; /* Verify we have been given valid starting values */ - if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1)) + if ((extent.first == (u32) -1) || + (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ - if ((extent->first + extent->count) <= extent->first) + if ((extent.first + extent.count) <= extent.first) goto out; - if ((extent->lower_first + extent->count) <= - extent->lower_first) + if ((extent.lower_first + extent.count) <= + extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ - if (mappings_overlap(&new_map, extent)) + if (mappings_overlap(&new_map, &extent)) goto out; - new_map.nr_extents++; - - /* Fail if the file contains too many extents */ - if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; + + ret = insert_extent(&new_map, &extent); + if (ret < 0) + goto out; + ret = -EINVAL; + + new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) @@ -767,16 +1040,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + + ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { + struct uid_gid_extent *e; u32 lower_first; - extent = &new_map.extent[idx]; + + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + e = &new_map.extent[idx]; + else + e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, - extent->lower_first, - extent->count); + e->lower_first, + e->count); /* Fail if we can not map the specified extent to * the kernel global id space. @@ -784,18 +1067,34 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (lower_first == (u32) -1) goto out; - extent->lower_first = lower_first; + e->lower_first = lower_first; } /* Install the map */ - memcpy(map->extent, new_map.extent, - new_map.nr_extents*sizeof(new_map.extent[0])); + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent, + new_map.nr_extents * sizeof(new_map.extent[0])); + } else { + map->forward = new_map.forward; + map->reverse = new_map.reverse; + } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: + if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + if (new_map.forward) + kfree(new_map.forward); + map->forward = NULL; + + if (new_map.reverse) + kfree(new_map.reverse); + map->reverse = NULL; + map->nr_extents = 0; + } + mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; -- 2.14.1