From: Kirill Tkhai <ktk...@virtuozzo.com> Patchset description: Shrink big fdtable on criu restore
This patchset avoids the memory overuse introduced by service fds on criu restore. The solution is simple: check the number of the fd being closed, and shrink the fdtable when that is possible. The checks happen only in is_pseudosuper mode, so performance in normal working mode is not affected. The problem is that we can't solve this in 100% of cases in userspace; doing it in the kernel fixes it completely. https://jira.sw.ru/browse/PSBM-78827 Eric Dumazet (1): ms/fs/file.c: don't acquire files->file_lock in fd_install() Kirill Tkhai (3): files: Add new argument to expand_files() files: Add fdtable_align() helper files: Shrink big fdtable on close in is_pseudosuper mode Mateusz Guzik (1): ms/vfs: grab the lock instead of blocking in __fd_install during resizing ============================================================ This patch description: This trick is going to be used for criu restore, to release the excess memory occupied by service files: we check the fd being closed, and if its number is at least half of the fdtable's current max_fds (fd * 2 >= fdt->max_fds), we try to shrink the fdtable and decrease the amount of memory needed to store the task's fds. Currently the is_pseudosuper state is used to detect restore, but this can be changed later if needed. Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com> Reviewed-by: Cyrill Gorcunov <gorcu...@openvz.org> Rebase to vz8: - Used the commit rebased to RH7.9, 4b024fd120c5cfc3775387e2ed2e29d389a42849, which handles the new copy_fd_bitmaps helper function. 
(cherry picked from e4a319f998910317ce1559acebecca365f85d8ba) Signed-off-by: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com> --- fs/file.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/fs/file.c b/fs/file.c index 4f68ef0f..0ed1c4c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -18,6 +18,7 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> +#include <linux/ve.h> unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -47,21 +48,25 @@ static void free_fdtable_rcu(struct rcu_head *rcu) * spinlock held for write. */ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, - unsigned int count) + unsigned int count, bool shrink) { unsigned int cpy, set; - cpy = count / BITS_PER_BYTE; + cpy = min(count, nfdt->max_fds) / BITS_PER_BYTE; set = (nfdt->max_fds - count) / BITS_PER_BYTE; memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)nfdt->open_fds + cpy, 0, set); + if (!shrink) + memset((char *)nfdt->open_fds + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)nfdt->close_on_exec + cpy, 0, set); + if (!shrink) + memset((char *)nfdt->close_on_exec + cpy, 0, set); - cpy = BITBIT_SIZE(count); + cpy = BITBIT_SIZE(min(count, nfdt->max_fds)); set = BITBIT_SIZE(nfdt->max_fds) - cpy; memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); - memset((char *)nfdt->full_fds_bits + cpy, 0, set); + if (!shrink) + memset((char *)nfdt->full_fds_bits + cpy, 0, set); } /* @@ -72,14 +77,15 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt, bool shrink { unsigned int cpy, set; - BUG_ON(nfdt->max_fds < ofdt->max_fds); + BUG_ON((nfdt->max_fds < ofdt->max_fds) != shrink); - cpy = ofdt->max_fds * sizeof(struct file *); + cpy = min(ofdt->max_fds, nfdt->max_fds) * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); 
memcpy(nfdt->fd, ofdt->fd, cpy); - memset((char *)nfdt->fd + cpy, 0, set); + if (!shrink) + memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds, shrink); } static unsigned int fdtable_align(unsigned int nr) @@ -170,16 +176,26 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr, bool shri spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; + cur_fdt = files_fdtable(files); /* * extremely unlikely race - sysctl_nr_open decreased between the check in * caller and alloc_fdtable(). Cheaper to catch it here... */ - if (unlikely(new_fdt->max_fds <= nr)) { + if (unlikely((new_fdt->max_fds <= nr && !shrink) || + (shrink && new_fdt->max_fds >= cur_fdt->max_fds))) { __free_fdtable(new_fdt); return -EMFILE; } - cur_fdt = files_fdtable(files); - BUG_ON(nr < cur_fdt->max_fds); + if (unlikely(shrink)) { + int i; + i = find_last_bit(cur_fdt->open_fds, cur_fdt->max_fds); + i = fdtable_align(i); + if (i == cur_fdt->max_fds) { + __free_fdtable(new_fdt); + return 1; + } + } + BUG_ON((nr < cur_fdt->max_fds) != shrink); copy_fdtable(new_fdt, cur_fdt, shrink); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) @@ -208,7 +224,7 @@ static int expand_files(struct files_struct *files, unsigned int nr, bool shrink fdt = files_fdtable(files); /* Do we need to expand? */ - if (nr < fdt->max_fds) + if (nr < fdt->max_fds && !shrink) return expanded; /* Can we expand? 
*/ @@ -223,6 +239,15 @@ static int expand_files(struct files_struct *files, unsigned int nr, bool shrink goto repeat; } + if (unlikely(shrink)) { + unsigned int i; + i = find_last_bit(fdt->open_fds, fdt->max_fds); + nr = i; + i = fdtable_align(i); + if (i >= fdt->max_fds) + return expanded; + } + /* All good, so we try */ files->resize_in_progress = true; expanded = expand_fdtable(files, nr, shrink); @@ -337,7 +362,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) open_files = count_open_files(old_fdt); } - copy_fd_bitmaps(new_fdt, old_fdt, open_files); + copy_fd_bitmaps(new_fdt, old_fdt, open_files, false); old_fds = old_fdt->fd; new_fds = new_fdt->fd; @@ -643,6 +668,14 @@ int __close_fd(struct files_struct *files, unsigned fd) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); + + /* Try to shrink fdt and to free memory */ + if (unlikely(fd * 2 >= fdt->max_fds && + fd > (1024 / sizeof(struct file *))) && + get_exec_env() != get_ve0() && + get_exec_env()->is_pseudosuper) + expand_files(files, fd, true); + spin_unlock(&files->file_lock); return filp_close(file, files); -- 1.8.3.1 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel