From: Vladimir Davydov <vdavy...@virtuozzo.com>

The new file is supposed to be used for migrating pages accounted to a
memory cgroup to a particular set of NUMA nodes. The reason to add it is
that currently there is no API for migrating unmapped file pages used for
storing page cache (neither the migrate_pages syscall nor the cpuset
subsys provides this functionality).

The file is added to the memory cgroup and has the following format:

	NODELIST[ MAX_SCAN]

where NODELIST is a comma-separated list of ranges N1-N2 specifying the
set of nodes to migrate pages of this cgroup to, and the optional
MAX_SCAN imposes a limit on the number of pages that can be migrated in
one go. The call may be interrupted by a signal, in which case -EINTR is
returned.
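For illustration only, a minimal userspace sketch of driving the new file
could look as follows (the cgroup mount point, cgroup name, node list and
MAX_SCAN value are assumptions made up for the example, not part of this
patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Assumed legacy-hierarchy mount point and cgroup name. */
		const char *path =
			"/sys/fs/cgroup/memory/test/memory.numa_migrate";
		/* Ask to move the cgroup's pages to nodes 0-1, scanning at
		 * most 100000 pages in this call. */
		const char *req = "0-1 100000";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* write() fails with errno == EINTR if the migration is
		 * interrupted by a signal. */
		if (write(fd, req, strlen(req)) < 0)
			perror("write");
		close(fd);
		return 0;
	}

Writing just "0-1" without the optional MAX_SCAN field would request
migration of all pages accounted to the cgroup.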
https://jira.sw.ru/browse/PSBM-50875

Signed-off-by: Vladimir Davydov <vdavy...@virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
Cc: Igor Redko <red...@virtuozzo.com>
Cc: Konstantin Neumoin <kneum...@virtuozzo.com>
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>

Cherry-picked from vz8 commit 42bd2e736895 ("mm: memcontrol: add
memory.numa_migrate file").

Followed __alloc_pages_nodemask() -> __alloc_pages() transition.
Followed __update_lru_size() -> update_lru_size() transition.
Followed move of lru_lock from pgdat to lruvec.
Followed __isolate_lru_page() -> __isolate_lru_page_prepare() rework.
For that, re-synced related memcg_numa_isolate_pages() code with origin
from isolate_lru_pages().

Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com>
---
 mm/memcontrol.c | 268 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7cb3e3e8d9fc..1fc6a0c78c4e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -64,6 +64,7 @@
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include <linux/virtinfo.h>
+#include <linux/migrate.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -4114,6 +4115,268 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+/*
+ * memcg_numa_migrate_new_page() private argument. @target_nodes specifies the
+ * set of nodes to allocate pages from. @current_node is the current preferable
+ * node, it gets rotated after each allocation.
+ */
+struct memcg_numa_migrate_struct {
+	nodemask_t *target_nodes;
+	int current_node;
+};
+
+/*
+ * Used as an argument for migrate_pages(). Allocated pages are spread evenly
+ * among destination nodes.
+ */
+static struct page *memcg_numa_migrate_new_page(struct page *page,
+		unsigned long private)
+{
+	struct memcg_numa_migrate_struct *ms = (void *)private;
+	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
+
+	ms->current_node = next_node(ms->current_node, *ms->target_nodes);
+	if (ms->current_node >= MAX_NUMNODES) {
+		ms->current_node = first_node(*ms->target_nodes);
+		VM_BUG_ON(ms->current_node >= MAX_NUMNODES);
+	}
+
+	if (thp_migration_supported() && PageTransHuge(page)) {
+		struct page *thp;
+
+		thp = __alloc_pages(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE,
+				    HPAGE_PMD_ORDER, ms->current_node,
+				    ms->target_nodes);
+		if (!thp)
+			return NULL;
+		prep_transhuge_page(thp);
+		return thp;
+	}
+
+	return __alloc_pages(gfp_mask, 0, ms->current_node, ms->target_nodes);
+}
+
+/*
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+		enum lru_list lru, unsigned long *nr_zone_taken)
+{
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		if (!nr_zone_taken[zid])
+			continue;
+
+		update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+	}
+}
+
+/*
+ * Isolate at most @nr_to_scan pages from @lruvec for further migration and
+ * store them in @dst. Returns the number of pages scanned. Return value of 0
+ * means that @lruvec is empty.
+ */
+static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
+		long nr_to_scan, struct list_head *dst)
+{
+	struct list_head *src = &lruvec->lists[lru];
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+	struct page *page;
+	long scanned = 0, taken = 0;
+
+	spin_lock_irq(&lruvec->lru_lock);
+	while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
+		int nr_pages;
+
+		page = list_last_entry(src, struct page, lru);
+		scanned++;
+
+		if (!__isolate_lru_page_prepare(page, ISOLATE_ASYNC_MIGRATE)) {
+			/* It is being freed elsewhere */
+			list_move(&page->lru, src);
+			continue;
+		}
+
+		/*
+		 * Be careful not to clear PageLRU until after we're
+		 * sure the page is not being freed elsewhere -- the
+		 * page release code relies on it.
+		 */
+		if (unlikely(!get_page_unless_zero(page))) {
+			list_move(&page->lru, src);
+			continue;
+		}
+
+		if (!TestClearPageLRU(page)) {
+			/* Another thread is already isolating this page */
+			put_page(page);
+			list_move(&page->lru, src);
+			continue;
+		}
+
+		nr_pages = thp_nr_pages(page);
+		taken += nr_pages;
+		nr_zone_taken[page_zonenum(page)] += nr_pages;
+		list_move(&page->lru, dst);
+	}
+
+	__mod_node_page_state(pgdat, NR_LRU_BASE + lru, -taken);
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + is_file_lru(lru), taken);
+	update_lru_sizes(lruvec, lru, nr_zone_taken);
+	spin_unlock_irq(&lruvec->lru_lock);
+
+	return scanned;
+}
+
+static long __memcg_numa_migrate_pages(struct lruvec *lruvec, enum lru_list lru,
+		nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct memcg_numa_migrate_struct ms = {
+		.target_nodes = target_nodes,
+		.current_node = -1,
+	};
+	LIST_HEAD(pages);
+	long total_scanned = 0;
+
+	/*
+	 * If no limit on the maximal number of migrated pages is specified,
+	 * assume the caller wants to migrate them all.
+	 */
+	if (nr_to_scan < 0)
+		nr_to_scan = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+
+	while (total_scanned < nr_to_scan) {
+		int ret;
+		long scanned;
+
+		scanned = memcg_numa_isolate_pages(lruvec, lru,
+						   SWAP_CLUSTER_MAX, &pages);
+		if (!scanned)
+			break;
+
+		ret = migrate_pages(&pages, memcg_numa_migrate_new_page,
+				    NULL, (unsigned long)&ms, MIGRATE_ASYNC,
+				    MR_SYSCALL);
+		putback_movable_pages(&pages);
+		if (ret < 0)
+			return ret;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		total_scanned += scanned;
+	}
+
+	return total_scanned;
+}
+
+/*
+ * Migrate at most @nr_to_scan pages accounted to @memcg to @target_nodes.
+ * Pages are spread evenly among destination nodes. If @nr_to_scan is <= 0,
+ * then the function will attempt to migrate all pages accounted to @memcg.
+ */
+static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
+		nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct mem_cgroup *iter;
+	long total_scanned = 0, scanned;
+
+again:
+	scanned = 0;
+	for_each_mem_cgroup_tree(iter, memcg) {
+		unsigned int nid;
+
+		for_each_online_node(nid) {
+			struct lruvec *lruvec;
+			enum lru_list lru;
+
+			if (node_isset(nid, *target_nodes))
+				continue;
+
+			lruvec = mem_cgroup_lruvec(iter, NODE_DATA(nid));
+			/*
+			 * For the sake of simplicity, do not attempt to migrate
+			 * unevictable pages. It should be fine as long as there
+			 * aren't too many of them, which is usually true.
+			 */
+			for_each_evictable_lru(lru) {
+				long ret = __memcg_numa_migrate_pages(lruvec,
+						lru, target_nodes,
+						nr_to_scan > 0 ?
						SWAP_CLUSTER_MAX : -1);
+				if (ret < 0) {
+					mem_cgroup_iter_break(memcg, iter);
+					return ret;
+				}
+				scanned += ret;
+			}
+		}
+	}
+
+	total_scanned += scanned;
+
+	/*
+	 * Retry only if we made progress in the previous iteration.
+	 */
+	if (nr_to_scan > 0 && scanned > 0 && total_scanned < nr_to_scan)
+		goto again;
+
+	return 0;
+}
+
+/*
+ * The format of memory.numa_migrate is
+ *
+ *	NODELIST[ MAX_SCAN]
+ *
+ * where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
+ * of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
+ * imposes a limit on the number of pages that can be migrated in one go.
+ *
+ * The call may be interrupted by a signal, in which case -EINTR is returned.
+ */
+static ssize_t memcg_numa_migrate_write(struct kernfs_open_file *of, char *buf,
+		size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	NODEMASK_ALLOC(nodemask_t, target_nodes, GFP_KERNEL);
+	const char *nodes_str = buf, *nr_str;
+	long nr_to_scan = -1;
+	int ret = -ENOMEM;
+
+	if (!target_nodes)
+		goto out;
+
+	nr_str = strchr(buf, ' ');
+	if (nr_str) {
+		nodes_str = kstrndup(buf, nr_str - buf, GFP_KERNEL);
+		if (!nodes_str)
+			goto out;
+		nr_str += 1;
+	}
+
+	ret = nodelist_parse(nodes_str, *target_nodes);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (!nodes_subset(*target_nodes, node_states[N_MEMORY]))
+		goto out;
+
+	if (nr_str && (kstrtol(nr_str, 10, &nr_to_scan) || nr_to_scan <= 0))
+		goto out;
+
+	ret = memcg_numa_migrate_pages(memcg, target_nodes, nr_to_scan);
+out:
+	if (nodes_str != buf)
+		kfree(nodes_str);
+	NODEMASK_FREE(target_nodes);
+	return ret ?: nbytes;
+}
+
 #endif /* CONFIG_NUMA */
 
 static const unsigned int memcg1_stats[] = {
@@ -5236,6 +5499,11 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.name = "numa_stat",
 		.seq_show = memcg_numa_stat_show,
 	},
+	{
+		.name = "numa_migrate",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = memcg_numa_migrate_write,
+	},
 #endif
 #ifdef CONFIG_CLEANCACHE
 	{
-- 
2.30.2