From: Vladimir Davydov <vdavy...@virtuozzo.com>

The new file is supposed to be used for migrating pages accounted to a
memory cgroup to a particular set of NUMA nodes. The reason to add it is
that currently there's no API for migrating unmapped file pages used for
storing page cache (neither the migrate_pages syscall nor the cpuset
subsystem provides this functionality).

The file is added to the memory cgroup and has the following format:

  NODELIST[ MAX_SCAN]

where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
imposes a limit on the number of pages that can be migrated in one go.

The call may be interrupted by a signal, in which case -EINTR is returned.
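
For example, assuming the legacy memory cgroup hierarchy is mounted at
/sys/fs/cgroup/memory and the target cgroup is named "test" (both names
are only illustrative):

  # echo "0,2-3 10000" > /sys/fs/cgroup/memory/test/memory.numa_migrate

would ask the kernel to migrate up to 10000 pages charged to that cgroup
to nodes 0, 2 and 3, spreading them evenly among those nodes.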

https://jira.sw.ru/browse/PSBM-50875

Signed-off-by: Vladimir Davydov <vdavy...@virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
Cc: Igor Redko <red...@virtuozzo.com>
Cc: Konstantin Neumoin <kneum...@virtuozzo.com>
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>

Cherry-picked from vz8 commit 42bd2e736895 ("mm: memcontrol:
add memory.numa_migrate file").

Followed __alloc_pages_nodemask() -> __alloc_pages() transition.

Followed __update_lru_size() -> update_lru_size() transition.

Followed move of lru_lock from pgdat to lruvec.

Followed __isolate_lru_page() -> __isolate_lru_page_prepare() rework.
For that, re-synced related memcg_numa_isolate_pages() code with origin
from isolate_lru_pages().

Signed-off-by: Nikita Yushchenko <nikita.yushche...@virtuozzo.com>
---
 mm/memcontrol.c | 268 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7cb3e3e8d9fc..1fc6a0c78c4e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -64,6 +64,7 @@
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include <linux/virtinfo.h>
+#include <linux/migrate.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -4114,6 +4115,268 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 
        return 0;
 }
+
+/*
+ * memcg_numa_migrate_new_page() private argument. @target_nodes specifies the
+ * set of nodes to allocate pages from. @current_node is the current preferred
+ * node; it gets rotated after each allocation.
+ */
+struct memcg_numa_migrate_struct {
+       nodemask_t *target_nodes;
+       int current_node;
+};
+
+/*
+ * Used as the new page allocation callback for migrate_pages(). Allocated
+ * pages are spread evenly among the destination nodes.
+ */
+static struct page *memcg_numa_migrate_new_page(struct page *page,
+                               unsigned long private)
+{
+       struct memcg_numa_migrate_struct *ms = (void *)private;
+       gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
+
+       ms->current_node = next_node(ms->current_node, *ms->target_nodes);
+       if (ms->current_node >= MAX_NUMNODES) {
+               ms->current_node = first_node(*ms->target_nodes);
+               VM_BUG_ON(ms->current_node >= MAX_NUMNODES);
+       }
+       if (thp_migration_supported() && PageTransHuge(page)) {
+               struct page *thp;
+
+               thp = __alloc_pages(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE,
+                                       HPAGE_PMD_ORDER, ms->current_node,
+                                       ms->target_nodes);
+               if (!thp)
+                       return NULL;
+               prep_transhuge_page(thp);
+               return thp;
+       }
+
+       return __alloc_pages(gfp_mask, 0, ms->current_node, ms->target_nodes);
+}
+
+/*
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+                       enum lru_list lru, unsigned long *nr_zone_taken)
+{
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               if (!nr_zone_taken[zid])
+                       continue;
+
+               update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+       }
+
+}
+
+/*
+ * Isolate at most @nr_to_scan pages from @lruvec for further migration and
+ * store them in @dst. Returns the number of pages scanned. A return value of
+ * 0 means that @lruvec is empty.
+ */
+static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
+                                    long nr_to_scan, struct list_head *dst)
+{
+       struct list_head *src = &lruvec->lists[lru];
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+       struct page *page;
+       long scanned = 0, taken = 0;
+
+       spin_lock_irq(&lruvec->lru_lock);
+       while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
+               int nr_pages;
+               page = list_last_entry(src, struct page, lru);
+
+               scanned++;
+
+               if (!__isolate_lru_page_prepare(page, ISOLATE_ASYNC_MIGRATE)) {
+                       /* It is being freed elsewhere */
+                       list_move(&page->lru, src);
+                       continue;
+               }
+
+               /*
+                * Be careful not to clear PageLRU until after we're
+                * sure the page is not being freed elsewhere -- the
+                * page release code relies on it.
+                */
+               if (unlikely(!get_page_unless_zero(page))) {
+                       list_move(&page->lru, src);
+                       continue;
+               }
+
+               if (!TestClearPageLRU(page)) {
+                       /* Another thread is already isolating this page */
+                       put_page(page);
+                       list_move(&page->lru, src);
+                       continue;
+               }
+
+               nr_pages = thp_nr_pages(page);
+               taken += nr_pages;
+               nr_zone_taken[page_zonenum(page)] += nr_pages;
+               list_move(&page->lru, dst);
+       }
+       __mod_node_page_state(pgdat, NR_LRU_BASE + lru, -taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + is_file_lru(lru), taken);
+       update_lru_sizes(lruvec, lru, nr_zone_taken);
+       spin_unlock_irq(&lruvec->lru_lock);
+
+       return scanned;
+}
+
+static long __memcg_numa_migrate_pages(struct lruvec *lruvec, enum lru_list lru,
+                                      nodemask_t *target_nodes, long nr_to_scan)
+{
+       struct memcg_numa_migrate_struct ms = {
+               .target_nodes = target_nodes,
+               .current_node = -1,
+       };
+       LIST_HEAD(pages);
+       long total_scanned = 0;
+
+       /*
+        * If no limit on the maximal number of migrated pages is specified,
+        * assume the caller wants to migrate them all.
+        */
+       if (nr_to_scan < 0)
+               nr_to_scan = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+
+       while (total_scanned < nr_to_scan) {
+               int ret;
+               long scanned;
+
+               scanned = memcg_numa_isolate_pages(lruvec, lru,
+                                                  SWAP_CLUSTER_MAX, &pages);
+               if (!scanned)
+                       break;
+
+               ret = migrate_pages(&pages, memcg_numa_migrate_new_page,
+                                   NULL, (unsigned long)&ms, MIGRATE_ASYNC,
+                                   MR_SYSCALL);
+               putback_movable_pages(&pages);
+               if (ret < 0)
+                       return ret;
+
+               if (signal_pending(current))
+                       return -EINTR;
+
+               total_scanned += scanned;
+       }
+
+       return total_scanned;
+}
+
+/*
+ * Migrate at most @nr_to_scan pages accounted to @memcg to @target_nodes.
+ * Pages are spread evenly among destination nodes. If @nr_to_scan is <= 0,
+ * then the function will attempt to migrate all pages accounted to @memcg.
+ */
+static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
+                                   nodemask_t *target_nodes, long nr_to_scan)
+{
+       struct mem_cgroup *iter;
+       long total_scanned = 0, scanned;
+
+again:
+       scanned = 0;
+       for_each_mem_cgroup_tree(iter, memcg) {
+               unsigned int nid;
+
+               for_each_online_node(nid) {
+                       struct lruvec *lruvec;
+                       enum lru_list lru;
+
+                       if (node_isset(nid, *target_nodes))
+                               continue;
+
+                       lruvec = mem_cgroup_lruvec(iter, NODE_DATA(nid));
+                       /*
+                        * For the sake of simplicity, do not attempt to migrate
+                        * unevictable pages. It should be fine as long as there
+                        * aren't too many of them, which is usually true.
+                        */
+                       for_each_evictable_lru(lru) {
+                               long ret = __memcg_numa_migrate_pages(lruvec,
+                                               lru, target_nodes,
+                                               nr_to_scan > 0 ?
+                                               SWAP_CLUSTER_MAX : -1);
+                               if (ret < 0) {
+                                       mem_cgroup_iter_break(memcg, iter);
+                                       return ret;
+                               }
+                               scanned += ret;
+                       }
+               }
+       }
+
+       total_scanned += scanned;
+
+       /*
+        * Retry only if we made progress in the previous iteration.
+        */
+       if (nr_to_scan > 0 && scanned > 0 && total_scanned < nr_to_scan)
+               goto again;
+
+       return 0;
+}
+
+/*
+ * The format of memory.numa_migrate is
+ *
+ *   NODELIST[ MAX_SCAN]
+ *
+ * where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
+ * of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
+ * imposes a limit on the number of pages that can be migrated in one go.
+ *
+ * The call may be interrupted by a signal, in which case -EINTR is returned.
+ */
+static ssize_t memcg_numa_migrate_write(struct kernfs_open_file *of, char *buf,
+                               size_t nbytes, loff_t off)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       NODEMASK_ALLOC(nodemask_t, target_nodes, GFP_KERNEL);
+       const char *nodes_str = buf, *nr_str;
+       long nr_to_scan = -1;
+       int ret = -ENOMEM;
+
+       if (!target_nodes)
+               goto out;
+
+       nr_str = strchr(buf, ' ');
+       if (nr_str) {
+               nodes_str = kstrndup(buf, nr_str - buf, GFP_KERNEL);
+               if (!nodes_str)
+                       goto out;
+               nr_str += 1;
+       }
+
+       ret = nodelist_parse(nodes_str, *target_nodes);
+       if (ret)
+               goto out;
+
+       ret = -EINVAL;
+       if (!nodes_subset(*target_nodes, node_states[N_MEMORY]))
+               goto out;
+
+       if (nr_str && (kstrtol(nr_str, 10, &nr_to_scan) || nr_to_scan <= 0))
+               goto out;
+
+       ret = memcg_numa_migrate_pages(memcg, target_nodes, nr_to_scan);
+out:
+       if (nodes_str != buf)
+               kfree(nodes_str);
+       NODEMASK_FREE(target_nodes);
+       return ret ?: nbytes;
+}
+
 #endif /* CONFIG_NUMA */
 
 static const unsigned int memcg1_stats[] = {
@@ -5236,6 +5499,11 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .name = "numa_stat",
                .seq_show = memcg_numa_stat_show,
        },
+       {
+               .name = "numa_migrate",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .write = memcg_numa_migrate_write,
+       },
 #endif
 #ifdef CONFIG_CLEANCACHE
        {
-- 
2.30.2
