To scan page tables, we add infrastructure to maintain either a
system-wide mm_struct list or per-memcg mm_struct lists. Multiple
threads can work on the same mm_struct list concurrently, and each of
them will be given a different mm_struct.
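
As an illustration only, a minimal sketch of how a worker is expected
to drain the list (walk_mm_list_sketch() is hypothetical; get_next_mm()
and struct mm_walk_args are from the diff below, and the page table
walk itself is not part of this patch):

  static void walk_mm_list_sketch(struct mm_walk_args *args,
                                  int swappiness)
  {
          bool last;
          struct mm_struct *mm = NULL;

          do {
                  /* hands each concurrent worker a different mm_struct */
                  last = get_next_mm(args, swappiness, &mm);
                  if (mm) {
                          /* walk the page tables of mm here */
                  }
          } while (mm);
          /*
           * "last" is true only for the worker that finishes the list,
           * after cur_seq has been advanced for this node.
           */
  }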

This infrastructure also tracks whether an mm_struct is being used on
any CPUs or has been used since the last time a worker looked at it.
In other words, workers will not be given an mm_struct that belongs to
a process that has been sleeping.
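
As an illustration only, the two signals combine as below;
mm_was_used() is a hypothetical helper, not part of this patch.
lru_gen_switch_mm() sets the nodemask whenever an mm_struct is
scheduled out, and lru_gen_mm_is_active() covers mm_struct's currently
running on a CPU:

  static bool mm_was_used(struct mm_struct *mm, int nid, int file)
  {
          return lru_gen_mm_is_active(mm) ||
                 node_isset(nid, mm->lrugen.nodes[file]);
  }

should_skip_mm() in the diff applies this per-type check when deciding
whether a worker can skip an mm_struct.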

Signed-off-by: Yu Zhao <yuz...@google.com>
---
 fs/exec.c                  |   2 +
 include/linux/memcontrol.h |   6 +
 include/linux/mm_types.h   | 117 ++++++++++++++
 include/linux/mmzone.h     |   2 -
 kernel/exit.c              |   1 +
 kernel/fork.c              |  10 ++
 kernel/kthread.c           |   1 +
 kernel/sched/core.c        |   2 +
 mm/memcontrol.c            |  28 ++++
 mm/vmscan.c                | 316 +++++++++++++++++++++++++++++++++++++
 10 files changed, 483 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..c691d4d7720c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm)
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
+       lru_gen_add_mm(mm);
        /*
         * This prevents preemption while active_mm is being loaded and
         * it and mm are being updated, which could cause problems for
@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
        if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        activate_mm(active_mm, mm);
+       lru_gen_switch_mm(active_mm, mm);
        if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        tsk->mm->vmacache_seqnum = 0;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f13dc02cf277..cff95ed1ee2b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -212,6 +212,8 @@ struct obj_cgroup {
        };
 };
 
+struct lru_gen_mm_list;
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -335,6 +337,10 @@ struct mem_cgroup {
        struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_LRU_GEN
+       struct lru_gen_mm_list *mm_list;
+#endif
+
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
 };
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..f8a239fbb958 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -15,6 +15,8 @@
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>
 
 #include <asm/mmu.h>
 
@@ -383,6 +385,8 @@ struct core_state {
        struct completion startup;
 };
 
+#define ANON_AND_FILE 2
+
 struct kioctx_table;
 struct mm_struct {
        struct {
@@ -561,6 +565,22 @@ struct mm_struct {
 
 #ifdef CONFIG_IOMMU_SUPPORT
                u32 pasid;
+#endif
+#ifdef CONFIG_LRU_GEN
+               struct {
+                       /* the node of a global or per-memcg mm_struct list */
+                       struct list_head list;
+#ifdef CONFIG_MEMCG
+                       /* points to memcg of the owner task above */
+                       struct mem_cgroup *memcg;
+#endif
+                       /* whether this mm_struct has been used since the last walk */
+                       nodemask_t nodes[ANON_AND_FILE];
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+                       /* the number of CPUs using this mm_struct */
+                       atomic_t nr_cpus;
+#endif
+               } lrugen;
 #endif
        } __randomize_layout;
 
@@ -588,6 +608,103 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
        return (struct cpumask *)&mm->cpu_bitmap;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+void lru_gen_init_mm(struct mm_struct *mm);
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
+void lru_gen_free_mm_list(struct mem_cgroup *memcg);
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+/*
+ * Track the usage so mm_struct's that haven't been used since the last walk can
+ * be skipped. This function adds a theoretical overhead to each context switch,
+ * which hasn't been measurable.
+ */
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
+{
+       int file;
+
+       /* exclude init_mm, efi_mm, etc. */
+       if (!core_kernel_data((unsigned long)old)) {
+               VM_BUG_ON(old == &init_mm);
+
+               for (file = 0; file < ANON_AND_FILE; file++)
+                       nodes_setall(old->lrugen.nodes[file]);
+
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+               atomic_dec(&old->lrugen.nr_cpus);
+               VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
+#endif
+       } else
+               VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) ||
+                            READ_ONCE(old->lrugen.list.next), old);
+
+       if (!core_kernel_data((unsigned long)new)) {
+               VM_BUG_ON(new == &init_mm);
+
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+               atomic_inc(&new->lrugen.nr_cpus);
+               VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new);
+#endif
+       } else
+               VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) ||
+                            READ_ONCE(new->lrugen.list.next), new);
+}
+
+/* Return whether this mm_struct is being used on any CPUs. */
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
+{
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       return !cpumask_empty(mm_cpumask(mm));
+#else
+       return atomic_read(&mm->lrugen.nr_cpus);
+#endif
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
+static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
+{
+}
+
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
+{
+       return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a60c7498afd7..dcfadf6a8c07 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -285,8 +285,6 @@ static inline bool is_active_lru(enum lru_list lru)
        return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
 }
 
-#define ANON_AND_FILE 2
-
 enum lruvec_flags {
        LRUVEC_CONGESTED,               /* lruvec has many dirty pages
                                         * backed by a congested BDI
diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..e4292717ce37 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm)
                goto retry;
        }
        WRITE_ONCE(mm->owner, c);
+       lru_gen_migrate_mm(mm);
        task_unlock(c);
        put_task_struct(c);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 426cd0c51f9e..dfa84200229f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -665,6 +665,7 @@ static void check_mm(struct mm_struct *mm)
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
 #endif
+       VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm);
 }
 
 #define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
@@ -1055,6 +1056,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
                goto fail_nocontext;
 
        mm->user_ns = get_user_ns(user_ns);
+       lru_gen_init_mm(mm);
        return mm;
 
 fail_nocontext:
@@ -1097,6 +1099,7 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
+       lru_gen_del_mm(mm);
        mmdrop(mm);
 }
 
@@ -2521,6 +2524,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
                get_task_struct(p);
        }
 
+       if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+               /* lock the task to synchronize with memcg migration */
+               task_lock(p);
+               lru_gen_add_mm(p->mm);
+               task_unlock(p);
+       }
+
        wake_up_new_task(p);
 
        /* forking complete and child started to run, tell ptracer */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1578973c5740..8da7767bb06a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1303,6 +1303,7 @@ void kthread_use_mm(struct mm_struct *mm)
        tsk->mm = mm;
        membarrier_update_current_mm(mm);
        switch_mm_irqs_off(active_mm, mm, tsk);
+       lru_gen_switch_mm(active_mm, mm);
        local_irq_enable();
        task_unlock(tsk);
 #ifdef finish_arch_post_lock_switch
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..bd626dbdb816 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4306,6 +4306,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
+               lru_gen_switch_mm(prev->active_mm, next->mm);
 
                if (!prev->mm) {                        // from kernel
                        /* will mmdrop() in finish_task_switch(). */
@@ -7597,6 +7598,7 @@ void idle_task_exit(void)
 
        if (mm != &init_mm) {
                switch_mm(mm, &init_mm, current);
+               lru_gen_switch_mm(mm, &init_mm);
                finish_arch_post_lock_switch();
        }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e064ac0d850a..496e91e813af 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5206,6 +5206,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
                free_mem_cgroup_per_node_info(memcg, node);
        free_percpu(memcg->vmstats_percpu);
        free_percpu(memcg->vmstats_local);
+       lru_gen_free_mm_list(memcg);
        kfree(memcg);
 }
 
@@ -5258,6 +5259,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                if (alloc_mem_cgroup_per_node_info(memcg, node))
                        goto fail;
 
+       if (lru_gen_alloc_mm_list(memcg))
+               goto fail;
+
        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
                goto fail;
 
@@ -6162,6 +6166,29 @@ static void mem_cgroup_move_task(void)
 }
 #endif
 
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+       struct cgroup_subsys_state *css;
+       struct task_struct *task = NULL;
+
+       cgroup_taskset_for_each_leader(task, css, tset)
+               ;
+
+       if (!task)
+               return;
+
+       task_lock(task);
+       if (task->mm && task->mm->owner == task)
+               lru_gen_migrate_mm(task->mm);
+       task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif
+
 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 {
        if (value == PAGE_COUNTER_MAX)
@@ -6502,6 +6529,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
        .css_free = mem_cgroup_css_free,
        .css_reset = mem_cgroup_css_reset,
        .can_attach = mem_cgroup_can_attach,
+       .attach = mem_cgroup_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .post_attach = mem_cgroup_move_task,
        .dfl_cftypes = memory_files,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c74ebe2039f7..d67dfd1e3930 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4464,6 +4464,313 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *
               sp->refaulted * max(pv->total, 1UL) * pv->gain;
 }
 
+/******************************************************************************
+ *                          mm_struct list
+ ******************************************************************************/
+
+enum {
+       MM_SCHED_ACTIVE,        /* running processes */
+       MM_SCHED_INACTIVE,      /* sleeping processes */
+       MM_LOCK_CONTENTION,     /* lock contentions */
+       MM_VMA_INTERVAL,        /* VMAs within the range of current table */
+       MM_LEAF_OTHER_NODE,     /* entries not from node under reclaim */
+       MM_LEAF_OTHER_MEMCG,    /* entries not from memcg under reclaim */
+       MM_LEAF_OLD,            /* old entries */
+       MM_LEAF_YOUNG,          /* young entries */
+       MM_LEAF_DIRTY,          /* dirty entries */
+       MM_LEAF_HOLE,           /* non-present entries */
+       MM_NONLEAF_OLD,         /* old non-leaf pmd entries */
+       MM_NONLEAF_YOUNG,       /* young non-leaf pmd entries */
+       NR_MM_STATS
+};
+
+/* mnemonic codes for the stats above */
+#define MM_STAT_CODES          "aicvnmoydhlu"
+
+struct lru_gen_mm_list {
+       /* the head of a global or per-memcg mm_struct list */
+       struct list_head head;
+       /* protects the list */
+       spinlock_t lock;
+       struct {
+               /* set to max_seq after each round of walk */
+               unsigned long cur_seq;
+               /* the next mm on the list to walk */
+               struct list_head *iter;
+               /* to wait for the last worker to finish */
+               struct wait_queue_head wait;
+               /* the number of concurrent workers */
+               int nr_workers;
+               /* stats for debugging */
+               unsigned long stats[NR_STAT_GENS][NR_MM_STATS];
+       } nodes[0];
+};
+
+static struct lru_gen_mm_list *global_mm_list;
+
+static struct lru_gen_mm_list *alloc_mm_list(void)
+{
+       int nid;
+       struct lru_gen_mm_list *mm_list;
+
+       mm_list = kzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL);
+       if (!mm_list)
+               return NULL;
+
+       INIT_LIST_HEAD(&mm_list->head);
+       spin_lock_init(&mm_list->lock);
+
+       for_each_node(nid) {
+               mm_list->nodes[nid].cur_seq = MIN_NR_GENS;
+               mm_list->nodes[nid].iter = &mm_list->head;
+               init_waitqueue_head(&mm_list->nodes[nid].wait);
+       }
+
+       return mm_list;
+}
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_MEMCG
+       if (!mem_cgroup_disabled())
+               return memcg ? memcg->mm_list : root_mem_cgroup->mm_list;
+#endif
+       VM_BUG_ON(memcg);
+
+       return global_mm_list;
+}
+
+void lru_gen_init_mm(struct mm_struct *mm)
+{
+       int file;
+
+       INIT_LIST_HEAD(&mm->lrugen.list);
+#ifdef CONFIG_MEMCG
+       mm->lrugen.memcg = NULL;
+#endif
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       atomic_set(&mm->lrugen.nr_cpus, 0);
+#endif
+       for (file = 0; file < ANON_AND_FILE; file++)
+               nodes_clear(mm->lrugen.nodes[file]);
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+       struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+       VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
+#ifdef CONFIG_MEMCG
+       VM_BUG_ON_MM(mm->lrugen.memcg, mm);
+       WRITE_ONCE(mm->lrugen.memcg, memcg);
+#endif
+       spin_lock(&mm_list->lock);
+       list_add_tail(&mm->lrugen.list, &mm_list->head);
+       spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+       int nid;
+#ifdef CONFIG_MEMCG
+       struct lru_gen_mm_list *mm_list = get_mm_list(mm->lrugen.memcg);
+#else
+       struct lru_gen_mm_list *mm_list = get_mm_list(NULL);
+#endif
+
+       spin_lock(&mm_list->lock);
+
+       for_each_node(nid) {
+               if (mm_list->nodes[nid].iter != &mm->lrugen.list)
+                       continue;
+
+               mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+               if (mm_list->nodes[nid].iter == &mm_list->head)
+                       WRITE_ONCE(mm_list->nodes[nid].cur_seq,
+                                  mm_list->nodes[nid].cur_seq + 1);
+       }
+
+       list_del_init(&mm->lrugen.list);
+
+       spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+       mem_cgroup_put(mm->lrugen.memcg);
+       WRITE_ONCE(mm->lrugen.memcg, NULL);
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
+{
+       if (mem_cgroup_disabled())
+               return 0;
+
+       memcg->mm_list = alloc_mm_list();
+
+       return memcg->mm_list ? 0 : -ENOMEM;
+}
+
+void lru_gen_free_mm_list(struct mem_cgroup *memcg)
+{
+       kfree(memcg->mm_list);
+       memcg->mm_list = NULL;
+}
+
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+       struct mem_cgroup *memcg;
+
+       lockdep_assert_held(&mm->owner->alloc_lock);
+
+       if (mem_cgroup_disabled())
+               return;
+
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(mm->owner);
+       rcu_read_unlock();
+       if (memcg == mm->lrugen.memcg)
+               return;
+
+       VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
+       VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
+
+       lru_gen_del_mm(mm);
+       lru_gen_add_mm(mm);
+}
+
+static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
+{
+       return READ_ONCE(mm->lrugen.memcg) != memcg;
+}
+#else
+static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
+{
+       return false;
+}
+#endif
+
+struct mm_walk_args {
+       struct mem_cgroup *memcg;
+       unsigned long max_seq;
+       unsigned long next_addr;
+       unsigned long start_pfn;
+       unsigned long end_pfn;
+       int node_id;
+       int batch_size;
+       int mm_stats[NR_MM_STATS];
+       int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+       bool should_walk[ANON_AND_FILE];
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
+       unsigned long bitmap[BITS_TO_LONGS(PTRS_PER_PMD)];
+#endif
+};
+
+static void reset_mm_stats(struct lru_gen_mm_list *mm_list, bool last,
+                          struct mm_walk_args *args)
+{
+       int i;
+       int nid = args->node_id;
+       int sid = sid_from_seq_or_gen(args->max_seq);
+
+       lockdep_assert_held(&mm_list->lock);
+
+       for (i = 0; i < NR_MM_STATS; i++) {
+               WRITE_ONCE(mm_list->nodes[nid].stats[sid][i],
+                          mm_list->nodes[nid].stats[sid][i] + args->mm_stats[i]);
+               args->mm_stats[i] = 0;
+       }
+
+       if (!last || NR_STAT_GENS == 1)
+               return;
+
+       sid = sid_from_seq_or_gen(args->max_seq + 1);
+       for (i = 0; i < NR_MM_STATS; i++)
+               WRITE_ONCE(mm_list->nodes[nid].stats[sid][i], 0);
+}
+
+static bool should_skip_mm(struct mm_struct *mm, int nid, int swappiness)
+{
+       int file;
+       unsigned long size = 0;
+
+       if (mm_is_oom_victim(mm))
+               return true;
+
+       for (file = !swappiness; file < ANON_AND_FILE; file++) {
+               if (lru_gen_mm_is_active(mm) || node_isset(nid, mm->lrugen.nodes[file]))
+                       size += file ? get_mm_counter(mm, MM_FILEPAGES) :
+                                      get_mm_counter(mm, MM_ANONPAGES) +
+                                      get_mm_counter(mm, MM_SHMEMPAGES);
+       }
+
+       /* leave the legwork to the rmap if mapped pages are too sparse */
+       if (size < max(SWAP_CLUSTER_MAX, mm_pgtables_bytes(mm) / PAGE_SIZE))
+               return true;
+
+       return !mmget_not_zero(mm);
+}
+
+/* To support multiple workers that concurrently walk the mm_struct list. */
+static bool get_next_mm(struct mm_walk_args *args, int swappiness, struct mm_struct **iter)
+{
+       bool last = true;
+       struct mm_struct *mm = NULL;
+       int nid = args->node_id;
+       struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
+
+       if (*iter)
+               mmput_async(*iter);
+       else if (args->max_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq))
+               return false;
+
+       spin_lock(&mm_list->lock);
+
+       VM_BUG_ON(args->max_seq > mm_list->nodes[nid].cur_seq + 1);
+       VM_BUG_ON(*iter && args->max_seq < mm_list->nodes[nid].cur_seq);
+       VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_workers);
+
+       if (args->max_seq <= mm_list->nodes[nid].cur_seq) {
+               last = *iter;
+               goto done;
+       }
+
+       if (mm_list->nodes[nid].iter == &mm_list->head) {
+               VM_BUG_ON(*iter || mm_list->nodes[nid].nr_workers);
+               mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+       }
+
+       while (!mm && mm_list->nodes[nid].iter != &mm_list->head) {
+               mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lrugen.list);
+               mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+               if (should_skip_mm(mm, nid, swappiness))
+                       mm = NULL;
+
+               args->mm_stats[mm ? MM_SCHED_ACTIVE : MM_SCHED_INACTIVE]++;
+       }
+
+       if (mm_list->nodes[nid].iter == &mm_list->head)
+               WRITE_ONCE(mm_list->nodes[nid].cur_seq,
+                          mm_list->nodes[nid].cur_seq + 1);
+done:
+       if (*iter && !mm)
+               mm_list->nodes[nid].nr_workers--;
+       if (!*iter && mm)
+               mm_list->nodes[nid].nr_workers++;
+
+       last = last && !mm_list->nodes[nid].nr_workers &&
+              mm_list->nodes[nid].iter == &mm_list->head;
+
+       reset_mm_stats(mm_list, last, args);
+
+       spin_unlock(&mm_list->lock);
+
+       *iter = mm;
+
+       return last;
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
@@ -4694,6 +5001,15 @@ static int __init init_lru_gen(void)
 {
        BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
        BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+       BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+
+       if (mem_cgroup_disabled()) {
+               global_mm_list = alloc_mm_list();
+               if (!global_mm_list) {
+                       pr_err("lru_gen: failed to allocate global mm_struct list\n");
+                       return -ENOMEM;
+               }
+       }
 
        if (hotplug_memory_notifier(lru_gen_online_mem, 0))
                pr_err("lru_gen: failed to subscribe hotplug notifications\n");
-- 
2.31.1.295.g9ea45b61b8-goog
