When a system is under memory pressure, it may be beneficial to prioritize
some memory cgroups to keep their pages resident ahead of other cgroups'
pages. Add a new interface to memory cgroups, memory.priority, that enables
kswapd and direct reclaim to scan more pages in lower-priority cgroups
before looking at higher-priority cgroups.

Signed-off-by: Tim Murray <timmur...@google.com>
---
 include/linux/memcontrol.h | 20 +++++++++++++++++++-
 mm/memcontrol.c            | 33 +++++++++++++++++++++++++++++++++
 mm/vmscan.c                |  3 ++-
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5af377303880..0d0f95839a8d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,7 +206,9 @@ struct mem_cgroup {
        bool            oom_lock;
        int             under_oom;
 
-       int     swappiness;
+       int             swappiness;
+       int             priority;
+
        /* OOM-Killer disable */
        int             oom_kill_disable;
 
@@ -487,6 +489,16 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 
 bool mem_cgroup_oom_synchronize(bool wait);
 
+static inline int mem_cgroup_priority(struct mem_cgroup *memcg)
+{
+       /* root ? */
+       if (mem_cgroup_disabled() || !memcg->css.parent)
+               return 0;
+
+       return memcg->priority;
+}
+
+
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
 #endif
@@ -766,6 +778,12 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline int mem_cgroup_priority(struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 #endif /* CONFIG_MEMCG */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd7541d7c11..7343ca106a36 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -81,6 +81,8 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 
+#define MEM_CGROUP_PRIORITY_MAX        10
+
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
 
@@ -241,6 +243,7 @@ enum res_type {
        _OOM_TYPE,
        _KMEM,
        _TCP,
+       _PRIO,
 };
 
 #define MEMFILE_PRIVATE(x, val)        ((x) << 16 | (val))
@@ -842,6 +845,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                 */
                memcg = mem_cgroup_from_css(css);
 
+               if (reclaim && reclaim->priority &&
+                   (DEF_PRIORITY - memcg->priority) < reclaim->priority)
+                       continue;
+
                if (css == &root->css)
                        break;
 
@@ -2773,6 +2780,7 @@ enum {
        RES_MAX_USAGE,
        RES_FAILCNT,
        RES_SOFT_LIMIT,
+       RES_PRIORITY,
 };
 
 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -2783,6 +2791,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state 
*css,
 
        switch (MEMFILE_TYPE(cft->private)) {
        case _MEM:
+       case _PRIO:
                counter = &memcg->memory;
                break;
        case _MEMSWAP:
@@ -2813,6 +2822,8 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state 
*css,
                return counter->failcnt;
        case RES_SOFT_LIMIT:
                return (u64)memcg->soft_limit * PAGE_SIZE;
+       case RES_PRIORITY:
+               return (u64)memcg->priority;
        default:
                BUG();
        }
@@ -2966,6 +2977,22 @@ static int memcg_update_tcp_limit(struct mem_cgroup 
*memcg, unsigned long limit)
        return ret;
 }
 
+static ssize_t mem_cgroup_update_prio(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes, loff_t off)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long long prio = -1;
+
+       buf = strstrip(buf);
+       prio = memparse(buf, NULL);
+
+       if (prio >= 0 && prio <= MEM_CGROUP_PRIORITY_MAX) {
+               memcg->priority = (int)prio;
+               return nbytes;
+       }
+       return -EINVAL;
+}
+
 /*
  * The user of this function is...
  * RES_LIMIT.
@@ -3940,6 +3967,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .read_u64 = mem_cgroup_read_u64,
        },
        {
+               .name = "priority",
+               .private = MEMFILE_PRIVATE(_PRIO, RES_PRIORITY),
+               .write = mem_cgroup_update_prio,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
                .name = "stat",
                .seq_show = memcg_stat_show,
        },
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..c47b21326ab0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2116,6 +2116,7 @@ static void get_scan_count(struct lruvec *lruvec, struct 
mem_cgroup *memcg,
                           unsigned long *lru_pages)
 {
        int swappiness = mem_cgroup_swappiness(memcg);
+       int priority = mem_cgroup_priority(memcg);
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
        u64 fraction[2];
        u64 denominator = 0;    /* gcc */
@@ -2287,7 +2288,7 @@ static void get_scan_count(struct lruvec *lruvec, struct 
mem_cgroup *memcg,
                        unsigned long scan;
 
                        size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-                       scan = size >> sc->priority;
+                       scan = size >> (sc->priority + priority);
 
                        if (!scan && pass && force_scan)
                                scan = min(size, SWAP_CLUSTER_MAX);
-- 
2.12.0.367.g23dc2f6d3c-goog

Reply via email to