Currently slab objects are already reparented to their parent memcg on
cgroup removal. But there are still some corner cases which are not
handled, e.g. allocations larger than an order-1 page on SLUB. Those
objects are allocated directly from the buddy allocator and are charged
to the memcg as kmem via __memcg_kmem_charge_page(). Such pages are not
reparented on cgroup removal.

So this patch aims to reparent kmem pages on cgroup removal as well.
Doing this is simple with the help of the obj_cgroup infrastructure:
for a kmem page, page->memcg_data now points to an object cgroup
instead of a memory cgroup.
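
To illustrate the indirection this buys us, here is a minimal
user-space sketch of the scheme (the structs below are simplified
stand-ins for the kernel ones, and kmem_page_memcg() is a hypothetical
helper, not a kernel API): page->memcg_data carries a tagged obj_cgroup
pointer, and the memcg is only reached through objcg->memcg, so
reparenting rewrites a single pointer to retarget every kmem page
charged to that objcg at once.

  #include <stdio.h>

  /* Tag bits, mirroring the kernel's memcg_data encoding. */
  #define MEMCG_DATA_KMEM        (1UL << 1)
  #define MEMCG_DATA_FLAGS_MASK  (3UL)

  /* Simplified stand-ins for the kernel structures. */
  struct mem_cgroup { const char *name; };
  struct obj_cgroup { struct mem_cgroup *memcg; };
  struct page { unsigned long memcg_data; };

  /* Hypothetical helper: resolve a kmem page to its memcg via the
   * obj_cgroup stored in page->memcg_data. */
  static struct mem_cgroup *kmem_page_memcg(struct page *page)
  {
          struct obj_cgroup *objcg;

          objcg = (struct obj_cgroup *)(page->memcg_data &
                                        ~MEMCG_DATA_FLAGS_MASK);
          return objcg->memcg;
  }

  int main(void)
  {
          struct mem_cgroup parent = { "parent" }, child = { "child" };
          struct obj_cgroup objcg = { &child };
          /* Assumes &objcg is at least 4-byte aligned, leaving the
           * low bits free for the tag, as the kernel relies on. */
          struct page page = {
                  (unsigned long)&objcg | MEMCG_DATA_KMEM
          };

          printf("before removal: %s\n", kmem_page_memcg(&page)->name);

          /* "Reparenting": only objcg->memcg is rewritten; the
           * page->memcg_data of every charged kmem page is untouched. */
          objcg.memcg = &parent;
          printf("after removal:  %s\n", kmem_page_memcg(&page)->name);
          return 0;
  }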

Signed-off-by: Muchun Song <songmuc...@bytedance.com>
---
 include/linux/memcontrol.h |  66 +++++++++++--------
 mm/memcontrol.c            | 155 ++++++++++++++++++++++++---------------------
 2 files changed, 124 insertions(+), 97 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1d2c82464c8c..27043478220f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -370,23 +370,15 @@ static inline bool page_memcg_charged(struct page *page)
 }
 
 /*
- * page_memcg_kmem - get the memory cgroup associated with a kmem page.
- * @page: a pointer to the page struct
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
  *
- * Returns a pointer to the memory cgroup associated with the kmem page,
- * or NULL. This function assumes that the page is known to have a proper
- * memory cgroup pointer. It is only suitable for kmem pages which means
- * PageMemcgKmem() returns true for this page.
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
  */
-static inline struct mem_cgroup *page_memcg_kmem(struct page *page)
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
 {
-       unsigned long memcg_data = page->memcg_data;
-
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-       VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
-
-       return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+       return READ_ONCE(objcg->memcg);
 }
 
 /*
@@ -462,6 +454,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
        if (memcg_data & MEMCG_DATA_OBJCGS)
                return NULL;
 
+       if (memcg_data & MEMCG_DATA_KMEM) {
+               struct obj_cgroup *objcg;
+
+               /*
+                * The caller must ensure that the returned memcg won't be
+                * released: e.g. acquire the rcu_read_lock or css_set_lock.
+                */
+               objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+               return obj_cgroup_memcg(objcg);
+       }
+
        return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
@@ -520,6 +523,24 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
        return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
+/*
+ * page_objcg - get the object cgroup associated with a kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroup associated with the kmem page,
+ * or NULL. This function assumes that the page is known to have an
+ * associated object cgroup. It's only safe to call this function
+ * against kmem pages (PageMemcgKmem() returns true).
+ */
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+       unsigned long memcg_data = page->memcg_data;
+
+       VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+       VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+
+       return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
 #else
 static inline struct obj_cgroup **page_objcgs(struct page *page)
 {
@@ -530,6 +551,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 {
        return NULL;
 }
+
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+       return NULL;
+}
 #endif
 
 static __always_inline bool memcg_stat_item_in_bytes(int idx)
@@ -748,18 +774,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
        percpu_ref_put(&objcg->refcnt);
 }
 
-/*
- * After the initialization objcg->memcg is always pointing at
- * a valid memcg, but can be atomically swapped to the parent memcg.
- *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
- */
-static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
-{
-       return READ_ONCE(objcg->memcg);
-}
-
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
        if (memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bfd6efe1e196..39cb8c5bf8b2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -856,10 +856,16 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 {
        struct page *head = compound_head(page); /* rmap on tail pages */
        struct mem_cgroup *memcg;
-       pg_data_t *pgdat = page_pgdat(page);
+       pg_data_t *pgdat;
        struct lruvec *lruvec;
 
-       memcg = PageMemcgKmem(head) ? page_memcg_kmem(head) : page_memcg(head);
+       if (PageMemcgKmem(head)) {
+               __mod_lruvec_kmem_state(page_to_virt(head), idx, val);
+               return;
+       }
+
+       pgdat = page_pgdat(head);
+       memcg = page_memcg(head);
        /* Untracked pages have no memcg, no lruvec. Update only the node */
        if (!memcg) {
                __mod_node_page_state(pgdat, idx, val);
@@ -1056,24 +1062,6 @@ static __always_inline struct mem_cgroup *active_memcg(void)
                return current->active_memcg;
 }
 
-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
-       struct mem_cgroup *memcg;
-
-       rcu_read_lock();
-       memcg = active_memcg();
-       if (memcg) {
-               /* current->active_memcg must hold a ref. */
-               if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
-                       memcg = root_mem_cgroup;
-               else
-                       memcg = current->active_memcg;
-       }
-       rcu_read_unlock();
-
-       return memcg;
-}
-
 static __always_inline bool memcg_kmem_bypass(void)
 {
        /* Allow remote memcg charging from any context. */
@@ -1088,20 +1076,6 @@ static __always_inline bool memcg_kmem_bypass(void)
 }
 
 /**
- * If active memcg is set, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
-       if (memcg_kmem_bypass())
-               return NULL;
-
-       if (unlikely(active_memcg()))
-               return get_active_memcg();
-
-       return get_mem_cgroup_from_mm(current->mm);
-}
-
-/**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
@@ -3148,18 +3122,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
  */
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 {
-       struct mem_cgroup *memcg;
+       struct obj_cgroup *objcg;
        int ret = 0;
 
-       memcg = get_mem_cgroup_from_current();
-       if (memcg && !mem_cgroup_is_root(memcg)) {
-               ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+       objcg = get_obj_cgroup_from_current();
+       if (objcg) {
+               ret = obj_cgroup_charge_page(objcg, gfp, 1 << order);
                if (!ret) {
-                       page->memcg_data = (unsigned long)memcg |
+                       page->memcg_data = (unsigned long)objcg |
                                MEMCG_DATA_KMEM;
                        return 0;
                }
-               css_put(&memcg->css);
+               obj_cgroup_put(objcg);
        }
        return ret;
 }
@@ -3171,17 +3145,18 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
  */
 void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
-       struct mem_cgroup *memcg;
+       struct obj_cgroup *objcg;
        unsigned int nr_pages = 1 << order;
 
        if (!page_memcg_charged(page))
                return;
 
-       memcg = page_memcg_kmem(page);
-       VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-       __memcg_kmem_uncharge(memcg, nr_pages);
+       VM_BUG_ON_PAGE(!PageMemcgKmem(page), page);
+
+       objcg = page_objcg(page);
+       obj_cgroup_uncharge_page(objcg, nr_pages);
        page->memcg_data = 0;
-       css_put(&memcg->css);
+       obj_cgroup_put(objcg);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -6798,8 +6773,12 @@ struct uncharge_gather {
        struct mem_cgroup *memcg;
        unsigned long nr_pages;
        unsigned long pgpgout;
-       unsigned long nr_kmem;
        struct page *dummy_page;
+
+#ifdef CONFIG_MEMCG_KMEM
+       struct obj_cgroup *objcg;
+       unsigned long nr_kmem;
+#endif
 };
 
 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6811,12 +6790,21 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 {
        unsigned long flags;
 
+#ifdef CONFIG_MEMCG_KMEM
+       if (ug->objcg) {
+               obj_cgroup_uncharge_page(ug->objcg, ug->nr_kmem);
+               /* drop reference from uncharge_kmem_page */
+               obj_cgroup_put(ug->objcg);
+       }
+#endif
+
+       if (!ug->memcg)
+               return;
+
        if (!mem_cgroup_is_root(ug->memcg)) {
                page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
                if (do_memsw_account())
                        page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
-               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
-                       page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
                memcg_oom_recover(ug->memcg);
        }
 
@@ -6826,26 +6814,40 @@ static void uncharge_batch(const struct uncharge_gather *ug)
        memcg_check_events(ug->memcg, ug->dummy_page);
        local_irq_restore(flags);
 
-       /* drop reference from uncharge_page */
+       /* drop reference from uncharge_user_page */
        css_put(&ug->memcg->css);
 }
 
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+#ifdef CONFIG_MEMCG_KMEM
+static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
 {
-       unsigned long nr_pages;
-       struct mem_cgroup *memcg;
+       struct obj_cgroup *objcg = page_objcg(page);
 
-       VM_BUG_ON_PAGE(PageLRU(page), page);
+       if (ug->objcg != objcg) {
+               if (ug->objcg) {
+                       uncharge_batch(ug);
+                       uncharge_gather_clear(ug);
+               }
+               ug->objcg = objcg;
 
-       if (!page_memcg_charged(page))
-               return;
+               /* pairs with obj_cgroup_put in uncharge_batch */
+               obj_cgroup_get(ug->objcg);
+       }
+
+       ug->nr_kmem += compound_nr(page);
+       page->memcg_data = 0;
+       obj_cgroup_put(ug->objcg);
+}
+#else
+static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
+{
+}
+#endif
+
+static void uncharge_user_page(struct page *page, struct uncharge_gather *ug)
+{
+       struct mem_cgroup *memcg = page_memcg(page);
 
-       /*
-        * Nobody should be changing or seriously looking at
-        * page memcg at this point, we have fully exclusive
-        * access to the page.
-        */
-       memcg = PageMemcgKmem(page) ? page_memcg_kmem(page) : page_memcg(page);
        if (ug->memcg != memcg) {
                if (ug->memcg) {
                        uncharge_batch(ug);
@@ -6856,18 +6858,30 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                /* pairs with css_put in uncharge_batch */
                css_get(&ug->memcg->css);
        }
+       ug->pgpgout++;
+       ug->dummy_page = page;
+
+       ug->nr_pages += compound_nr(page);
+       page->memcg_data = 0;
+       css_put(&ug->memcg->css);
+}
 
-       nr_pages = compound_nr(page);
-       ug->nr_pages += nr_pages;
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+       VM_BUG_ON_PAGE(PageLRU(page), page);
 
+       if (!page_memcg_charged(page))
+               return;
+
+       /*
+        * Nobody should be changing or seriously looking at
+        * page memcg at this point, we have fully exclusive
+        * access to the page.
+        */
        if (PageMemcgKmem(page))
-               ug->nr_kmem += nr_pages;
+               uncharge_kmem_page(page, ug);
        else
-               ug->pgpgout++;
-
-       ug->dummy_page = page;
-       page->memcg_data = 0;
-       css_put(&ug->memcg->css);
+               uncharge_user_page(page, ug);
 }
 
 /**
@@ -6910,8 +6924,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
        uncharge_gather_clear(&ug);
        list_for_each_entry(page, page_list, lru)
                uncharge_page(page, &ug);
-       if (ug.memcg)
-               uncharge_batch(&ug);
+       uncharge_batch(&ug);
 }
 
 /**
-- 
2.11.0
