When a task migrates between memory cgroups in memcg v1, cache pages
(page cache, tmpfs, shmem) are not properly accounted in the cache
counter, leading to imbalanced accounting and potential issues with
cache limits and OOM behavior.

The problem occurs because:
1. mem_cgroup_count_precharge() counts all pages uniformly, without
   distinguishing cache pages from regular ones (see the simplified
   pre-patch sketch after this list)
2. mem_cgroup_do_precharge() charges everything as regular memory
   without charging the cache counter for cache pages
3. During cleanup, cache pages are not properly uncharged from the
   cache counter
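
Before this patch, the counting loop in
mem_cgroup_count_precharge_pte_range() treats every movable target the
same way, roughly (simplified; locking and error handling omitted):

    for (; addr != end; pte++, addr += PAGE_SIZE)
            if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
                    mc.precharge++;  /* cache folios not distinguished */

mem_cgroup_do_precharge() then charges the whole amount via
try_charge(..., false), i.e. as regular memory only, so the cache
counter of the destination cgroup never sees the cache folios being
moved.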

This patch introduces separate tracking for cache pages during task
migration:

- Add precharge_cache and moved_charge_cache fields to track cache
  pages separately from regular pages
- Modify mem_cgroup_count_precharge_pte_range() to identify cache
  pages using folio_memcg_cache() and count them separately
- Update mem_cgroup_do_precharge() to handle both regular and cache
  precharging, ensuring cache pages charge both the memory and cache
  counters (see the sketch after this list)
- Extend mem_cgroup_cancel_charge() API to support cache uncharging
- Add proper cleanup logic in __mem_cgroup_clear_mc() to handle
  cache-specific accounting
- Add fallback logic to mem_cgroup_move_charge_pte_range() to detect
  when an additional cache precharge is needed and retry the range
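
The split precharge boils down to the following sketch (bulk fast path
only, using the "cache" argument that try_charge() has in this feature
series; the one-by-one __GFP_NORETRY fallback is omitted):

    unsigned long noncache_count = count - cache_count;

    /* regular pages: charge the memory (and memsw) counters only */
    ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM,
                     noncache_count, false);
    if (!ret)
            mc.precharge += noncache_count;

    /* cache pages: "true" makes try_charge() charge the cache counter
     * of mc.to as well; track them in both precharge counters
     */
    ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM,
                     cache_count, true);
    if (!ret) {
            mc.precharge += cache_count;
            mc.precharge_cache += cache_count;
    }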

The fix ensures that cache pages are properly accounted in both the
memory and cache counters during task migration, maintaining consistent
accounting and preventing cache counter imbalances that could result in
incorrect OOM decisions or misapplied cache limits.

Notes:
 1. The patch assumes huge pages are not used for page cache.
 2. New fields in struct move_charge_struct:
    - .precharge_cache is always a part of .precharge
    - .moved_charge_cache is always a part of .moved_charge
    (the cleanup sketch after these notes relies on this invariant)
 3. The arguments of mem_cgroup_do_precharge() follow the same rule:
    "cache_count" is always a part of "count".

Fixes: 6a2ca4d515c5 ("mm: Memory cgroup page cache limit")
https://virtuozzo.atlassian.net/browse/VSTOR-111756

Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com>

Feature: mm: Memory cgroup page cache limit
---
 include/linux/memcontrol.h |   4 +-
 mm/hugetlb.c               |   4 +-
 mm/memcontrol-v1.c         | 137 ++++++++++++++++++++++++++++++-------
 mm/memcontrol.c            |   5 +-
 4 files changed, 122 insertions(+), 28 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e0df2d395d03f..3a6b90cd36e36 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -770,7 +770,7 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
        __mem_cgroup_uncharge_folios(folios);
 }
 
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages, bool cache);
 void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
@@ -1299,7 +1299,7 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
 }
 
 static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
-               unsigned int nr_pages)
+                                           unsigned int nr_pages, bool cache)
 {
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 93886dba813e0..a4b5141f97bd0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3004,7 +3004,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
        map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
        if (map_chg < 0) {
                if (!memcg_charge_ret)
-                       mem_cgroup_cancel_charge(memcg, nr_pages);
+                       mem_cgroup_cancel_charge(memcg, nr_pages, false);
                mem_cgroup_put(memcg);
                return ERR_PTR(-ENOMEM);
        }
@@ -3124,7 +3124,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 out_end_reservation:
        vma_end_reservation(h, vma, addr);
        if (!memcg_charge_ret)
-               mem_cgroup_cancel_charge(memcg, nr_pages);
+               mem_cgroup_cancel_charge(memcg, nr_pages, false);
        mem_cgroup_put(memcg);
        return ERR_PTR(-ENOSPC);
 }
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index fd0a563a352b1..9b3969b774d8d 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -58,6 +58,8 @@ static struct move_charge_struct {
        unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
+       unsigned long precharge_cache;
+       unsigned long moved_charge_cache;
        unsigned long moved_swap;
        struct task_struct *moving_task;        /* a task moving charges */
        wait_queue_head_t waitq;                /* a waitq for other context */
@@ -629,26 +631,53 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
-static int mem_cgroup_do_precharge(unsigned long count)
+static int mem_cgroup_do_precharge(unsigned long count, unsigned long cache_count)
 {
+       unsigned long noncache_count = count - cache_count;
        int ret;
 
        /* Try a single bulk charge without reclaim first, kswapd may wake */
-       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count,
-                        false);
-       if (!ret) {
-               mc.precharge += count;
-               return ret;
+       /* First try to precharge non-cache amount, if needed. */
+       if (noncache_count) {
+               ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM,
+                               noncache_count, false);
+               if (!ret) {
+                       mc.precharge += noncache_count;
+                       noncache_count = 0;
+               }
+       }
+
+       /* Second try to precharge cache amount, if needed. */
+       if (cache_count) {
+               ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM,
+                               cache_count, true);
+               if (!ret) {
+                       mc.precharge += cache_count;
+                       mc.precharge_cache += cache_count;
+                       cache_count = 0;
+               }
        }
 
        /* Try charges one by one with reclaim, but do not retry */
-       while (count--) {
+       /* Precharge one by one non-cache */
+       while (noncache_count--) {
                ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1, false);
                if (ret)
                        return ret;
                mc.precharge++;
                cond_resched();
        }
+
+       /* Precharge one by one cache */
+       while (cache_count--) {
+               ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1, true);
+               if (ret)
+                       return ret;
+               mc.precharge++;
+               mc.precharge_cache++;
+               cond_resched();
+       }
+
        return 0;
 }
 
@@ -1040,9 +1069,39 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!pte)
                return 0;
-       for (; addr != end; pte++, addr += PAGE_SIZE)
-               if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
-                       mc.precharge++; /* increment precharge temporarily */
+       for (; addr != end; pte++, addr += PAGE_SIZE) {
+               pte_t ptent = ptep_get(pte);
+               union mc_target target;
+               struct folio *folio;
+
+               switch (get_mctgt_type(vma, addr, ptent, &target)) {
+               case MC_TARGET_DEVICE:
+               case MC_TARGET_PAGE:
+                       folio = target.folio;
+                       /*
+                        * We can have a part of the split pmd here. Moving it
+                        * can be done but it would be too convoluted so simply
+                        * ignore such a partial THP and keep it in original
+                        * memcg. There should be somebody mapping the head.
+                        */
+                       if (folio_test_large(folio))
+                               goto put;
+
+                       if (folio_memcg_cache(folio))
+                               mc.precharge_cache++;
+                       mc.precharge++;
+
+put:                   /* get_mctgt_type() gets & locks the page */
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       break;
+               case MC_TARGET_SWAP:
+                       mc.precharge++;
+                       break;
+               default: /* MC_TARGET_NONE */
+                       break;
+               }
+       }
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
 
@@ -1054,27 +1113,31 @@ static const struct mm_walk_ops precharge_walk_ops = {
        .walk_lock      = PGWALK_RDLOCK,
 };
 
-static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
+/*
+ * It fills global mc.precharge and mc.precharge_cache,
+ * the calling function mem_cgroup_precharge_mc() will clear them up.
+ */
+static void mem_cgroup_count_precharge(struct mm_struct *mm)
 {
-       unsigned long precharge;
-
        mmap_read_lock(mm);
        walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
        mmap_read_unlock(mm);
-
-       precharge = mc.precharge;
-       mc.precharge = 0;
-
-       return precharge;
 }
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-       unsigned long precharge = mem_cgroup_count_precharge(mm);
+       unsigned long precharge, precharge_cache;
+
+       mem_cgroup_count_precharge(mm);
+       precharge = mc.precharge;
+       mc.precharge = 0;
+       precharge_cache = mc.precharge_cache;
+       mc.precharge_cache = 0;
 
        VM_BUG_ON(mc.moving_task);
        mc.moving_task = current;
-       return mem_cgroup_do_precharge(precharge);
+
+       return mem_cgroup_do_precharge(precharge, precharge_cache);
 }
 
 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
@@ -1084,16 +1147,26 @@ static void __mem_cgroup_clear_mc(void)
        struct mem_cgroup *to = mc.to;
 
        /* we must uncharge all the leftover precharges from mc.to */
+       if (mc.precharge_cache) {
+               mem_cgroup_cancel_charge(mc.to, mc.precharge_cache, true);
+               mc.precharge -= mc.precharge_cache;
+               mc.precharge_cache = 0;
+       }
        if (mc.precharge) {
-               mem_cgroup_cancel_charge(mc.to, mc.precharge);
+               mem_cgroup_cancel_charge(mc.to, mc.precharge, false);
                mc.precharge = 0;
        }
        /*
         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
         * we must uncharge here.
         */
+       if (mc.moved_charge_cache) {
+               mem_cgroup_cancel_charge(mc.from, mc.moved_charge_cache, true);
+               mc.moved_charge -= mc.moved_charge_cache;
+               mc.moved_charge_cache = 0;
+       }
        if (mc.moved_charge) {
-               mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+               mem_cgroup_cancel_charge(mc.from, mc.moved_charge, false);
                mc.moved_charge = 0;
        }
        /* we must fixup refcnts and charges */
@@ -1188,6 +1261,8 @@ int memcg1_can_attach(struct cgroup_taskset *tset)
                VM_BUG_ON(mc.to);
                VM_BUG_ON(mc.precharge);
                VM_BUG_ON(mc.moved_charge);
+               VM_BUG_ON(mc.precharge_cache);
+               VM_BUG_ON(mc.moved_charge_cache);
                VM_BUG_ON(mc.moved_swap);
 
                spin_lock(&mc.lock);
@@ -1225,6 +1300,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
        union mc_target target;
        struct folio *folio;
        bool tried_split_before = false;
+       int need_cache = 0;
 
 retry_pmd:
        ptl = pmd_trans_huge_lock(pmd, vma);
@@ -1299,6 +1375,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                        fallthrough;
                case MC_TARGET_PAGE:
                        folio = target.folio;
+
+                       if (folio_memcg_cache(folio) && !mc.precharge_cache) {
+                               need_cache = 1;
+                               goto put;
+                       }
+
                        /*
                         * We can have a part of the split pmd here. Moving it
                         * can be done but it would be too convoluted so simply
@@ -1314,6 +1396,11 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
+
+                               if (folio_memcg_cache(folio)) {
+                                       mc.precharge_cache--;
+                                       mc.moved_charge_cache++;
+                               }
                        }
                        if (!device)
                                folio_putback_lru(folio);
@@ -1333,6 +1420,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                default:
                        break;
                }
+
+               if (need_cache)
+                       break;
        }
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
@@ -1344,7 +1434,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                 * charges to mc.to if we have failed in charge once in attach()
                 * phase.
                 */
-               ret = mem_cgroup_do_precharge(1);
+               ret = mem_cgroup_do_precharge(1, need_cache);
+               need_cache = 0;
                if (!ret)
                        goto retry;
        }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28b8f50504e06..f0b53e5f4d243 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2514,7 +2514,8 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
  * @memcg: memcg previously charged.
  * @nr_pages: number of pages previously charged.
  */
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages,
+                             bool cache)
 {
        if (mem_cgroup_is_root(memcg))
                return;
@@ -2522,6 +2523,8 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        page_counter_uncharge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_uncharge(&memcg->memsw, nr_pages);
+       if (cache)
+               page_counter_uncharge(&memcg->cache, nr_pages);
 }
 
 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg,
-- 
2.43.0
