From: Andrey Ryabinin <ryabinin....@gmail.com>

Forward port of the feature: mm: per memory cgroup page cache limit.
The original implementation consisted of these commits:
  commit 758d52e33a67 ("configs: Enable CONFIG_PAGE_EXTENSION")
  commit 741beaa93c89 ("mm: introduce page vz extension (using page_ext)")
  commit d42d3c8b849d ("mm/memcg: limit page cache in memcg hack")

This port drops the page vz extensions in favor of using a memcg_data
bit to mark a page as page cache. The benefit is a simpler implementation
and an easier port. If new flags are required in the future, the newly
introduced folio can be used.

https://jira.sw.ru/browse/PSBM-144609

Signed-off-by: Alexander Atanasov <alexander.atana...@virtuozzo.com>
Signed-off-by: Andrey Ryabinin <ryabinin....@gmail.com>
---
 include/linux/memcontrol.h |  29 ++++-
 mm/filemap.c               |   3 +-
 mm/memcontrol.c            | 219 ++++++++++++++++++++++++++++++-------
 3 files changed, 207 insertions(+), 44 deletions(-)

v1->v2: addressed Pavel's comments on v1:
 - fixed compilation without MEMCG
 - tried to preserve the original author
 - fixed line alignment
 - added missing bug traps and WARN_ONs
 - fixed a spelling error
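
Not part of the patch: just an illustrative userspace sketch of how the new
knobs are meant to be exercised. It assumes the legacy (cgroup v1) memory
controller is mounted at /sys/fs/cgroup/memory; the cgroup name "cachetest"
and the 64M limit are made-up values for the example, and error handling is
minimal:

/*
 * Illustrative only: set a page cache limit on a test memory cgroup,
 * move the current task into it, and read the cache counters back.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/cachetest"	/* assumed v1 mount point */

/* Write a string to a control file of the test cgroup. */
static void cg_write(const char *file, const char *val)
{
	char path[256];
	int fd;

	snprintf(path, sizeof(path), CG "/%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		exit(1);
	}
	close(fd);
}

/* Read back and print a control file, e.g. the current cache usage. */
static void cg_print(const char *file)
{
	char path[256], buf[64];
	ssize_t len;
	int fd;

	snprintf(path, sizeof(path), CG "/%s", file);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		exit(1);
	}
	len = read(fd, buf, sizeof(buf) - 1);
	if (len > 0) {
		buf[len] = '\0';
		printf("%s: %s", file, buf);
	}
	close(fd);
}

int main(void)
{
	char pid[32];

	if (mkdir(CG, 0755) && errno != EEXIST) {
		perror("mkdir " CG);
		return 1;
	}

	/* Cap the page cache of this group at 64M, then join the group. */
	cg_write("memory.cache.limit_in_bytes", "67108864");
	snprintf(pid, sizeof(pid), "%d", (int)getpid());
	cg_write("tasks", pid);

	/*
	 * Any file I/O done from here on by this task is charged to the
	 * group's "cache" counter in addition to "memory".
	 */
	cg_print("memory.cache.limit_in_bytes");
	cg_print("memory.cache.usage_in_bytes");
	return 0;
}

Per the reclaim_high()/try_charge_memcg() changes below, going over the cache
limit is handled by reclaiming page cache from the group rather than by
failing the charge.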

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 561db06f1fd8..1a49416300c9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -273,6 +273,7 @@ struct mem_cgroup {
 	/* Legacy consumer-oriented counters */
 	struct page_counter kmem;		/* v1 only */
 	struct page_counter tcpmem;		/* v1 only */
+	struct page_counter cache;
 
 	/* Range enforcement for interrupt charges */
 	struct work_struct high_work;
@@ -405,8 +406,10 @@ enum page_memcg_data_flags {
 	MEMCG_DATA_OBJCGS = (1UL << 0),
 	/* page has been accounted as a non-slab kernel page */
 	MEMCG_DATA_KMEM = (1UL << 1),
+	/* page has been accounted as a cache page */
+	MEMCG_DATA_PGCACHE = (1UL << 2),
 	/* the next bit after the last actual flag */
-	__NR_MEMCG_DATA_FLAGS = (1UL << 2),
+	__NR_MEMCG_DATA_FLAGS = (1UL << 3),
 };
 
 #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
@@ -771,11 +774,25 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
 static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 				    gfp_t gfp)
 {
-	if (mem_cgroup_disabled())
-		return 0;
 	return __mem_cgroup_charge(folio, mm, gfp);
 }
 
+int mem_cgroup_charge_cache(struct folio *folio, struct mm_struct *mm,
+			    gfp_t gfp);
+
+/*
+ * folio_memcg_cache - Check if the folio has the pgcache flag set.
+ * @folio: Pointer to the folio.
+ *
+ * Checks if the folio has the page cache flag set. The caller must ensure
+ * that the folio has an associated memory cgroup. It's not safe to call
+ * this function against some types of folios, e.g. slab folios.
+ */
+static inline bool folio_memcg_cache(struct folio *folio)
+{
+	return folio->memcg_data & MEMCG_DATA_PGCACHE;
+}
+
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -1339,6 +1356,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }
 
+static inline int mem_cgroup_charge_cache(struct folio *folio,
+					  struct mm_struct *mm, gfp_t gfp)
+{
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_page(struct page *page,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d63e53980e4..d568ffc0d416 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -841,7 +841,8 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 	mapping_set_update(&xas, mapping);
 
 	if (!huge) {
-		int error = mem_cgroup_charge(folio, NULL, gfp);
+		int error = mem_cgroup_charge_cache(folio, NULL, gfp);
+
 		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
 		if (error)
 			return error;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6fa13539f3e5..6b462152e77f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -218,6 +218,7 @@ enum res_type {
 	_OOM_TYPE,
 	_KMEM,
 	_TCP,
+	_CACHE,
 };
 
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
@@ -2207,6 +2208,7 @@ struct memcg_stock_pcp {
 	int nr_slab_unreclaimable_b;
 #endif
 
+	unsigned int cache_nr_pages;
 	struct work_struct work;
 	unsigned long flags;
 #define FLUSHING_CACHED_CHARGE	0
@@ -2248,7 +2250,8 @@ static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
  *
  * returns true if successful, false otherwise.
  */
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+			  bool cache)
 {
 	struct memcg_stock_pcp *stock;
 	unsigned long flags;
@@ -2260,9 +2263,16 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
-	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
-		stock->nr_pages -= nr_pages;
-		ret = true;
+	if (memcg == stock->cached) {
+		if (cache && stock->cache_nr_pages >= nr_pages) {
+			stock->cache_nr_pages -= nr_pages;
+			ret = true;
+		}
+
+		if (!cache && stock->nr_pages >= nr_pages) {
+			stock->nr_pages -= nr_pages;
+			ret = true;
+		}
 	}
 
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
@@ -2276,15 +2286,20 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 static void drain_stock(struct memcg_stock_pcp *stock)
 {
 	struct mem_cgroup *old = stock->cached;
+	unsigned long nr_pages = stock->nr_pages + stock->cache_nr_pages;
 
 	if (!old)
 		return;
 
-	if (stock->nr_pages) {
-		page_counter_uncharge(&old->memory, stock->nr_pages);
+	if (stock->cache_nr_pages)
+		page_counter_uncharge(&old->cache, stock->cache_nr_pages);
+
+	if (nr_pages) {
+		page_counter_uncharge(&old->memory, nr_pages);
 		if (do_memsw_account())
-			page_counter_uncharge(&old->memsw, stock->nr_pages);
+			page_counter_uncharge(&old->memsw, nr_pages);
 		stock->nr_pages = 0;
+		stock->cache_nr_pages = 0;
 	}
 
 	css_put(&old->css);
@@ -2318,9 +2333,11 @@ static void drain_local_stock(struct work_struct *dummy)
  * Cache charges(val) to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
  */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+			   bool cache)
 {
 	struct memcg_stock_pcp *stock;
+	unsigned long stock_nr_pages;
 
 	stock = this_cpu_ptr(&memcg_stock);
 	if (stock->cached != memcg) { /* reset if necessary */
@@ -2328,18 +2345,23 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 		css_get(&memcg->css);
 		stock->cached = memcg;
 	}
-	stock->nr_pages += nr_pages;
+	if (!cache)
+		stock->nr_pages += nr_pages;
+	else
+		stock->cache_nr_pages += nr_pages;
 
-	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
+	stock_nr_pages = stock->nr_pages + stock->cache_nr_pages;
+	if (stock_nr_pages > MEMCG_CHARGE_BATCH)
 		drain_stock(stock);
 }
 
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+			 bool cache)
 {
 	unsigned long flags;
 
 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
-	__refill_stock(memcg, nr_pages);
+	__refill_stock(memcg, nr_pages, cache);
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
 
@@ -2366,10 +2388,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *memcg;
 		bool flush = false;
+		unsigned long nr_pages = stock->nr_pages +
+					 stock->cache_nr_pages;
 
 		rcu_read_lock();
 		memcg = stock->cached;
-		if (memcg && stock->nr_pages &&
+		if (memcg && nr_pages &&
 		    mem_cgroup_is_descendant(memcg, root_memcg))
 			flush = true;
 		else if (obj_stock_flush_required(stock, root_memcg))
@@ -2406,17 +2430,27 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 	do {
 		unsigned long pflags;
+		long cache_overused;
 
-		if (page_counter_read(&memcg->memory) <=
-		    READ_ONCE(memcg->memory.high))
-			continue;
+		if (page_counter_read(&memcg->memory) >
+		    READ_ONCE(memcg->memory.high)) {
+			memcg_memory_event(memcg, MEMCG_HIGH);
+
+			psi_memstall_enter(&pflags);
+			nr_reclaimed += try_to_free_mem_cgroup_pages(memcg,
+						nr_pages, gfp_mask, true);
+			psi_memstall_leave(&pflags);
+		}
 
-		memcg_memory_event(memcg, MEMCG_HIGH);
+		cache_overused = page_counter_read(&memcg->cache) -
+				 memcg->cache.max;
 
-		psi_memstall_enter(&pflags);
-		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-							     gfp_mask, true);
-		psi_memstall_leave(&pflags);
+		if (cache_overused > 0) {
+			psi_memstall_enter(&pflags);
+			nr_reclaimed += try_to_free_mem_cgroup_pages(memcg,
+					cache_overused, gfp_mask, false);
+			psi_memstall_leave(&pflags);
+		}
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
 
@@ -2652,7 +2686,7 @@ void mem_cgroup_handle_over_high(void)
 }
 
 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
-			    unsigned int nr_pages)
+			    unsigned int nr_pages, bool cache_charge)
 {
 	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
 	int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2666,8 +2700,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long pflags;
 
 retry:
-	if (consume_stock(memcg, nr_pages))
-		return 0;
+	if (consume_stock(memcg, nr_pages, cache_charge))
+		goto done;
 
 	if (!do_memsw_account() ||
 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
@@ -2780,13 +2814,19 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	page_counter_charge(&memcg->memory, nr_pages);
 	if (do_memsw_account())
 		page_counter_charge(&memcg->memsw, nr_pages);
+	if (cache_charge)
+		page_counter_charge(&memcg->cache, nr_pages);
 
 	return 0;
 
 done_restock:
+	if (cache_charge)
+		page_counter_charge(&memcg->cache, batch);
+
 	if (batch > nr_pages)
-		refill_stock(memcg, batch - nr_pages);
+		refill_stock(memcg, batch - nr_pages, cache_charge);
 
+done:
 	/*
 	 * If the hierarchy is above the normal consumption range, schedule
 	 * reclaim on returning to userland. We can perform reclaim here
@@ -2826,6 +2866,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			current->memcg_nr_pages_over_high += batch;
 			set_notify_resume(current);
 			break;
+		} else if (page_counter_read(&memcg->cache) > memcg->cache.max) {
+			if (!work_pending(&memcg->high_work))
+				schedule_work(&memcg->high_work);
 		}
 	} while ((memcg = parent_mem_cgroup(memcg)));
 
@@ -2833,12 +2876,12 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 }
 
 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-			     unsigned int nr_pages)
+			     unsigned int nr_pages, bool cache_charge)
 {
 	if (mem_cgroup_is_root(memcg))
 		return 0;
 
-	return try_charge_memcg(memcg, gfp_mask, nr_pages);
+	return try_charge_memcg(memcg, gfp_mask, nr_pages, cache_charge);
 }
 
 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -3024,7 +3067,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
 	memcg = get_mem_cgroup_from_objcg(objcg);
 
 	memcg_account_kmem(memcg, -nr_pages);
-	refill_stock(memcg, nr_pages);
+	refill_stock(memcg, nr_pages, false);
 
 	css_put(&memcg->css);
 }
@@ -3045,7 +3088,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
 
 	memcg = get_mem_cgroup_from_objcg(objcg);
 
-	ret = try_charge_memcg(memcg, gfp, nr_pages);
+	ret = try_charge_memcg(memcg, gfp, nr_pages, false);
 	if (ret)
 		goto out;
 
@@ -3204,7 +3247,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 		memcg = get_mem_cgroup_from_objcg(old);
 
 		memcg_account_kmem(memcg, -nr_pages);
-		__refill_stock(memcg, nr_pages);
+		__refill_stock(memcg, nr_pages, false);
 
 		css_put(&memcg->css);
 	}
@@ -3352,7 +3395,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 {
 	int ret = 0;
 
-	ret = try_charge(memcg, gfp, nr_pages);
+	ret = try_charge(memcg, gfp, nr_pages, false);
 	if (!ret)
 		page_counter_charge(&memcg->kmem, nr_pages);
 
@@ -3711,6 +3754,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 	case _TCP:
 		counter = &memcg->tcpmem;
 		break;
+	case _CACHE:
+		counter = &memcg->cache;
+		break;
 	default:
 		BUG();
 	}
@@ -3829,6 +3875,43 @@ static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
 	return ret;
 }
 
+static int memcg_update_cache_max(struct mem_cgroup *memcg,
+				  unsigned long limit)
+{
+	unsigned long nr_pages;
+	bool enlarge = false;
+	int ret;
+
+	do {
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		mutex_lock(&memcg_max_mutex);
+
+		if (limit > memcg->cache.max)
+			enlarge = true;
+
+		ret = page_counter_set_max(&memcg->cache, limit);
+		mutex_unlock(&memcg_max_mutex);
+
+		if (!ret)
+			break;
+
+		nr_pages = max_t(long, 1, page_counter_read(&memcg->cache) - limit);
+		if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
+						  GFP_KERNEL, false)) {
+			ret = -EBUSY;
+			break;
+		}
+	} while (1);
+
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
+
+	return ret;
+}
+
 /*
  * The user of this function is...
  * RES_LIMIT.
@@ -3865,6 +3948,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 		case _TCP:
 			ret = memcg_update_tcp_max(memcg, nr_pages);
 			break;
+		case _CACHE:
+			ret = memcg_update_cache_max(memcg, nr_pages);
+			break;
 		}
 		break;
 	case RES_SOFT_LIMIT:
@@ -3898,6 +3984,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
 	case _TCP:
 		counter = &memcg->tcpmem;
 		break;
+	case _CACHE:
+		counter = &memcg->cache;
+		break;
 	default:
 		BUG();
 	}
@@ -5541,6 +5630,17 @@ static struct cftype mem_cgroup_legacy_files[] = {
 	{
 		.name = "pressure_level",
 	},
+	{
+		.name = "cache.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_CACHE, RES_LIMIT),
+		.write = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "cache.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_CACHE, RES_USAGE),
+		.read_u64 = mem_cgroup_read_u64,
+	},
 #ifdef CONFIG_NUMA
 	{
 		.name = "numa_stat",
@@ -5825,11 +5925,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, &parent->swap);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+		page_counter_init(&memcg->cache, &parent->cache);
 	} else {
 		page_counter_init(&memcg->memory, NULL);
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+		page_counter_init(&memcg->cache, NULL);
 
 		root_mem_cgroup = memcg;
 		return &memcg->css;
@@ -5950,6 +6052,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+	page_counter_set_max(&memcg->cache, PAGE_COUNTER_MAX);
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
@@ -6051,7 +6154,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	int ret;
 
 	/* Try a single bulk charge without reclaim first, kswapd may wake */
-	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
+	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count,
+			 false);
 	if (!ret) {
 		mc.precharge += count;
 		return ret;
@@ -6059,7 +6163,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
 
 	/* Try charges one by one with reclaim, but do not retry */
 	while (count--) {
-		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
+		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1, false);
 		if (ret)
 			return ret;
 		mc.precharge++;
@@ -7285,18 +7389,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 }
 
 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
-			gfp_t gfp)
+			gfp_t gfp, bool cache_charge)
 {
 	long nr_pages = folio_nr_pages(folio);
 	int ret;
 
-	ret = try_charge(memcg, gfp, nr_pages);
+	ret = try_charge(memcg, gfp, nr_pages, cache_charge);
 	if (ret)
 		goto out;
 
 	css_get(&memcg->css);
 	commit_charge(folio, memcg);
+	/*
+	 * We always clear this flag on uncharge, which means that during
+	 * charge we should never see it already set.
+	 */
+
+	VM_BUG_ON(folio_memcg_cache(folio));
+	if (cache_charge)
+		WRITE_ONCE(folio->memcg_data,
+			   READ_ONCE(folio->memcg_data) | MEMCG_DATA_PGCACHE);
 
 	local_irq_disable();
 	mem_cgroup_charge_statistics(memcg, nr_pages);
 	memcg_check_events(memcg, folio_nid(folio));
@@ -7305,18 +7418,32 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	return ret;
 }
 
-int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
+static int __mem_cgroup_charge_gen(struct folio *folio, struct mm_struct *mm,
+				   gfp_t gfp_mask, bool cache_charge)
 {
 	struct mem_cgroup *memcg;
 	int ret;
 
+	if (mem_cgroup_disabled())
+		return 0;
+
 	memcg = get_mem_cgroup_from_mm(mm);
-	ret = charge_memcg(folio, memcg, gfp);
+	ret = charge_memcg(folio, memcg, gfp_mask, cache_charge);
 	css_put(&memcg->css);
 
 	return ret;
 }
 
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
+{
+	return __mem_cgroup_charge_gen(folio, mm, gfp, false);
+}
+
+int mem_cgroup_charge_cache(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
+{
+	return __mem_cgroup_charge_gen(folio, mm, gfp, true);
+}
+
 /**
  * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
  * @page: page to charge
@@ -7347,7 +7474,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
 		memcg = get_mem_cgroup_from_mm(mm);
 	rcu_read_unlock();
 
-	ret = charge_memcg(folio, memcg, gfp);
+	ret = charge_memcg(folio, memcg, gfp, false);
 
 	css_put(&memcg->css);
 	return ret;
@@ -7391,6 +7518,7 @@ struct uncharge_gather {
 	unsigned long nr_memory;
 	unsigned long pgpgout;
 	unsigned long nr_kmem;
+	unsigned long nr_pgcache;
 	int nid;
 };
 
@@ -7409,6 +7537,9 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
 		if (ug->nr_kmem)
 			memcg_account_kmem(ug->memcg, -ug->nr_kmem);
+		if (ug->nr_pgcache)
+			page_counter_uncharge(&ug->memcg->cache, ug->nr_pgcache);
+
 		memcg_oom_recover(ug->memcg);
 	}
 
@@ -7470,6 +7601,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 		folio->memcg_data = 0;
 		obj_cgroup_put(objcg);
 	} else {
+		if (folio_memcg_cache(folio))
+			ug->nr_pgcache += nr_pages;
 		/* LRU pages aren't accounted at the root level */
 		if (!mem_cgroup_is_root(memcg))
 			ug->nr_memory += nr_pages;
@@ -7553,6 +7686,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 		page_counter_charge(&memcg->memsw, nr_pages);
 	}
 
+	WARN_ON((!PageAnon(&new->page) && !PageSwapBacked(&new->page)) |
+		folio_memcg_cache(new));
+
+	if (folio_memcg_cache(new))
+		page_counter_charge(&memcg->cache, nr_pages);
+
 	css_get(&memcg->css);
 	commit_charge(new, memcg);
 
@@ -7621,7 +7760,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
 		return false;
 	}
 
-	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+	if (try_charge(memcg, gfp_mask, nr_pages, false) == 0) {
 		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 		return true;
 	}
@@ -7643,7 +7782,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 
 	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
 
-	refill_stock(memcg, nr_pages);
+	refill_stock(memcg, nr_pages, false);
 }
 
 static int __init cgroup_memory(char *s)
-- 
2.31.1

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel