On 02.09.25 06:06, Dave Airlie wrote:
> From: Dave Airlie <airl...@redhat.com>
>
> This enables all the backend code to use the list lru in memcg mode,
> and set the shrinker to be memcg aware.
>
> It adds the loop case for when pooled pages end up being reparented
> to a higher memcg group, that newer memcg can search for them there
> and take them back.
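
If I read the hunks below correctly, the lookup described above boils down
to roughly this control flow (just a sketch for the discussion; the types
and pool_lru_get_page() are the ones from the patch, the wrapper name is
made up, and it relies on the usual <linux/memcontrol.h> helpers):

static struct page *take_page_with_parent_fallback(struct ttm_pool_type *pt,
						   int nid,
						   struct obj_cgroup *objcg)
{
	struct mem_cgroup *orig = objcg ? get_mem_cgroup_from_objcg(objcg) : NULL;
	struct mem_cgroup *memcg = orig;
	struct page *p = NULL;

	while (!p) {
		/* look into the list_lru bucket of this memcg (NULL means the root bucket) */
		if (pool_lru_get_page(pt, nid, &p, objcg, memcg) == 1)
			break;
		if (!memcg)
			break;
		/* nothing cached at this level, retry with the parent cgroup's bucket */
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			break;
	}

	/* drop the reference taken by get_mem_cgroup_from_objcg(), NULL-safe */
	mem_cgroup_put(orig);
	return p;
}

In other words a cgroup only ever finds pooled pages in its own bucket or
in the buckets of its ancestors, never in those of a sibling cgroup.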
I can only repeat that, as far as I can see, this makes no sense at all.

This just enables stealing pages from the page pool per cgroup and won't
give them back if another cgroup runs into a low memory situation.

Maybe Thomas and the XE guys have a use case for that, but as far as I can
see that behavior is not something we would ever want.

Regards,
Christian.

>
> Signed-off-by: Dave Airlie <airl...@redhat.com>
>
> ---
> v2: just use the proper stats.
> ---
>  drivers/gpu/drm/ttm/ttm_pool.c | 127 ++++++++++++++++++++++++++-------
>  mm/list_lru.c                  |   1 +
>  2 files changed, 104 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> index 2c9969de7517..1e6da2cc1f06 100644
> --- a/drivers/gpu/drm/ttm/ttm_pool.c
> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> @@ -142,7 +142,9 @@ static int ttm_pool_nid(struct ttm_pool *pool) {
>  }
>
>  /* Allocate pages of size 1 << order with the given gfp_flags */
> -static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> +static struct page *ttm_pool_alloc_page(struct ttm_pool *pool,
> +					struct obj_cgroup *objcg,
> +					gfp_t gfp_flags,
>  					unsigned int order)
>  {
>  	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> @@ -162,7 +164,10 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
>  	p = alloc_pages_node(pool->nid, gfp_flags, order);
>  	if (p) {
>  		p->private = order;
> -		mod_lruvec_page_state(p, NR_GPU_ACTIVE, 1 << order);
> +		if (!mem_cgroup_charge_gpu_page(objcg, p, order, gfp_flags, false)) {
> +			__free_pages(p, order);
> +			return NULL;
> +		}
>  	}
>  	return p;
>  }
> @@ -213,8 +218,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
>  #endif
>
>  	if (!pool || !pool->use_dma_alloc) {
> -		mod_lruvec_page_state(p, reclaim ? NR_GPU_RECLAIM : NR_GPU_ACTIVE,
> -				      -(1 << order));
> +		mem_cgroup_uncharge_gpu_page(p, order, reclaim);
>  		__free_pages(p, order);
>  		return;
>  	}
> @@ -301,12 +305,11 @@ static void ttm_pool_type_give(struct ttm_pool_type *pt, struct page *p)
>
>  	INIT_LIST_HEAD(&p->lru);
>  	rcu_read_lock();
> -	list_lru_add(&pt->pages, &p->lru, nid, NULL);
> +	list_lru_add(&pt->pages, &p->lru, nid, page_memcg_check(p));
>  	rcu_read_unlock();
>
> -	atomic_long_add(num_pages, &allocated_pages[nid]);
> -	mod_lruvec_page_state(p, NR_GPU_ACTIVE, -num_pages);
> -	mod_lruvec_page_state(p, NR_GPU_RECLAIM, num_pages);
> +	atomic_long_add(num_pages, &allocated_pages[nid]);
> +	mem_cgroup_move_gpu_page_reclaim(NULL, p, pt->order, true);
>  }
>
>  static enum lru_status take_one_from_lru(struct list_head *item,
> @@ -321,20 +324,56 @@ static enum lru_status take_one_from_lru(struct list_head *item,
>  	return LRU_REMOVED;
>  }
>
> -/* Take pages from a specific pool_type, return NULL when nothing available */
> -static struct page *ttm_pool_type_take(struct ttm_pool_type *pt, int nid)
> +static int pool_lru_get_page(struct ttm_pool_type *pt, int nid,
> +			     struct page **page_out,
> +			     struct obj_cgroup *objcg,
> +			     struct mem_cgroup *memcg)
>  {
>  	int ret;
>  	struct page *p = NULL;
>  	unsigned long nr_to_walk = 1;
> +	unsigned int num_pages = 1 << pt->order;
>
> -	ret = list_lru_walk_node(&pt->pages, nid, take_one_from_lru, (void *)&p, &nr_to_walk);
> +	ret = list_lru_walk_one(&pt->pages, nid, memcg, take_one_from_lru, (void *)&p, &nr_to_walk);
>  	if (ret == 1 && p) {
> -		atomic_long_sub(1 << pt->order, &allocated_pages[nid]);
> -		mod_lruvec_page_state(p, NR_GPU_ACTIVE, (1 << pt->order));
> -		mod_lruvec_page_state(p, NR_GPU_RECLAIM, -(1 << pt->order));
>
> +		atomic_long_sub(num_pages, &allocated_pages[nid]);
> +
> +		if (!mem_cgroup_move_gpu_page_reclaim(objcg, p, pt->order, false)) {
> +			__free_pages(p, pt->order);
> +			p = NULL;
> +		}
>  	}
> -	return p;
> +	*page_out = p;
> +	return ret;
> +}
> +
> +/* Take pages from a specific pool_type, return NULL when nothing available */
> +static struct page *ttm_pool_type_take(struct ttm_pool_type *pt, int nid,
> +				       struct obj_cgroup *orig_objcg)
> +{
> +	struct page *page_out = NULL;
> +	int ret;
> +	struct mem_cgroup *orig_memcg = orig_objcg ? get_mem_cgroup_from_objcg(orig_objcg) : NULL;
> +	struct mem_cgroup *memcg = orig_memcg;
> +
> +	/*
> +	 * Attempt to get a page from the current memcg, but if it hasn't got any in it's level,
> +	 * go up to the parent and check there. This helps the scenario where multiple apps get
> +	 * started into their own cgroup from a common parent and want to reuse the pools.
> +	 */
> +	while (!page_out) {
> +		ret = pool_lru_get_page(pt, nid, &page_out, orig_objcg, memcg);
> +		if (ret == 1)
> +			break;
> +		if (!memcg)
> +			break;
> +		memcg = parent_mem_cgroup(memcg);
> +		if (!memcg)
> +			break;
> +	}
> +
> +	mem_cgroup_put(orig_memcg);
> +	return page_out;
>  }
>
>  /* Initialize and add a pool type to the global shrinker list */
> @@ -344,7 +383,7 @@ static void ttm_pool_type_init(struct ttm_pool_type *pt, struct ttm_pool *pool,
>  	pt->pool = pool;
>  	pt->caching = caching;
>  	pt->order = order;
> -	list_lru_init(&pt->pages);
> +	list_lru_init_memcg(&pt->pages, mm_shrinker);
>
>  	spin_lock(&shrinker_lock);
>  	list_add_tail(&pt->shrinker_list, &shrinker_list);
> @@ -387,6 +426,30 @@ static void ttm_pool_type_fini(struct ttm_pool_type *pt)
>  	ttm_pool_dispose_list(pt, &dispose);
>  }
>
> +static int ttm_pool_check_objcg(struct obj_cgroup *objcg)
> +{
> +#ifdef CONFIG_MEMCG
> +	int r = 0;
> +	struct mem_cgroup *memcg;
> +	if (!objcg)
> +		return 0;
> +
> +	memcg = get_mem_cgroup_from_objcg(objcg);
> +	for (unsigned i = 0; i < NR_PAGE_ORDERS; i++) {
> +		r = memcg_list_lru_alloc(memcg, &global_write_combined[i].pages, GFP_KERNEL);
> +		if (r) {
> +			break;
> +		}
> +		r = memcg_list_lru_alloc(memcg, &global_uncached[i].pages, GFP_KERNEL);
> +		if (r) {
> +			break;
> +		}
> +	}
> +	mem_cgroup_put(memcg);
> +#endif
> +	return 0;
> +}
> +
>  /* Return the pool_type to use for the given caching and order */
>  static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
>  						  enum ttm_caching caching,
> @@ -416,7 +479,9 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
>  }
>
>  /* Free pages using the per-node shrinker list */
> -static unsigned int ttm_pool_shrink(int nid, unsigned long num_to_free)
> +static unsigned int ttm_pool_shrink(int nid,
> +				    struct mem_cgroup *memcg,
> +				    unsigned long num_to_free)
>  {
>  	LIST_HEAD(dispose);
>  	struct ttm_pool_type *pt;
> @@ -428,7 +493,11 @@ static unsigned int ttm_pool_shrink(int nid, unsigned long num_to_free)
>  	list_move_tail(&pt->shrinker_list, &shrinker_list);
>  	spin_unlock(&shrinker_lock);
>
> -	num_pages = list_lru_walk_node(&pt->pages, nid, pool_move_to_dispose_list, &dispose, &num_to_free);
> +	if (!memcg) {
> +		num_pages = list_lru_walk_node(&pt->pages, nid, pool_move_to_dispose_list, &dispose, &num_to_free);
> +	} else {
> +		num_pages = list_lru_walk_one(&pt->pages, nid, memcg, pool_move_to_dispose_list, &dispose, &num_to_free);
> +	}
>  	num_pages *= 1 << pt->order;
>
>  	ttm_pool_dispose_list(pt, &dispose);
> @@ -593,6 +662,7 @@ static int ttm_pool_restore_commit(struct ttm_pool_tt_restore *restore,
>  			 */
>  			ttm_pool_split_for_swap(restore->pool, p);
>  			copy_highpage(restore->alloced_page + i, p);
> +			p->memcg_data = 0;
>  			__free_pages(p, 0);
>  		}
>
> @@ -754,6 +824,7 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  	bool allow_pools;
>  	struct page *p;
>  	int r;
> +	struct obj_cgroup *objcg = memcg_account ? tt->objcg : NULL;
>
>  	WARN_ON(!alloc->remaining_pages || ttm_tt_is_populated(tt));
>  	WARN_ON(alloc->dma_addr && !pool->dev);
> @@ -771,6 +842,9 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>
>  	page_caching = tt->caching;
>  	allow_pools = true;
> +
> +	ttm_pool_check_objcg(objcg);
> +
>  	for (order = ttm_pool_alloc_find_order(MAX_PAGE_ORDER, alloc);
>  	     alloc->remaining_pages;
>  	     order = ttm_pool_alloc_find_order(order, alloc)) {
> @@ -780,7 +854,7 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>
>  		p = NULL;
>  		pt = ttm_pool_select_type(pool, page_caching, order);
>  		if (pt && allow_pools)
> -			p = ttm_pool_type_take(pt, ttm_pool_nid(pool));
> +			p = ttm_pool_type_take(pt, ttm_pool_nid(pool), objcg);
>
>  		/*
>  		 * If that fails or previously failed, allocate from system.
> @@ -791,7 +865,7 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  		if (!p) {
>  			page_caching = ttm_cached;
>  			allow_pools = false;
> -			p = ttm_pool_alloc_page(pool, gfp_flags, order);
> +			p = ttm_pool_alloc_page(pool, objcg, gfp_flags, order);
>  		}
>  		/* If that fails, lower the order if possible and retry. */
>  		if (!p) {
> @@ -935,7 +1009,7 @@ void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt)
>
>  	while (atomic_long_read(&allocated_pages[nid]) > pool_node_limit[nid]) {
>  		unsigned long diff = pool_node_limit[nid] - atomic_long_read(&allocated_pages[nid]);
> -		ttm_pool_shrink(nid, diff);
> +		ttm_pool_shrink(nid, NULL, diff);
>  	}
>  }
>  EXPORT_SYMBOL(ttm_pool_free);
> @@ -1055,6 +1129,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt,
>  		if (flags->purge) {
>  			shrunken += num_pages;
>  			page->private = 0;
> +			page->memcg_data = 0;
>  			__free_pages(page, order);
>  			memset(tt->pages + i, 0,
>  			       num_pages * sizeof(*tt->pages));
> @@ -1191,10 +1266,14 @@ static unsigned long ttm_pool_shrinker_scan(struct shrinker *shrink,
>  					    struct shrink_control *sc)
>  {
>  	unsigned long num_freed = 0;
> +	int num_pools;
> +	spin_lock(&shrinker_lock);
> +	num_pools = list_count_nodes(&shrinker_list);
> +	spin_unlock(&shrinker_lock);
>
>  	do
> -		num_freed += ttm_pool_shrink(sc->nid, sc->nr_to_scan);
> -	while (num_freed < sc->nr_to_scan &&
> +		num_freed += ttm_pool_shrink(sc->nid, sc->memcg, sc->nr_to_scan);
> +	while (num_pools-- >= 0 && num_freed < sc->nr_to_scan &&
>  	       atomic_long_read(&allocated_pages[sc->nid]));
>
>  	sc->nr_scanned = num_freed;
> @@ -1381,7 +1460,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
>  	spin_lock_init(&shrinker_lock);
>  	INIT_LIST_HEAD(&shrinker_list);
>
> -	mm_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "drm-ttm_pool");
> +	mm_shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE, "drm-ttm_pool");
>  	if (!mm_shrinker)
>  		return -ENOMEM;
>
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 627589d75320..6a277f479dc3 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -562,6 +562,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
>
>  	return xas_error(&xas);
>  }
> +EXPORT_SYMBOL_GPL(memcg_list_lru_alloc);
>  #else
>  static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
>  {