To return unused memory to the system, schedule an async
depopulation of percpu chunks.

To balance between scanning too much (and creating overhead due to
pcpu_lock contention) and scanning not enough, let's track the number
of chunks to scan and mark chunks that are potentially good targets
for depopulation with a new boolean flag.  The async depopulation
work clears the flag after trying to depopulate a chunk, whether it
succeeded or not.
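
Setting and clearing the flag are always paired with the counter
update, so pcpu_nr_chunks_to_depopulate matches the number of flagged
chunks at any time.  A minimal sketch of that pairing (a userspace
model, not the kernel code; the struct is reduced to the one relevant
field):

  #include <stdbool.h>

  struct chunk {
          bool depopulate;        /* depopulation hint */
  };

  static int nr_chunks_to_depopulate;

  static void mark_chunk(struct chunk *c)
  {
          if (!c->depopulate) {
                  c->depopulate = true;
                  nr_chunks_to_depopulate++;
          }
  }

  /* Called by the async worker whether depopulation succeeded or not. */
  static void unmark_chunk(struct chunk *c)
  {
          if (c->depopulate) {
                  c->depopulate = false;
                  nr_chunks_to_depopulate--;
          }
  }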

This commit suggests the following logic: a chunk is a good target
for depopulation if it
  1) has more than 1/4 of its total pages free and populated,
  2) isn't the reserved chunk,
  3) isn't entirely free,
  4) isn't alone in the corresponding slot.

If there are two or more such chunks, an async depopulation
is scheduled.
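
The heuristic boils down to a predicate along these lines (a minimal
userspace sketch; field names are illustrative and do not match the
actual struct pcpu_chunk layout):

  #include <stdbool.h>

  /* Illustrative stand-in for the fields the heuristic reads. */
  struct chunk {
          int nr_pages;            /* total pages in the chunk */
          int nr_empty_pop_pages;  /* pages both free and populated */
          bool reserved;           /* the special reserved chunk */
          bool entirely_free;      /* every page unallocated */
          int chunks_in_slot;      /* chunks sharing the size slot */
  };

  static bool good_depopulation_target(const struct chunk *c)
  {
          return c->nr_empty_pop_pages > c->nr_pages / 4 &&  /* 1) */
                 !c->reserved &&                             /* 2) */
                 !c->entirely_free &&                        /* 3) */
                 c->chunks_in_slot > 1;                      /* 4) */
  }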

Because chunk population and depopulation are opposite processes
which make little sense together, split out the populating part of
pcpu_balance_populated() into pcpu_grow_populated() and make
pcpu_balance_populated() call into pcpu_grow_populated() or
pcpu_shrink_populated() conditionally.
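
Condensed, the resulting dispatch looks like this (summarized from
the new pcpu_balance_populated() in the diff below; not standalone
code):

  static void pcpu_balance_populated(enum pcpu_chunk_type type)
  {
          if (pcpu_atomic_alloc_failed) {
                  /* best effort, populate the maximum amount */
                  pcpu_atomic_alloc_failed = false;
                  pcpu_grow_populated(type, PCPU_EMPTY_POP_PAGES_HIGH);
          } else if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
                  pcpu_grow_populated(type, PCPU_EMPTY_POP_PAGES_HIGH -
                                            pcpu_nr_empty_pop_pages);
          } else if (pcpu_nr_chunks_to_depopulate > 0) {
                  pcpu_shrink_populated(type);
          }
  }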

Signed-off-by: Roman Gushchin <g...@fb.com>
---
 mm/percpu-internal.h |   1 +
 mm/percpu.c          | 111 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 18b768ac7dca..1c5b92af02eb 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,7 @@ struct pcpu_chunk {
 
        void                    *data;          /* chunk data */
        bool                    immutable;      /* no [de]population allowed */
+       bool                    depopulate;     /* depopulation hint */
        int                     start_offset;   /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
diff --git a/mm/percpu.c b/mm/percpu.c
index 015d076893f5..148137f0fc0b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -178,6 +178,12 @@ static LIST_HEAD(pcpu_map_extend_chunks);
  */
 int pcpu_nr_empty_pop_pages;
 
+/*
+ * Track the number of chunks with a lot of free memory.
+ * It's used to release unused pages to the system.
+ */
+static int pcpu_nr_chunks_to_depopulate;
+
 /*
  * The number of populated pages in use by the allocator, protected by
  * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
@@ -1955,6 +1961,11 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;
 
+               if (chunk->depopulate) {
+                       chunk->depopulate = false;
+                       pcpu_nr_chunks_to_depopulate--;
+               }
+
                list_move(&chunk->list, &to_free);
        }
 
@@ -1976,7 +1987,7 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 }
 
 /**
- * pcpu_balance_populated - manage the amount of populated pages
+ * pcpu_grow_populated - populate chunk(s) to satisfy atomic allocations
  * @type: chunk type
  *
  * Maintain a certain amount of populated pages to satisfy atomic allocations.
@@ -1985,35 +1996,15 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
  * allocation causes the failure as it is possible that requests can be
  * serviced from already backed regions.
  */
-static void pcpu_balance_populated(enum pcpu_chunk_type type)
+static void pcpu_grow_populated(enum pcpu_chunk_type type, int nr_to_pop)
 {
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        struct list_head *pcpu_slot = pcpu_chunk_list(type);
        struct pcpu_chunk *chunk;
-       int slot, nr_to_pop, ret;
+       int slot, ret;
 
-       /*
-        * Ensure there are certain number of free populated pages for
-        * atomic allocs.  Fill up from the most packed so that atomic
-        * allocs don't increase fragmentation.  If atomic allocation
-        * failed previously, always populate the maximum amount.  This
-        * should prevent atomic allocs larger than PAGE_SIZE from keeping
-        * failing indefinitely; however, large atomic allocs are not
-        * something we support properly and can be highly unreliable and
-        * inefficient.
-        */
 retry_pop:
-       if (pcpu_atomic_alloc_failed) {
-               nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
-               /* best effort anyway, don't worry about synchronization */
-               pcpu_atomic_alloc_failed = false;
-       } else {
-               nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
-                                 pcpu_nr_empty_pop_pages,
-                                 0, PCPU_EMPTY_POP_PAGES_HIGH);
-       }
-
        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
                unsigned int nr_unpop = 0, rs, re;
 
@@ -2084,9 +2075,18 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                        bool isolated = false;
 
-                       if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH)
+                       if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH ||
+                           pcpu_nr_chunks_to_depopulate < 1)
                                break;
 
+                       /*
+                        * Don't try to depopulate a chunk again and again.
+                        */
+                       if (!chunk->depopulate)
+                               continue;
+                       chunk->depopulate = false;
+                       pcpu_nr_chunks_to_depopulate--;
+
                        for (i = 0, start = -1; i < chunk->nr_pages; i++) {
                                if (!chunk->nr_empty_pop_pages)
                                        break;
@@ -2153,6 +2153,41 @@ static void pcpu_shrink_populated(enum pcpu_chunk_type type)
        spin_unlock_irq(&pcpu_lock);
 }
 
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ * @type: chunk type
+ *
+ * Populate or depopulate chunks to maintain a certain amount
+ * of free pages to satisfy atomic allocations, but not waste
+ * large amounts of memory.
+ */
+static void pcpu_balance_populated(enum pcpu_chunk_type type)
+{
+       int nr_to_pop;
+
+       /*
+        * Ensure there are certain number of free populated pages for
+        * atomic allocs.  Fill up from the most packed so that atomic
+        * allocs don't increase fragmentation.  If atomic allocation
+        * failed previously, always populate the maximum amount.  This
+        * should prevent atomic allocs larger than PAGE_SIZE from keeping
+        * failing indefinitely; however, large atomic allocs are not
+        * something we support properly and can be highly unreliable and
+        * inefficient.
+        */
+       if (pcpu_atomic_alloc_failed) {
+               nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+               /* best effort anyway, don't worry about synchronization */
+               pcpu_atomic_alloc_failed = false;
+               pcpu_grow_populated(type, nr_to_pop);
+       } else if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+               nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages;
+               pcpu_grow_populated(type, nr_to_pop);
+       } else if (pcpu_nr_chunks_to_depopulate > 0) {
+               pcpu_shrink_populated(type);
+       }
+}
+
 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
@@ -2188,6 +2223,7 @@ void free_percpu(void __percpu *ptr)
        int size, off;
        bool need_balance = false;
        struct list_head *pcpu_slot;
+       struct pcpu_chunk *pos;
 
        if (!ptr)
                return;
@@ -2207,15 +2243,36 @@ void free_percpu(void __percpu *ptr)
 
        pcpu_memcg_free_hook(chunk, off, size);
 
-       /* if there are more than one fully free chunks, wake up grim reaper */
        if (chunk->free_bytes == pcpu_unit_size) {
-               struct pcpu_chunk *pos;
-
+               /*
+                * If there are more than one fully free chunks,
+                * wake up grim reaper.
+                */
                list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
                        if (pos != chunk) {
                                need_balance = true;
                                break;
                        }
+
+       } else if (chunk->nr_empty_pop_pages > chunk->nr_pages / 4) {
+               /*
+                * If there is more than one chunk in the slot and
+                * at least 1/4 of its pages are empty, mark the chunk
+                * as a target for the depopulation. If there is more
+                * than one chunk like this, schedule an async balancing.
+                */
+               int nslot = pcpu_chunk_slot(chunk);
+
+               list_for_each_entry(pos, &pcpu_slot[nslot], list)
+                       if (pos != chunk && !chunk->depopulate &&
+                           !chunk->immutable) {
+                               chunk->depopulate = true;
+                               pcpu_nr_chunks_to_depopulate++;
+                               break;
+                       }
+
+               if (pcpu_nr_chunks_to_depopulate > 1)
+                       need_balance = true;
        }
 
        trace_percpu_free_percpu(chunk->base_addr, off, ptr);
-- 
2.30.2
