Hello,

On Wed, Apr 07, 2021 at 11:26:18AM -0700, Roman Gushchin wrote:
> This patch implements partial depopulation of percpu chunks.
> 
> As of now, a chunk can be depopulated only as a part of the final
> destruction, if there are no more outstanding allocations. However,
> to minimize memory waste it might be useful to depopulate a
> partially filled chunk, if a small number of outstanding allocations
> prevents the chunk from being fully reclaimed.
> 
> This patch implements the following depopulation process: it scans
> over the chunk pages, looks for a range of empty and populated pages
> and performs the depopulation. To avoid races with new allocations,
> the chunk is isolated beforehand. After the depopulation the chunk is
> sidelined to a special list or freed. New allocations can't be served
> using a sidelined chunk. The chunk can be moved back to a corresponding
> slot if there are not enough chunks with empty populated pages.
> 
> The depopulation is scheduled on the free path. A chunk is a good
> target for depopulation if it:
>   1) has more than 1/4 of its total pages free and populated,
>   2) leaves the system with enough free percpu pages aside from it,
>   3) isn't the reserved chunk,
>   4) isn't the first chunk,
>   5) isn't entirely free.
> If a chunk is already depopulated but got free populated pages, it's
> a good target too.
> The chunk is moved to a special pcpu_depopulate_list, the
> chunk->isolated flag is set and the async balancing is scheduled.
> 
> The async balancing moves pcpu_depopulate_list to a local list
> (because pcpu_depopulate_list can be changed when pcpu_lock is
> released), and then tries to depopulate each chunk.  The depopulation
> is performed in the reverse direction to keep populated pages close to
> the beginning, and stops once the global number of empty populated
> pages reaches the threshold.
> Depopulated chunks are sidelined to prevent further allocations.
> Skipped and fully empty chunks are returned to the corresponding slot.
> 
> On the allocation path, if there are no suitable chunks found,
> the list of sidelined chunks is scanned prior to creating a new chunk.
> If there is a good sidelined chunk, it's placed back to the slot
> and the scanning is restarted.
> 
> Many thanks to Dennis Zhou for his great ideas and a very constructive
> discussion which led to many improvements in this patchset!
> 
> Signed-off-by: Roman Gushchin <g...@fb.com>
> ---
>  mm/percpu-internal.h |   2 +
>  mm/percpu.c          | 164 ++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 164 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
> index 095d7eaa0db4..8e432663c41e 100644
> --- a/mm/percpu-internal.h
> +++ b/mm/percpu-internal.h
> @@ -67,6 +67,8 @@ struct pcpu_chunk {
>  
>       void                    *data;          /* chunk data */
>       bool                    immutable;      /* no [de]population allowed */
> +     bool                    isolated;       /* isolated from chunk slot lists */
> +     bool                    depopulated;    /* sidelined after depopulation */
>       int                     start_offset;   /* the overlap with the previous
>                                                  region to have a page aligned
>                                                  base_addr */
> diff --git a/mm/percpu.c b/mm/percpu.c
> index e20119668c42..0a5a5e84e0a4 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -181,6 +181,19 @@ static LIST_HEAD(pcpu_map_extend_chunks);
>   */
>  int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
>  
> +/*
> + * List of chunks with a lot of free pages.  Used to depopulate them
> + * asynchronously.
> + */
> +static struct list_head pcpu_depopulate_list[PCPU_NR_CHUNK_TYPES];
> +
> +/*
> + * List of previously depopulated chunks.  They are not usually used for new
> + * allocations, but can be returned back to service if a need arises.
> + */
> +static struct list_head pcpu_sideline_list[PCPU_NR_CHUNK_TYPES];
> +
> +
>  /*
>   * The number of populated pages in use by the allocator, protected by
>   * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
> @@ -542,6 +555,12 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
>  {
>       int nslot = pcpu_chunk_slot(chunk);
>  
> +     /*
> +      * Keep isolated and depopulated chunks on a sideline.
> +      */
> +     if (chunk->isolated || chunk->depopulated)
> +             return;
> +
>       if (oslot != nslot)
>               __pcpu_chunk_move(chunk, nslot, oslot < nslot);
>  }
> @@ -1778,6 +1797,25 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
>               }
>       }
>  
> +     /* search through sidelined depopulated chunks */
> +     list_for_each_entry(chunk, &pcpu_sideline_list[type], list) {
> +             struct pcpu_block_md *chunk_md = &chunk->chunk_md;
> +             int bit_off;
> +
> +             /*
> +              * If the allocation can fit in the chunk's contig hint,
> +              * place the chunk back into corresponding slot and restart
> +              * the scanning.
> +              */
> +             bit_off = ALIGN(chunk_md->contig_hint_start, align) -
> +                     chunk_md->contig_hint_start;
> +             if (bit_off + bits > chunk_md->contig_hint) {
> +                     chunk->depopulated = false;
> +                     pcpu_chunk_relocate(chunk, -1);
> +                     goto restart;
> +             }

This check should be bit_off + bits <= chunk_md->contig_hint.
Can you please factor that out into a helper:

static bool pcpu_check_chunk_hint(struct pcpu_block_md *chunk_md, int bits,
				  size_t align)
{
	int bit_off = ALIGN(chunk_md->contig_hint_start, align) -
		      chunk_md->contig_hint_start;

	return bit_off + bits <= chunk_md->contig_hint;
}

Then your use case can just call pcpu_check_chunk_hint() and the other
user, pcpu_find_block_fit(), can use !pcpu_check_chunk_hint().
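
For example, the existing contig hint fast path at the top of
pcpu_find_block_fit() could then look something like this (untested,
just to illustrate the reuse):

	/* sketch: same contig hint check as today, but via the helper */
	if (!pcpu_check_chunk_hint(chunk_md, alloc_bits, align))
		return -1;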

> +     }
> +
>       spin_unlock_irqrestore(&pcpu_lock, flags);
>  
>       /*
> @@ -2048,6 +2086,106 @@ static void pcpu_grow_populated(enum pcpu_chunk_type type, int nr_to_pop)
>       }
>  }
>  
> +/**
> + * pcpu_shrink_populated - scan chunks and release unused pages to the system
> + * @type: chunk type
> + *
> + * Scan over chunks in the depopulate list, try to release unused populated
> + * pages to the system.  Depopulated chunks are sidelined to prevent further
> + * allocations without a need.  Skipped and fully free chunks are returned
> + * to corresponding slots.  Stop depopulating if the number of empty populated
> + * pages reaches the threshold.  Each chunk is scanned in the reverse order to
> + * keep populated pages close to the beginning of the chunk.
> + */
> +static void pcpu_shrink_populated(enum pcpu_chunk_type type)
> +{
> +     struct pcpu_block_md *block;
> +     struct pcpu_chunk *chunk, *tmp;
> +     LIST_HEAD(to_depopulate);
> +     bool depopulated;
> +     int i, end;
> +
> +     spin_lock_irq(&pcpu_lock);
> +
> +     list_splice_init(&pcpu_depopulate_list[type], &to_depopulate);
> +
> +     list_for_each_entry_safe(chunk, tmp, &to_depopulate, list) {
> +             WARN_ON(chunk->immutable);
> +             depopulated = false;
> +
> +             /*
> +              * Scan chunk's pages in the reverse order to keep populated
> +              * pages close to the beginning of the chunk.
> +              */
> +             for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
> +                     /*
> +                      * If the chunk has no empty pages or
> +                      * we're short on empty pages in general,
> +                      * just put the chunk back into the original slot.
> +                      */
> +                     if (!chunk->nr_empty_pop_pages ||
> +                         pcpu_nr_empty_pop_pages[type] <=
> +                         PCPU_EMPTY_POP_PAGES_HIGH)
> +                             break;
> +
> +                     /*
> +                      * If the page is empty and populated, start or
> +                      * extend the (i, end) range.  If i == 0, decrease
> +                      * i and perform the depopulation to cover the last
> +                      * (first) page in the chunk.
> +                      */
> +                     block = chunk->md_blocks + i;
> +                     if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
> +                         test_bit(i, chunk->populated)) {
> +                             if (end == -1)
> +                                     end = i;
> +                             if (i > 0)
> +                                     continue;
> +                             i--;
> +                     }
> +
> +                     /*
> +                      * Otherwise check if there is an active range,
> +                      * and if yes, depopulate it.
> +                      */
> +                     if (end == -1)
> +                             continue;
> +
> +                     depopulated = true;
> +
> +                     spin_unlock_irq(&pcpu_lock);
> +                     pcpu_depopulate_chunk(chunk, i + 1, end + 1);
> +                     cond_resched();
> +                     spin_lock_irq(&pcpu_lock);
> +
> +                     pcpu_chunk_depopulated(chunk, i + 1, end + 1);
> +
> +                     /*
> +                      * Reset the range and continue.
> +                      */
> +                     end = -1;
> +             }
> +
> +             chunk->isolated = false;
> +             if (chunk->free_bytes == pcpu_unit_size || !depopulated) {
> +                     /*
> +                      * If the chunk is empty or hasn't been depopulated,
> +                      * return it to the original slot.
> +                      */
> +                     pcpu_chunk_relocate(chunk, -1);
> +             } else {
> +                     /*
> +                      * Otherwise put the chunk to the list of depopulated
> +                      * chunks.
> +                      */
> +                     chunk->depopulated = true;
> +                     list_move(&chunk->list, &pcpu_sideline_list[type]);
> +             }
> +     }
> +
> +     spin_unlock_irq(&pcpu_lock);
> +}
> +
>  /**
>   * pcpu_balance_populated - manage the amount of populated pages
>   * @type: chunk type
> @@ -2078,6 +2216,8 @@ static void pcpu_balance_populated(enum pcpu_chunk_type type)
>       } else if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_HIGH) {
>               nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages[type];
>               pcpu_grow_populated(type, nr_to_pop);
> +     } else if (!list_empty(&pcpu_depopulate_list[type])) {
> +             pcpu_shrink_populated(type);
>       }
>  }
>  
> @@ -2135,7 +2275,13 @@ void free_percpu(void __percpu *ptr)
>  
>       pcpu_memcg_free_hook(chunk, off, size);
>  
> -     /* if there are more than one fully free chunks, wake up grim reaper */
> +     /*
> +      * If there are more than one fully free chunks, wake up grim reaper.
> +      * Otherwise if at least 1/4 of its pages are empty and there is no
> +      * system-wide shortage of empty pages aside from this chunk, isolate
> +      * the chunk and schedule an async depopulation.  If the chunk was
> +      * depopulated previously and got free pages, depopulate it too.
> +      */
>       if (chunk->free_bytes == pcpu_unit_size) {
>               struct pcpu_chunk *pos;
>  
> @@ -2144,6 +2290,16 @@ void free_percpu(void __percpu *ptr)
>                               need_balance = true;
>                               break;
>                       }
> +     } else if (chunk != pcpu_first_chunk && chunk != pcpu_reserved_chunk &&
> +                !chunk->isolated &&
> +                pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] >
> +                PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages &&

nit: can you add parentheses around this condition?
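
Something along these lines, just to show the grouping I mean (the
logic is unchanged):

	} else if (chunk != pcpu_first_chunk && chunk != pcpu_reserved_chunk &&
		   !chunk->isolated &&
		   (pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] >
		    PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
		   ((chunk->depopulated && chunk->nr_empty_pop_pages) ||
		    (chunk->nr_empty_pop_pages >= chunk->nr_pages / 4))) {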

> +                ((chunk->depopulated && chunk->nr_empty_pop_pages) ||
> +                 (chunk->nr_empty_pop_pages >= chunk->nr_pages / 4))) {
> +             list_move(&chunk->list, &pcpu_depopulate_list[pcpu_chunk_type(chunk)]);
> +             chunk->isolated = true;
> +             chunk->depopulated = false;
> +             need_balance = true;
>       }
>  
>       trace_percpu_free_percpu(chunk->base_addr, off, ptr);
> @@ -2571,10 +2727,14 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
>                     pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
>                     PCPU_NR_CHUNK_TYPES);
>  
> -     for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
> +     for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
>               for (i = 0; i < pcpu_nr_slots; i++)
>                       INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
>  
> +             INIT_LIST_HEAD(&pcpu_depopulate_list[type]);
> +             INIT_LIST_HEAD(&pcpu_sideline_list[type]);
> +     }
> +
>       /*
>        * The end of the static region needs to be aligned with the
>        * minimum allocation size as this offsets the reserved and
> -- 
> 2.30.2
> 

Thanks,
Dennis
