On 01/09/2022 19:35, Suren Baghdasaryan wrote:
> call_rcu() can take a long time when callback offloading is enabled.
> Its use in the vm_area_free can cause regressions in the exit path when
> multiple VMAs are being freed. To minimize that impact, place VMAs into
> a list and free them in groups using one call_rcu() call per group.
> 
> Signed-off-by: Suren Baghdasaryan <sur...@google.com>
> ---
>  include/linux/mm.h       |  1 +
>  include/linux/mm_types.h | 11 ++++++-
>  kernel/fork.c            | 68 +++++++++++++++++++++++++++++++++++-----
>  mm/init-mm.c             |  3 ++
>  mm/mmap.c                |  1 +
>  5 files changed, 75 insertions(+), 9 deletions(-)
> 
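
For anyone following the thread without the rest of the series: the point
of the change is to turn one call_rcu() per VMA into a single call_rcu()
per batch. A minimal userspace model of that pattern, with a synchronous
stub standing in for call_rcu() and purely illustrative names, looks
roughly like this:

/*
 * Userspace model of the batching pattern described above, not of the
 * patch itself: objects are parked on a list and handed off in one
 * deferred-free call per batch. free_batch() is a synchronous stand-in
 * for call_rcu(); every name here is illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

#define BATCH_MAX 32		/* analogue of VM_AREA_FREE_LIST_MAX */

struct obj {
	int id;
	struct obj *next;	/* singly linked is enough for the model */
};

static struct obj *batch;
static int batch_size;

/* Stand-in for the RCU callback: one invocation frees a whole batch. */
static void free_batch(struct obj *head)
{
	while (head) {
		struct obj *next = head->next;

		free(head);
		head = next;
	}
}

/* Analogue of drain_free_vmas(): detach the list, free it as a unit. */
static void drain(void)
{
	struct obj *head = batch;

	batch = NULL;
	batch_size = 0;
	free_batch(head);	/* the patch defers this via call_rcu() */
}

/* Analogue of vm_area_free(): park the object, drain past threshold. */
static void obj_free(struct obj *o)
{
	o->next = batch;
	batch = o;
	if (++batch_size > BATCH_MAX)
		drain();	/* one deferred call covers the batch */
}

int main(void)
{
	for (int i = 0; i < 100; i++) {
		struct obj *o = malloc(sizeof(*o));

		o->id = i;
		obj_free(o);
	}
	drain();	/* exit_mmap() analogue flushes the final batch */
	return 0;
}

The patch does the same thing with a doubly-linked list under
mm->vma_free_list.lock, as below.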
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a3cbaa7b9119..81dff694ac14 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -249,6 +249,7 @@ void setup_initial_init_mm(void *start_code, void *end_code,
>  struct vm_area_struct *vm_area_alloc(struct mm_struct *);
>  struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
>  void vm_area_free(struct vm_area_struct *);
> +void drain_free_vmas(struct mm_struct *mm);
>  
>  #ifndef CONFIG_MMU
>  extern struct rb_root nommu_region_tree;
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 36562e702baf..6f3effc493b1 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -412,7 +412,11 @@ struct vm_area_struct {
>                       struct vm_area_struct *vm_next, *vm_prev;
>               };
>  #ifdef CONFIG_PER_VMA_LOCK
> -             struct rcu_head vm_rcu; /* Used for deferred freeing. */
> +             struct {
> +                     struct list_head vm_free_list;
> +                     /* Used for deferred freeing. */
> +                     struct rcu_head vm_rcu;
> +             };
>  #endif
>       };
>  
> @@ -573,6 +577,11 @@ struct mm_struct {
>                                         */
>  #ifdef CONFIG_PER_VMA_LOCK
>               int mm_lock_seq;
> +             struct {
> +                     struct list_head head;
> +                     spinlock_t lock;
> +                     int size;
> +             } vma_free_list;
>  #endif
>  
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b443ba3a247a..7c88710aed72 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -483,26 +483,75 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  }
>  
>  #ifdef CONFIG_PER_VMA_LOCK
> -static void __vm_area_free(struct rcu_head *head)
> +static inline void __vm_area_free(struct vm_area_struct *vma)
>  {
> -     struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
> -                                               vm_rcu);
>       /* The vma should either have no lock holders or be write-locked. */
>       vma_assert_no_reader(vma);
>       kmem_cache_free(vm_area_cachep, vma);
>  }
> -#endif
> +
> +static void vma_free_rcu_callback(struct rcu_head *head)
> +{
> +     struct vm_area_struct *first_vma;
> +     struct vm_area_struct *vma, *vma2;
> +
> +     first_vma = container_of(head, struct vm_area_struct, vm_rcu);
> +     list_for_each_entry_safe(vma, vma2, &first_vma->vm_free_list, vm_free_list)

Is it safe to walk the list here against concurrent calls to
list_splice_init() or list_add()?
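
To make the handoff concrete, here is the list manipulation as I read it,
modeled standalone in userspace (hand-rolled helpers mirroring
include/linux/list.h, with the mm, the spinlock and RCU all elided;
whether the walk is actually race-free still comes down to every
list_add() and list_splice_init() running under mm->vma_free_list.lock):

/*
 * Standalone model of the handoff between drain_free_vmas() and
 * vma_free_rcu_callback(). vma_entry() stands in for container_of().
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* Move all entries of @list to just after @head and reinit @list. */
static void list_splice_init(struct list_head *list, struct list_head *head)
{
	if (!list_empty(list)) {
		struct list_head *first = list->next, *last = list->prev;

		first->prev = head;
		last->next = head->next;
		head->next->prev = last;
		head->next = first;
		list_init(list);
	}
}

struct vma { int id; struct list_head vm_free_list; };

#define vma_entry(p) \
	((struct vma *)((char *)(p) - offsetof(struct vma, vm_free_list)))

int main(void)
{
	struct list_head shared;	/* mm->vma_free_list.head */
	struct list_head to_destroy;	/* stack head in drain_free_vmas() */
	struct vma v[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	struct vma *first;
	struct list_head *p, *n;

	list_init(&shared);
	list_init(&to_destroy);
	for (int i = 0; i < 3; i++)	/* three vm_area_free() calls */
		list_add(&v[i].vm_free_list, &shared);

	/* drain_free_vmas(), normally under mm->vma_free_list.lock: */
	list_splice_init(&shared, &to_destroy);
	first = vma_entry(to_destroy.next);
	list_del(&to_destroy);	/* drop the stack head from the circle */

	/*
	 * The batch is now a detached circle reachable only through
	 * first->vm_free_list; later list_add() calls go to &shared and
	 * cannot touch it. The walk mirrors list_for_each_entry_safe()
	 * because the real callback frees each entry as it goes.
	 */
	for (p = first->vm_free_list.next, n = p->next;
	     p != &first->vm_free_list; p = n, n = p->next)
		printf("callback frees vma %d\n", vma_entry(p)->id);
	printf("callback frees first vma %d\n", first->id);

	return 0;
}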

> +             __vm_area_free(vma);
> +     __vm_area_free(first_vma);
> +}
> +
> +void drain_free_vmas(struct mm_struct *mm)
> +{
> +     struct vm_area_struct *first_vma;
> +     LIST_HEAD(to_destroy);
> +
> +     spin_lock(&mm->vma_free_list.lock);
> +     list_splice_init(&mm->vma_free_list.head, &to_destroy);
> +     mm->vma_free_list.size = 0;
> +     spin_unlock(&mm->vma_free_list.lock);
> +
> +     if (list_empty(&to_destroy))
> +             return;
> +
> +     first_vma = list_first_entry(&to_destroy, struct vm_area_struct, vm_free_list);
> +     /* Remove the head which is allocated on the stack */
> +     list_del(&to_destroy);
> +
> +     call_rcu(&first_vma->vm_rcu, vma_free_rcu_callback);
> +}
> +
> +#define VM_AREA_FREE_LIST_MAX        32
> +
> +void vm_area_free(struct vm_area_struct *vma)
> +{
> +     struct mm_struct *mm = vma->vm_mm;
> +     bool drain;
> +
> +     free_anon_vma_name(vma);
> +
> +     spin_lock(&mm->vma_free_list.lock);
> +     list_add(&vma->vm_free_list, &mm->vma_free_list.head);
> +     mm->vma_free_list.size++;
> +     drain = mm->vma_free_list.size > VM_AREA_FREE_LIST_MAX;
> +     spin_unlock(&mm->vma_free_list.lock);
> +
> +     if (drain)
> +             drain_free_vmas(mm);
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +void drain_free_vmas(struct mm_struct *mm) {}
>  
>  void vm_area_free(struct vm_area_struct *vma)
>  {
>       free_anon_vma_name(vma);
> -#ifdef CONFIG_PER_VMA_LOCK
> -     call_rcu(&vma->vm_rcu, __vm_area_free);
> -#else
>       kmem_cache_free(vm_area_cachep, vma);
> -#endif
>  }
>  
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
>  static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
>       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> @@ -1137,6 +1186,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>       INIT_LIST_HEAD(&mm->mmlist);
>  #ifdef CONFIG_PER_VMA_LOCK
>       WRITE_ONCE(mm->mm_lock_seq, 0);
> +     INIT_LIST_HEAD(&mm->vma_free_list.head);
> +     spin_lock_init(&mm->vma_free_list.lock);
> +     mm->vma_free_list.size = 0;
>  #endif
>       mm_pgtables_bytes_init(mm);
>       mm->map_count = 0;
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 8399f90d631c..7b6d2460545f 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -39,6 +39,9 @@ struct mm_struct init_mm = {
>       .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
>  #ifdef CONFIG_PER_VMA_LOCK
>       .mm_lock_seq    = 0,
> +     .vma_free_list.head = LIST_HEAD_INIT(init_mm.vma_free_list.head),
> +     .vma_free_list.lock = __SPIN_LOCK_UNLOCKED(init_mm.vma_free_list.lock),
> +     .vma_free_list.size = 0,
>  #endif
>       .user_ns        = &init_user_ns,
>       .cpu_bitmap     = CPU_BITS_NONE,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 1edfcd384f5e..d61b7ef84ba6 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3149,6 +3149,7 @@ void exit_mmap(struct mm_struct *mm)
>       }
>       mm->mmap = NULL;
>       mmap_write_unlock(mm);
> +     drain_free_vmas(mm);
>       vm_unacct_memory(nr_accounted);
>  }
>  
