On Wed, Nov 11, 2020 at 03:40:06PM -0500, Zi Yan wrote:
> From: Zi Yan <z...@nvidia.com>
> 
> To split a THP to any lower order pages, we need to reform THPs on
> subpages at given order and add page refcount based on the new page
> order. Also we need to reinitialize page_deferred_list after removing
> the page from the split_queue, otherwise a subsequent split will see
> list corruption when checking the page_deferred_list again.
> 
> It has many uses, like minimizing the number of pages after
> truncating a pagecache THP. For anonymous THPs, we can only split them
> to order-0 like before until we add support for any size anonymous THPs.
> 
> Signed-off-by: Zi Yan <z...@nvidia.com>
> ---
>  include/linux/huge_mm.h |  8 +++++
>  mm/huge_memory.c        | 78 +++++++++++++++++++++++++++++------------
>  mm/swap.c               |  1 -
>  3 files changed, 63 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 60a907a19f7d..9819cd9b4619 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -189,6 +189,8 @@ bool is_transparent_hugepage(struct page *page);
>  
>  bool can_split_huge_page(struct page *page, int *pextra_pins);
>  int split_huge_page_to_list(struct page *page, struct list_head *list);
> +int split_huge_page_to_list_to_order(struct page *page, struct list_head 
> *list,
> +             unsigned int new_order);
>  static inline int split_huge_page(struct page *page)
>  {
>       return split_huge_page_to_list(page, NULL);
> @@ -396,6 +398,12 @@ split_huge_page_to_list(struct page *page, struct 
> list_head *list)
>  {
>       return 0;
>  }
> +static inline int
> +split_huge_page_to_order_to_list(struct page *page, struct list_head *list,
> +             unsigned int new_order)

It was
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order);
above.


> +{
> +     return 0;
> +}
>  static inline int split_huge_page(struct page *page)
>  {
>       return 0;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8b7d771ee962..88f50da40c9b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2327,11 +2327,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
>  static void unmap_page(struct page *page)
>  {
>       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
> -             TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
> +             TTU_RMAP_LOCKED;
>       bool unmap_success;
>  
>       VM_BUG_ON_PAGE(!PageHead(page), page);
>  
> +     if (thp_order(page) >= HPAGE_PMD_ORDER)
> +             ttu_flags |= TTU_SPLIT_HUGE_PMD;
> +
>       if (PageAnon(page))
>               ttu_flags |= TTU_SPLIT_FREEZE;
>  
> @@ -2339,21 +2342,22 @@ static void unmap_page(struct page *page)
>       VM_BUG_ON_PAGE(!unmap_success, page);
>  }
>  
> -static void remap_page(struct page *page, unsigned int nr)
> +static void remap_page(struct page *page, unsigned int nr, unsigned int 
> new_nr)
>  {
>       int i;
> -     if (PageTransHuge(page)) {
> +     if (thp_nr_pages(page) == nr) {
>               remove_migration_ptes(page, page, true);
>       } else {
> -             for (i = 0; i < nr; i++)
> +             for (i = 0; i < nr; i += new_nr)
>                       remove_migration_ptes(page + i, page + i, true);
>       }
>  }
>  
>  static void __split_huge_page_tail(struct page *head, int tail,
> -             struct lruvec *lruvec, struct list_head *list)
> +             struct lruvec *lruvec, struct list_head *list, unsigned int 
> new_order)
>  {
>       struct page *page_tail = head + tail;
> +     unsigned long compound_head_flag = new_order ? (1L << PG_head) : 0;
>  
>       VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
>  
> @@ -2377,6 +2381,7 @@ static void __split_huge_page_tail(struct page *head, 
> int tail,
>  #ifdef CONFIG_64BIT
>                        (1L << PG_arch_2) |
>  #endif
> +                      compound_head_flag |
>                        (1L << PG_dirty)));
>  
>       /* ->mapping in first tail page is compound_mapcount */
> @@ -2395,10 +2400,15 @@ static void __split_huge_page_tail(struct page *head, 
> int tail,
>        * which needs correct compound_head().
>        */
>       clear_compound_head(page_tail);
> +     if (new_order) {
> +             prep_compound_page(page_tail, new_order);
> +             thp_prep(page_tail);
> +     }
>  
>       /* Finally unfreeze refcount. Additional reference from page cache. */
> -     page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
> -                                       PageSwapCache(head)));
> +     page_ref_unfreeze(page_tail, 1 + ((!PageAnon(head) ||
> +                                        PageSwapCache(head)) ?
> +                                             thp_nr_pages(page_tail) : 0));
>  
>       if (page_is_young(head))
>               set_page_young(page_tail);
> @@ -2416,7 +2426,7 @@ static void __split_huge_page_tail(struct page *head, 
> int tail,
>  }
>  
>  static void __split_huge_page(struct page *page, struct list_head *list,
> -             pgoff_t end, unsigned long flags)
> +             pgoff_t end, unsigned long flags, unsigned int new_order)
>  {
>       struct page *head = compound_head(page);
>       pg_data_t *pgdat = page_pgdat(head);
> @@ -2424,12 +2434,13 @@ static void __split_huge_page(struct page *page, 
> struct list_head *list,
>       struct address_space *swap_cache = NULL;
>       unsigned long offset = 0;
>       unsigned int nr = thp_nr_pages(head);
> +     unsigned int new_nr = 1 << new_order;
>       int i;
>  
>       lruvec = mem_cgroup_page_lruvec(head, pgdat);
>  
>       /* complete memcg works before add pages to LRU */
> -     mem_cgroup_split_huge_fixup(head, 1);
> +     mem_cgroup_split_huge_fixup(head, new_nr);
>  
>       if (PageAnon(head) && PageSwapCache(head)) {
>               swp_entry_t entry = { .val = page_private(head) };
> @@ -2439,14 +2450,14 @@ static void __split_huge_page(struct page *page, 
> struct list_head *list,
>               xa_lock(&swap_cache->i_pages);
>       }
>  
> -     for (i = nr - 1; i >= 1; i--) {
> -             __split_huge_page_tail(head, i, lruvec, list);
> +     for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
> +             __split_huge_page_tail(head, i, lruvec, list, new_order);
>               /* Some pages can be beyond i_size: drop them from page cache */
>               if (head[i].index >= end) {
>                       ClearPageDirty(head + i);
>                       __delete_from_page_cache(head + i, NULL);
>                       if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
> -                             shmem_uncharge(head->mapping->host, 1);
> +                             shmem_uncharge(head->mapping->host, new_nr);
>                       put_page(head + i);
>               } else if (!PageAnon(page)) {
>                       __xa_store(&head->mapping->i_pages, head[i].index,
> @@ -2457,28 +2468,31 @@ static void __split_huge_page(struct page *page, 
> struct list_head *list,
>               }
>       }
>  
> -     ClearPageCompound(head);
> +     if (!new_order)
> +             ClearPageCompound(head);
> +     else
> +             set_compound_order(head, new_order);
>  
> -     split_page_owner(head, nr, 1);
> +     split_page_owner(head, nr, new_nr);
>  
>       /* See comment in __split_huge_page_tail() */
>       if (PageAnon(head)) {
>               /* Additional pin to swap cache */
>               if (PageSwapCache(head)) {
> -                     page_ref_add(head, 2);
> +                     page_ref_add(head, 1 + new_nr);
>                       xa_unlock(&swap_cache->i_pages);
>               } else {
>                       page_ref_inc(head);
>               }
>       } else {
>               /* Additional pin to page cache */
> -             page_ref_add(head, 2);
> +             page_ref_add(head, 1 + new_nr);
>               xa_unlock(&head->mapping->i_pages);
>       }
>  
>       spin_unlock_irqrestore(&pgdat->lru_lock, flags);
>  
> -     remap_page(head, nr);
> +     remap_page(head, nr, new_nr);
>  
>       if (PageSwapCache(head)) {
>               swp_entry_t entry = { .val = page_private(head) };
> @@ -2486,7 +2500,7 @@ static void __split_huge_page(struct page *page, struct 
> list_head *list,
>               split_swap_cluster(entry);
>       }
>  
> -     for (i = 0; i < nr; i++) {
> +     for (i = 0; i < nr; i += new_nr) {
>               struct page *subpage = head + i;
>               if (subpage == page)
>                       continue;
> @@ -2620,21 +2634,39 @@ bool can_split_huge_page(struct page *page, int 
> *pextra_pins)
>   * us.
>   */
>  int split_huge_page_to_list(struct page *page, struct list_head *list)
> +{
> +     return split_huge_page_to_list_to_order(page, list, 0);
> +}
> +
> +int split_huge_page_to_list_to_order(struct page *page, struct list_head 
> *list,
> +                                  unsigned int new_order)
>  {
>       struct page *head = compound_head(page);
>       struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
>       struct deferred_split *ds_queue = get_deferred_split_queue(head);
> -     XA_STATE(xas, &head->mapping->i_pages, head->index);
> +     /* reset xarray order to new order after split */
> +     XA_STATE_ORDER(xas, &head->mapping->i_pages, head->index, new_order);
>       struct anon_vma *anon_vma = NULL;
>       struct address_space *mapping = NULL;
>       int count, mapcount, extra_pins, ret;
>       unsigned long flags;
>       pgoff_t end;
>  
> +     VM_BUG_ON(thp_order(head) <= new_order);
>       VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
>       VM_BUG_ON_PAGE(!PageLocked(head), head);
>       VM_BUG_ON_PAGE(!PageCompound(head), head);
>  
> +     if (new_order == 1) {
> +             WARN_ONCE(1, "Cannot split THP to order-1 (no order-1 THPs)");
> +             return -EINVAL;
> +     }
> +
> +     if (PageAnon(head) && new_order) {
> +             WARN_ONCE(1, "Split anonymous THP to non-zero order not 
> support");
> +             return -EINVAL;
> +     }

I'd convert those into VM_BUG_ON()'s. Unlikely they will be hit at arbitrary 
moments
by random users.

Thanks!

Reply via email to