On Tue, May 12, 2026 at 05:06:48PM -0400, Michael S. Tsirkin wrote:
> When a guest reports free pages to the hypervisor via the page reporting
> framework (used by virtio-balloon and hv_balloon), the host typically
> zeros those pages when reclaiming their backing memory.  However, when
> those pages are later allocated in the guest, post_alloc_hook()
> unconditionally zeros them again if __GFP_ZERO is set.  This
> double-zeroing is wasteful, especially for large pages.
> 
> Avoid redundant zeroing:
> 
> - Add a host_zeroes_pages flag to page_reporting_dev_info, allowing
>   drivers to declare that their host zeros reported pages on reclaim.
>   A static key (page_reporting_host_zeroes) gates the fast path.
> 
> - Add PG_zeroed page flag (sharing PG_private bit) to mark pages
>   that have been zeroed by the host.  Set it in
>   page_reporting_drain() after the host reports them.
> 
> - Thread the zeroed bool through rmqueue -> prep_new_page ->
>   post_alloc_hook, where it skips redundant zeroing for __GFP_ZERO
>   allocations.
> 
> No driver sets host_zeroes_pages yet; a follow-up patch to
> virtio_balloon is needed to opt in.
> 
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> Assisted-by: Claude:claude-opus-4-6
> Assisted-by: cursor-agent:GPT-5.4-xhigh
> ---
>  include/linux/page-flags.h     |  9 +++++
>  include/linux/page_reporting.h |  3 ++
>  mm/compaction.c                |  6 ++--
>  mm/internal.h                  |  2 +-
>  mm/page_alloc.c                | 66 +++++++++++++++++++++++-----------
>  mm/page_reporting.c            | 14 +++++++-
>  mm/page_reporting.h            | 12 +++++++
>  7 files changed, 87 insertions(+), 25 deletions(-)
> 

Similar question to prior comment - we're adding plumbing in this patch
specifically to handle the zeroed page flag.

Should we instead just take a snapshot of the entire page flags state
and plumb that all the way through?

I imagine future post-alloc ops may also want to know about the
buddy-internal state of the page, so it seems useful/extensible.

David and others may have a differing opinion on whether that should
wait for another user.

~Gregory

> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 0e03d816e8b9..4ee64134acc3 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -135,6 +135,8 @@ enum pageflags {
>       PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
>       /* Some filesystems */
>       PG_checked = PG_owner_priv_1,
> +     /* Page contents are known to be zero */
> +     PG_zeroed = PG_private,
>  
>       /*
>        * Depending on the way an anonymous folio can be mapped into a page
> @@ -673,6 +675,13 @@ FOLIO_TEST_CLEAR_FLAG_FALSE(young)
>  FOLIO_FLAG_FALSE(idle)
>  #endif
>  
> +/*
> + * PageZeroed() tracks pages known to be zero.  The allocator
> + * uses this to skip redundant zeroing in post_alloc_hook().
> + */
> +__PAGEFLAG(Zeroed, zeroed, PF_NO_COMPOUND)
> +#define __PG_ZEROED (1UL << PG_zeroed)
> +
>  /*
>   * PageReported() is used to track reported free pages within the Buddy
>   * allocator. We can use the non-atomic version of the test and set
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index 5ab5be02fa15..c331c6b36687 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -14,6 +14,9 @@ struct page_reporting_dev_info {
>       int (*report)(struct page_reporting_dev_info *prdev,
>                     struct scatterlist *sg, unsigned int nents);
>  
> +     /* If true, host zeros reported pages on reclaim */
> +     bool host_zeroes_pages;
> +
>       /* work struct for processing reports */
>       struct delayed_work work;
>  
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 4336e433c99b..8000fc5e0a2e 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -82,7 +82,8 @@ static inline bool is_via_compact_memory(int order) { 
> return false; }
>  
>  static struct page *mark_allocated_noprof(struct page *page, unsigned int 
> order, gfp_t gfp_flags)
>  {
> -     post_alloc_hook(page, order, __GFP_MOVABLE, USER_ADDR_NONE);
> +     __ClearPageZeroed(page);
> +     post_alloc_hook(page, order, __GFP_MOVABLE, false, USER_ADDR_NONE);
>       set_page_refcounted(page);
>       return page;
>  }
> @@ -1849,9 +1850,10 @@ static struct folio *compaction_alloc_noprof(struct 
> folio *src, unsigned long da
>               set_page_private(&freepage[size], start_order);
>       }
>       dst = (struct folio *)freepage;
> +     __ClearPageZeroed(&dst->page);
>       if (order)
>               prep_compound_page(&dst->page, order);
> -     post_alloc_hook(&dst->page, order, __GFP_MOVABLE, USER_ADDR_NONE);
> +     post_alloc_hook(&dst->page, order, __GFP_MOVABLE, false, 
> USER_ADDR_NONE);
>       set_page_refcounted(&dst->page);
>       cc->nr_freepages -= 1 << order;
>       cc->nr_migratepages -= 1 << order;
> diff --git a/mm/internal.h b/mm/internal.h
> index 389098200aa6..fd910743ddc3 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -928,7 +928,7 @@ static inline void init_compound_tail(struct page *tail,
>  }
>  
>  void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags,
> -                  unsigned long user_addr);
> +                  bool zeroed, unsigned long user_addr);
>  extern bool free_pages_prepare(struct page *page, unsigned int order);
>  
>  extern int user_min_free_kbytes;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 76f39dd026ff..bd3b909cacdf 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1743,6 +1743,7 @@ static __always_inline void page_del_and_expand(struct 
> zone *zone,
>       bool was_reported = page_reported(page);
>  
>       __del_page_from_free_list(page, zone, high, migratetype);
> +
>       nr_pages -= expand(zone, page, low, high, migratetype, was_reported);
>       account_freepages(zone, -nr_pages, migratetype);
>  }
> @@ -1815,8 +1816,10 @@ static inline bool should_skip_init(gfp_t flags)
>       return (flags & __GFP_SKIP_ZERO);
>  }
>  
> +
>  inline void post_alloc_hook(struct page *page, unsigned int order,
> -                             gfp_t gfp_flags, unsigned long user_addr)
> +                             gfp_t gfp_flags, bool zeroed,
> +                             unsigned long user_addr)
>  {
>       bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
>                       !should_skip_init(gfp_flags);
> @@ -1825,6 +1828,14 @@ inline void post_alloc_hook(struct page *page, 
> unsigned int order,
>  
>       set_page_private(page, 0);
>  
> +     /*
> +      * If the page is zeroed, skip memory initialization.
> +      * We still need to handle tag zeroing separately since the host
> +      * does not know about memory tags.
> +      */
> +     if (zeroed && init && !zero_tags)
> +             init = false;
> +
>       arch_alloc_page(page, order);
>       debug_pagealloc_map_pages(page, 1 << order);
>  
> @@ -1882,13 +1893,13 @@ inline void post_alloc_hook(struct page *page, 
> unsigned int order,
>  }
>  
>  static void prep_new_page(struct page *page, unsigned int order, gfp_t 
> gfp_flags,
> -                                                     unsigned int 
> alloc_flags,
> -                                                     unsigned long user_addr)
> +                       unsigned int alloc_flags, bool zeroed,
> +                       unsigned long user_addr)
>  {
>       if (order && (gfp_flags & __GFP_COMP))
>               prep_compound_page(page, order);
>  
> -     post_alloc_hook(page, order, gfp_flags, user_addr);
> +     post_alloc_hook(page, order, gfp_flags, zeroed, user_addr);
>  
>       /*
>        * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
> @@ -3154,6 +3165,7 @@ int __isolate_free_page(struct page *page, unsigned int 
> order)
>       }
>  
>       del_page_from_free_list(page, zone, order, mt);
> +     __ClearPageZeroed(page);
>  
>       /*
>        * Set the pageblock if the isolated page is at least half of a
> @@ -3226,7 +3238,7 @@ static inline void zone_statistics(struct zone 
> *preferred_zone, struct zone *z,
>  static __always_inline
>  struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
>                          unsigned int order, unsigned int alloc_flags,
> -                        int migratetype)
> +                        int migratetype, bool *zeroed)
>  {
>       struct page *page;
>       unsigned long flags;
> @@ -3261,6 +3273,8 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, 
> struct zone *zone,
>                       }
>               }
>               spin_unlock_irqrestore(&zone->lock, flags);
> +             *zeroed = PageZeroed(page);
> +             __ClearPageZeroed(page);
>       } while (check_new_pages(page, order));
>  
>       /*
> @@ -3329,10 +3343,9 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, 
> struct zone *zone, int order)
>  /* Remove page from the per-cpu list, caller must protect the list */
>  static inline
>  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
> -                     int migratetype,
> -                     unsigned int alloc_flags,
> +                     int migratetype, unsigned int alloc_flags,
>                       struct per_cpu_pages *pcp,
> -                     struct list_head *list)
> +                     struct list_head *list, bool *zeroed)
>  {
>       struct page *page;
>  
> @@ -3367,6 +3380,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, 
> unsigned int order,
>               page = list_first_entry(list, struct page, pcp_list);
>               list_del(&page->pcp_list);
>               pcp->count -= 1 << order;
> +             *zeroed = PageZeroed(page);
> +             __ClearPageZeroed(page);
>       } while (check_new_pages(page, order));
>  
>       return page;
> @@ -3375,7 +3390,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, 
> unsigned int order,
>  /* Lock and remove page from the per-cpu list */
>  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
>                       struct zone *zone, unsigned int order,
> -                     int migratetype, unsigned int alloc_flags)
> +                     int migratetype, unsigned int alloc_flags,
> +                     bool *zeroed)
>  {
>       struct per_cpu_pages *pcp;
>       struct list_head *list;
> @@ -3393,7 +3409,8 @@ static struct page *rmqueue_pcplist(struct zone 
> *preferred_zone,
>        */
>       pcp->free_count >>= 1;
>       list = &pcp->lists[order_to_pindex(migratetype, order)];
> -     page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, 
> list);
> +     page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags,
> +                              pcp, list, zeroed);
>       pcp_spin_unlock(pcp);
>       if (page) {
>               __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
> @@ -3418,19 +3435,19 @@ static inline
>  struct page *rmqueue(struct zone *preferred_zone,
>                       struct zone *zone, unsigned int order,
>                       gfp_t gfp_flags, unsigned int alloc_flags,
> -                     int migratetype)
> +                     int migratetype, bool *zeroed)
>  {
>       struct page *page;
>  
>       if (likely(pcp_allowed_order(order))) {
>               page = rmqueue_pcplist(preferred_zone, zone, order,
> -                                    migratetype, alloc_flags);
> +                                    migratetype, alloc_flags, zeroed);
>               if (likely(page))
>                       goto out;
>       }
>  
>       page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
> -                                                     migratetype);
> +                          migratetype, zeroed);
>  
>  out:
>       /* Separate test+clear to avoid unnecessary atomics */
> @@ -3821,6 +3838,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int 
> order, int alloc_flags,
>       struct pglist_data *last_pgdat = NULL;
>       bool last_pgdat_dirty_ok = false;
>       bool no_fallback;
> +     bool zeroed;
>       bool skip_kswapd_nodes = nr_online_nodes > 1;
>       bool skipped_kswapd_nodes = false;
>  
> @@ -3965,10 +3983,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int 
> order, int alloc_flags,
>  
>  try_this_zone:
>               page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, 
> order,
> -                             gfp_mask, alloc_flags, ac->migratetype);
> +                                     gfp_mask, alloc_flags, ac->migratetype,
> +                                     &zeroed);
>               if (page) {
>                       prep_new_page(page, order, gfp_mask, alloc_flags,
> -                                   ac->user_addr);
> +                                   zeroed, ac->user_addr);
>  
>                       return page;
>               } else {
> @@ -4195,9 +4214,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned 
> int order,
>       count_vm_event(COMPACTSTALL);
>  
>       /* Prep a captured page if available */
> -     if (page)
> -             prep_new_page(page, order, gfp_mask, alloc_flags,
> +     if (page) {
> +             __ClearPageZeroed(page);
> +             prep_new_page(page, order, gfp_mask, alloc_flags, false,
>                             ac->user_addr);
> +     }
>  
>       /* Try get a page from the freelist if available */
>       if (!page)
> @@ -5170,6 +5191,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int 
> preferred_nid,
>       /* Attempt the batch allocation */
>       pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
>       while (nr_populated < nr_pages) {
> +             bool zeroed = false;
>  
>               /* Skip existing pages */
>               if (page_array[nr_populated]) {
> @@ -5178,7 +5200,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int 
> preferred_nid,
>               }
>  
>               page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
> -                                                             pcp, pcp_list);
> +                                      pcp, pcp_list, &zeroed);
>               if (unlikely(!page)) {
>                       /* Try and allocate at least one page */
>                       if (!nr_account) {
> @@ -5189,7 +5211,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int 
> preferred_nid,
>               }
>               nr_account++;
>  
> -             prep_new_page(page, 0, gfp, 0, USER_ADDR_NONE);
> +             prep_new_page(page, 0, gfp, 0, zeroed, USER_ADDR_NONE);
>               set_page_refcounted(page);
>               page_array[nr_populated++] = page;
>       }
> @@ -6929,7 +6951,8 @@ static void split_free_frozen_pages(struct list_head 
> *list, gfp_t gfp_mask)
>               list_for_each_entry_safe(page, next, &list[order], lru) {
>                       int i;
>  
> -                     post_alloc_hook(page, order, gfp_mask, USER_ADDR_NONE);
> +                     __ClearPageZeroed(page);
> +                     post_alloc_hook(page, order, gfp_mask, false, 
> USER_ADDR_NONE);
>                       if (!order)
>                               continue;
>  
> @@ -7134,8 +7157,9 @@ int alloc_contig_frozen_range_noprof(unsigned long 
> start, unsigned long end,
>       } else if (start == outer_start && end == outer_end && 
> is_power_of_2(end - start)) {
>               struct page *head = pfn_to_page(start);
>  
> +             __ClearPageZeroed(head);
>               check_new_pages(head, order);
> -             prep_new_page(head, order, gfp_mask, 0, USER_ADDR_NONE);
> +             prep_new_page(head, order, gfp_mask, 0, false, USER_ADDR_NONE);
>       } else {
>               ret = -EINVAL;
>               WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, 
> %lu)\n",
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 006f7cdddc18..37e4fce9eb38 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -50,6 +50,8 @@ EXPORT_SYMBOL_GPL(page_reporting_order);
>  #define PAGE_REPORTING_DELAY (2 * HZ)
>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>  
> +DEFINE_STATIC_KEY_FALSE(page_reporting_host_zeroes);
> +
>  enum {
>       PAGE_REPORTING_IDLE = 0,
>       PAGE_REPORTING_REQUESTED,
> @@ -129,8 +131,11 @@ page_reporting_drain(struct page_reporting_dev_info 
> *prdev,
>                * report on the new larger page when we make our way
>                * up to that higher order.
>                */
> -             if (PageBuddy(page) && buddy_order(page) == order)
> +             if (PageBuddy(page) && buddy_order(page) == order) {
>                       __SetPageReported(page);
> +                     if (page_reporting_host_zeroes_pages())
> +                             __SetPageZeroed(page);
> +             }
>       } while ((sg = sg_next(sg)));
>  
>       /* reinitialize scatterlist now that it is empty */
> @@ -391,6 +396,10 @@ int page_reporting_register(struct 
> page_reporting_dev_info *prdev)
>       /* Assign device to allow notifications */
>       rcu_assign_pointer(pr_dev_info, prdev);
>  
> +     /* enable zeroed page optimization if host zeroes reported pages */
> +     if (prdev->host_zeroes_pages)
> +             static_branch_enable(&page_reporting_host_zeroes);
> +
>       /* enable page reporting notification */
>       if (!static_key_enabled(&page_reporting_enabled)) {
>               static_branch_enable(&page_reporting_enabled);
> @@ -415,6 +424,9 @@ void page_reporting_unregister(struct 
> page_reporting_dev_info *prdev)
>  
>               /* Flush any existing work, and lock it out */
>               cancel_delayed_work_sync(&prdev->work);
> +
> +             if (prdev->host_zeroes_pages)
> +                     static_branch_disable(&page_reporting_host_zeroes);
>       }
>  
>       mutex_unlock(&page_reporting_mutex);
> diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> index c51dbc228b94..736ea7b37e9e 100644
> --- a/mm/page_reporting.h
> +++ b/mm/page_reporting.h
> @@ -15,6 +15,13 @@ DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
>  extern unsigned int page_reporting_order;
>  void __page_reporting_notify(void);
>  
> +DECLARE_STATIC_KEY_FALSE(page_reporting_host_zeroes);
> +
> +static inline bool page_reporting_host_zeroes_pages(void)
> +{
> +     return static_branch_unlikely(&page_reporting_host_zeroes);
> +}
> +
>  static inline bool page_reported(struct page *page)
>  {
>       return static_branch_unlikely(&page_reporting_enabled) &&
> @@ -46,6 +53,11 @@ static inline void page_reporting_notify_free(unsigned int 
> order)
>  #else /* CONFIG_PAGE_REPORTING */
>  #define page_reported(_page) false
>  
> +static inline bool page_reporting_host_zeroes_pages(void)
> +{
> +     return false;
> +}
> +
>  static inline void page_reporting_notify_free(unsigned int order)
>  {
>  }
> -- 
> MST
> 

Reply via email to