On 03/12/2018 01:00 AM, David Rientjes wrote:
> Kswapd will not wakeup if per-zone watermarks are not failing or if too
> many previous attempts at background reclaim have failed.
> 
> This can be true if there is a lot of free memory available.  For high-
> order allocations, kswapd is responsible for waking up kcompactd for
> background compaction.  If the zone is now below its watermarks or
                                         ^^^ should this be "not"?

> reclaim has recently failed (lots of free memory, nothing left to
> reclaim), kcompactd does not get woken up.
> 
> When __GFP_DIRECT_RECLAIM is not allowed, allow kcompactd to still be
> woken up even if kswapd will not reclaim.  This allows high-order
> allocations, such as thp, to still trigger background compaction even
> when the zone has an abundance of free memory.
> 
> Signed-off-by: David Rientjes <rient...@google.com>

Acked-by: Vlastimil Babka <vba...@suse.cz>

> ---
>  .../postprocess/trace-vmscan-postprocess.pl   |  4 +--
>  include/linux/mmzone.h                        |  3 +-
>  include/trace/events/vmscan.h                 | 17 ++++++----
>  mm/page_alloc.c                               | 14 ++++----
>  mm/vmscan.c                                   | 32 +++++++++++++------
>  5 files changed, 45 insertions(+), 25 deletions(-)
> 
> diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
> @@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flag
>  my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
>  my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
>  my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
> -my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
> +my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
>  my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
>  my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
>  my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
> @@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceevent_regex(
>  $regex_wakeup_kswapd = generate_traceevent_regex(
>                       "vmscan/mm_vmscan_wakeup_kswapd",
>                       $regex_wakeup_kswapd_default,
> -                     "nid", "zid", "order");
> +                     "nid", "zid", "order", "gfp_flags");
>  $regex_lru_isolate = generate_traceevent_regex(
>                       "vmscan/mm_vmscan_lru_isolate",
>                       $regex_lru_isolate_default,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -775,7 +775,8 @@ static inline bool is_dev_zone(const struct zone *zone)
>  #include <linux/memory_hotplug.h>
>  
>  void build_all_zonelists(pg_data_t *pgdat);
> -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
> +void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
> +                enum zone_type classzone_idx);
>  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>                        int classzone_idx, unsigned int alloc_flags,
>                        long free_pages);
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
>  
>  TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>  
> -     TP_PROTO(int nid, int zid, int order),
> +     TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
>  
> -     TP_ARGS(nid, zid, order),
> +     TP_ARGS(nid, zid, order, gfp_flags),
>  
>       TP_STRUCT__entry(
> -             __field(        int,            nid     )
> -             __field(        int,            zid     )
> -             __field(        int,            order   )
> +             __field(        int,    nid             )
> +             __field(        int,    zid             )
> +             __field(        int,    order           )
> +             __field(        gfp_t,  gfp_flags       )
>       ),
>  
>       TP_fast_assign(
>               __entry->nid            = nid;
>               __entry->zid            = zid;
>               __entry->order          = order;
> +             __entry->gfp_flags      = gfp_flags;
>       ),
>  
> -     TP_printk("nid=%d zid=%d order=%d",
> +     TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
>               __entry->nid,
>               __entry->zid,
> -             __entry->order)
> +             __entry->order,
> +             show_gfp_flags(__entry->gfp_flags))
>  );
>  
>  DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3683,16 +3683,18 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
>       return page;
>  }
>  
> -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
> +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
> +                          const struct alloc_context *ac)
>  {
>       struct zoneref *z;
>       struct zone *zone;
>       pg_data_t *last_pgdat = NULL;
> +     enum zone_type high_zoneidx = ac->high_zoneidx;
>  
> -     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
> -                                     ac->high_zoneidx, ac->nodemask) {
> +     for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
> +                                     ac->nodemask) {
>               if (last_pgdat != zone->zone_pgdat)
> -                     wakeup_kswapd(zone, order, ac->high_zoneidx);
> +                     wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
>               last_pgdat = zone->zone_pgdat;
>       }
>  }
> @@ -3971,7 +3973,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>               goto nopage;
>  
>       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
> -             wake_all_kswapds(order, ac);
> +             wake_all_kswapds(order, gfp_mask, ac);
>  
>       /*
>        * The adjusted alloc_flags might result in immediate success, so try
> @@ -4029,7 +4031,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  retry:
>       /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
>       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
> -             wake_all_kswapds(order, ac);
> +             wake_all_kswapds(order, gfp_mask, ac);
>  
>       reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
>       if (reserve_flags)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3546,16 +3546,21 @@ static int kswapd(void *p)
>  }
>  
>  /*
> - * A zone is low on free memory, so wake its kswapd task to service it.
> + * A zone is low on free memory or too fragmented for high-order memory.  If
> + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
> + * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
> + * has failed or is not needed, still wake up kcompactd if only compaction is
> + * needed.
>   */
> -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
> +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
> +                enum zone_type classzone_idx)
>  {
>       pg_data_t *pgdat;
>  
>       if (!managed_zone(zone))
>               return;
>  
> -     if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
> +     if (!cpuset_zone_allowed(zone, gfp_flags))
>               return;
>       pgdat = zone->zone_pgdat;
>       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
> @@ -3564,14 +3569,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
>       if (!waitqueue_active(&pgdat->kswapd_wait))
>               return;
>  
> -     /* Hopeless node, leave it to direct reclaim */
> -     if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
> -             return;
> -
> -     if (pgdat_balanced(pgdat, order, classzone_idx))
> +     /* Hopeless node, leave it to direct reclaim if possible */
> +     if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
> +         pgdat_balanced(pgdat, order, classzone_idx)) {
> +             /*
> +              * There may be plenty of free memory available, but it's too
> +              * fragmented for high-order allocations.  Wake up kcompactd
> +              * and rely on compaction_suitable() to determine if it's
> +              * needed.  If it fails, it will defer subsequent attempts to
> +              * ratelimit its work.
> +              */
> +             if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
> +                     wakeup_kcompactd(pgdat, order, classzone_idx);
>               return;
> +     }
>  
> -     trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
> +     trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
> +                                   gfp_flags);
>       wake_up_interruptible(&pgdat->kswapd_wait);
>  }
>  
> 

Reply via email to