On Tue, Oct 21, 2014 at 02:22:39PM -0400, Johannes Weiner wrote:
> On Tue, Oct 21, 2014 at 05:15:50PM +0400, Vladimir Davydov wrote:
> > mem_cgroup_reclaimable() checks whether a cgroup has reclaimable pages
> > on *any* NUMA node. However, the only place where it's called is
> > mem_cgroup_soft_reclaim(), which tries to reclaim memory from a
> > *specific* zone. So the way how it's used is incorrect - it will return
> > true even if the cgroup doesn't have pages on the zone we're scanning.
> > 
> > I think we can get rid of this check completely, because
> > mem_cgroup_shrink_node_zone(), which is called by
> > mem_cgroup_soft_reclaim() if mem_cgroup_reclaimable() returns true, is
> > equivalent to shrink_lruvec(), which exits almost immediately if the
> > lruvec passed to it is empty. So there's no need to optimize anything
> > here. Besides, we don't have such a check in the general scan path
> > (shrink_zone) either.
> > 
> > Signed-off-by: Vladimir Davydov <vdavy...@parallels.com>
> 
> Acked-by: Johannes Weiner <han...@cmpxchg.org>
> 
> How about this on top?
> 
> ---
> 
> From 27bd24b00433d9f6c8d60ba2b13dbff158b06c13 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <han...@cmpxchg.org>
> Date: Tue, 21 Oct 2014 09:53:54 -0400
> Subject: [patch] mm: memcontrol: do not filter reclaimable nodes in NUMA
>  round-robin
> 
> The round-robin node reclaim currently tries to include only nodes
> that have memory of the memcg in question, which is quite elaborate.
> 
> Just use plain round-robin over the nodes that are allowed by the
> task's cpuset, which are the most likely to contain that memcg's
> memory.  But even if zones without memcg memory are encountered,
> direct reclaim will skip over them without too much hassle.
> 
> Signed-off-by: Johannes Weiner <han...@cmpxchg.org>

Totally agree.

Acked-by: Vladimir Davydov <vdavy...@parallels.com>

> ---
>  mm/memcontrol.c | 97 
> +++------------------------------------------------------
>  1 file changed, 5 insertions(+), 92 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d353d9e1fdca..293db8234179 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -54,6 +54,7 @@
>  #include <linux/page_cgroup.h>
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
> +#include <linux/cpuset.h>
>  #include <linux/lockdep.h>
>  #include <linux/file.h>
>  #include "internal.h"
> @@ -129,12 +130,10 @@ static const char * const mem_cgroup_lru_names[] = {
>  enum mem_cgroup_events_target {
>       MEM_CGROUP_TARGET_THRESH,
>       MEM_CGROUP_TARGET_SOFTLIMIT,
> -     MEM_CGROUP_TARGET_NUMAINFO,
>       MEM_CGROUP_NTARGETS,
>  };
>  #define THRESHOLDS_EVENTS_TARGET 128
>  #define SOFTLIMIT_EVENTS_TARGET 1024
> -#define NUMAINFO_EVENTS_TARGET       1024
>  
>  struct mem_cgroup_stat_cpu {
>       long count[MEM_CGROUP_STAT_NSTATS];
> @@ -352,11 +351,6 @@ struct mem_cgroup {
>  #endif
>  
>       int last_scanned_node;
> -#if MAX_NUMNODES > 1
> -     nodemask_t      scan_nodes;
> -     atomic_t        numainfo_events;
> -     atomic_t        numainfo_updating;
> -#endif
>  
>       /* List of events which userspace want to receive */
>       struct list_head event_list;
> @@ -965,9 +959,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup 
> *memcg,
>               case MEM_CGROUP_TARGET_SOFTLIMIT:
>                       next = val + SOFTLIMIT_EVENTS_TARGET;
>                       break;
> -             case MEM_CGROUP_TARGET_NUMAINFO:
> -                     next = val + NUMAINFO_EVENTS_TARGET;
> -                     break;
>               default:
>                       break;
>               }
> @@ -986,22 +977,10 @@ static void memcg_check_events(struct mem_cgroup 
> *memcg, struct page *page)
>       /* threshold event is triggered in finer grain than soft limit */
>       if (unlikely(mem_cgroup_event_ratelimit(memcg,
>                                               MEM_CGROUP_TARGET_THRESH))) {
> -             bool do_softlimit;
> -             bool do_numainfo __maybe_unused;
> -
> -             do_softlimit = mem_cgroup_event_ratelimit(memcg,
> -                                             MEM_CGROUP_TARGET_SOFTLIMIT);
> -#if MAX_NUMNODES > 1
> -             do_numainfo = mem_cgroup_event_ratelimit(memcg,
> -                                             MEM_CGROUP_TARGET_NUMAINFO);
> -#endif
>               mem_cgroup_threshold(memcg);
> -             if (unlikely(do_softlimit))
> +             if (mem_cgroup_event_ratelimit(memcg,
> +                                            MEM_CGROUP_TARGET_SOFTLIMIT))
>                       mem_cgroup_update_tree(memcg, page);
> -#if MAX_NUMNODES > 1
> -             if (unlikely(do_numainfo))
> -                     atomic_inc(&memcg->numainfo_events);
> -#endif
>       }
>  }
>  
> @@ -1708,61 +1687,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup 
> *memcg, gfp_t gfp_mask,
>                        NULL, "Memory cgroup out of memory");
>  }
>  
> -/**
> - * test_mem_cgroup_node_reclaimable
> - * @memcg: the target memcg
> - * @nid: the node ID to be checked.
> - * @noswap : specify true here if the user wants flle only information.
> - *
> - * This function returns whether the specified memcg contains any
> - * reclaimable pages on a node. Returns true if there are any reclaimable
> - * pages in the node.
> - */
> -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
> -             int nid, bool noswap)
> -{
> -     if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
> -             return true;
> -     if (noswap || !total_swap_pages)
> -             return false;
> -     if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
> -             return true;
> -     return false;
> -
> -}
>  #if MAX_NUMNODES > 1
> -
> -/*
> - * Always updating the nodemask is not very good - even if we have an empty
> - * list or the wrong list here, we can start from some node and traverse all
> - * nodes based on the zonelist. So update the list loosely once per 10 secs.
> - *
> - */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
> -{
> -     int nid;
> -     /*
> -      * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
> -      * pagein/pageout changes since the last update.
> -      */
> -     if (!atomic_read(&memcg->numainfo_events))
> -             return;
> -     if (atomic_inc_return(&memcg->numainfo_updating) > 1)
> -             return;
> -
> -     /* make a nodemask where this memcg uses memory from */
> -     memcg->scan_nodes = node_states[N_MEMORY];
> -
> -     for_each_node_mask(nid, node_states[N_MEMORY]) {
> -
> -             if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
> -                     node_clear(nid, memcg->scan_nodes);
> -     }
> -
> -     atomic_set(&memcg->numainfo_events, 0);
> -     atomic_set(&memcg->numainfo_updating, 0);
> -}
> -
>  /*
>   * Selecting a node where we start reclaim from. Because what we need is just
>   * reducing usage counter, start from anywhere is O,K. Considering
> @@ -1779,21 +1704,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup 
> *memcg)
>  {
>       int node;
>  
> -     mem_cgroup_may_update_nodemask(memcg);
> -     node = memcg->last_scanned_node;
> -
> -     node = next_node(node, memcg->scan_nodes);
> +     node = next_node(memcg->last_scanned_node, cpuset_current_mems_allowed);
>       if (node == MAX_NUMNODES)
> -             node = first_node(memcg->scan_nodes);
> -     /*
> -      * We call this when we hit limit, not when pages are added to LRU.
> -      * No LRU may hold pages because all pages are UNEVICTABLE or
> -      * memcg is too small and all pages are not on LRU. In that case,
> -      * we use curret node.
> -      */
> -     if (unlikely(node == MAX_NUMNODES))
> -             node = numa_node_id();
> -
> +             node = first_node(cpuset_current_mems_allowed);
>       memcg->last_scanned_node = node;
>       return node;
>  }
> -- 
> 2.1.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to