Space in the radix tree shadow entry will get tight once we need to
identify the memcg. Add a bucket order that shaves off lower timestamp
bits, stretching out the representable refault distance by sacrificing
granularity.

Signed-off-by: Johannes Weiner <han...@cmpxchg.org>
---
 mm/workingset.c | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..6f3ba184ffb2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,23 @@
  * refault distance will immediately activate the refaulting page.
  */
 
+#define EVICTION_SHIFT (NODES_SHIFT + ZONES_SHIFT +    \
+                       RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order;
+
 static void *pack_shadow(unsigned long eviction, struct zone *zone)
 {
+       eviction >>= bucket_order;
        eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
        eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -168,7 +183,6 @@ static void unpack_shadow(void *shadow,
        unsigned long entry = (unsigned long)shadow;
        unsigned long eviction;
        unsigned long refault;
-       unsigned long mask;
        int zid, nid;
 
        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
@@ -176,13 +190,12 @@ static void unpack_shadow(void *shadow,
        entry >>= ZONES_SHIFT;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
-       eviction = entry;
+       eviction = entry << bucket_order;
 
        *zone = NODE_DATA(nid)->node_zones + zid;
 
        refault = atomic_long_read(&(*zone)->inactive_age);
-       mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
-                       RADIX_TREE_EXCEPTIONAL_SHIFT);
+
        /*
         * The unsigned subtraction here gives an accurate distance
         * across inactive_age overflows in most cases.
@@ -199,7 +212,7 @@ static void unpack_shadow(void *shadow,
         * inappropriate activation leading to pressure on the active
         * list is not a problem.
         */
-       *distance = (refault - eviction) & mask;
+       *distance = (refault - eviction) & EVICTION_MASK;
 }
 
 /**
@@ -398,8 +411,25 @@ static struct lock_class_key shadow_nodes_key;
 
 static int __init workingset_init(void)
 {
+       unsigned int timestamp_bits;
+       unsigned int max_order;
        int ret;
 
+       BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+       /*
+        * Calculate the eviction bucket size to cover the longest
+        * actionable refault distance, which is currently half of
+        * memory (totalram_pages/2). However, memory hotplug may add
+        * some more pages at runtime, so keep working with up to
+        * double the initial memory by using totalram_pages as-is.
+        */
+       timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+       max_order = fls_long(totalram_pages - 1);
+       if (max_order > timestamp_bits)
+               bucket_order = max_order - timestamp_bits;
+       printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+              timestamp_bits, max_order, bucket_order);
+
        ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
        if (ret)
                goto err;
-- 
2.7.0

Reply via email to