This patch provides an interface to limit the total page cache via
/proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
feedback is appreciated.

-Roy

diff -urp a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h   2006-11-30 05:57:37.000000000 +0800
+++ b/include/linux/pagemap.h   2007-01-15 17:03:09.000000000 +0800
@@ -12,6 +12,12 @@
#include <asm/uaccess.h>
#include <linux/gfp.h>

+extern int pagecache_ratio;
+extern long pagecache_limit;
+
+int pagecache_ratio_sysctl_handler(struct ctl_table *, int,
+                       struct file *, void __user *, size_t *, loff_t *);
+
/*
 * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
 * allocation mode flags.
diff -urp a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h    2007-01-15 17:18:46.000000000 +0800
+++ b/include/linux/sysctl.h    2007-01-15 17:03:09.000000000 +0800
@@ -202,6 +202,7 @@ enum
        VM_PANIC_ON_OOM=33,     /* panic at out-of-memory */
        VM_VDSO_ENABLED=34,     /* map VDSO into new processes? */
        VM_MIN_SLAB=35,          /* Percent pages ignored by zone reclaim */
+       VM_PAGECACHE_RATIO=36,  /* Percent of memory used as page cache */
};


diff -urp a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c   2007-01-15 17:18:46.000000000 +0800
+++ b/kernel/sysctl.c   2007-01-15 17:03:09.000000000 +0800
@@ -1035,6 +1035,15 @@ static ctl_table vm_table[] = {
                .extra1         = &zero,
        },
#endif
+       {
+               .ctl_name       = VM_PAGECACHE_RATIO,
+               .procname       = "pagecache_ratio",
+               .data           = &pagecache_ratio,
+               .maxlen         = sizeof(pagecache_ratio),
+               .mode           = 0644,
+               .proc_handler   = &pagecache_ratio_sysctl_handler,
+               .strategy       = &sysctl_intvec,
+       },
        { .ctl_name = 0 }
};

diff -urp a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c      2007-01-15 17:18:46.000000000 +0800
+++ b/mm/filemap.c      2007-01-15 17:03:09.000000000 +0800
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
+#include <linux/sysctl.h>
#include "filemap.h"
#include "internal.h"

@@ -108,6 +109,48 @@ generic_file_direct_IO(int rw, struct ki
 */

/*
+ * Start releasing pagecache (via kswapd) at this percentage.
+ */
+int pagecache_ratio __read_mostly = 90;
+
+long pagecache_limit = 0;
+
+int setup_pagecache_limit(void)
+{
+       pagecache_limit = pagecache_ratio * nr_free_pagecache_pages() / 100;
+       return 0;
+}
+
+int pagecache_ratio_sysctl_handler(ctl_table *table, int write,
+       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+       proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       setup_pagecache_limit();
+       return 0;
+}
+
+static inline int balance_pagecache(void)
+{
+       if (global_page_state(NR_FILE_PAGES) > pagecache_limit) {
+               int nid, j;
+               pg_data_t *pgdat;
+               struct zone *zone;
+
+               for_each_online_node(nid) {
+                       pgdat = NODE_DATA(nid);
+                       for (j = 0; j < MAX_NR_ZONES; j++) {
+                               zone = pgdat->node_zones + j;
+                               wakeup_kswapd(zone, 0);
+                       }
+               }
+       }
+
+       return 0;
+}
+
+module_init(setup_pagecache_limit)
+
+/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
@@ -1085,6 +1128,8 @@ out:
                page_cache_release(cached_page);
        if (filp)
                file_accessed(filp);
+
+       balance_pagecache();
}
EXPORT_SYMBOL(do_generic_mapping_read);

@@ -2212,6 +2257,8 @@ zero_length_segment:
                status = filemap_write_and_wait(mapping);

        pagevec_lru_add(&lru_pvec);
+       balance_pagecache();
+
        return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);
diff -urp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c       2007-01-15 17:18:46.000000000 +0800
+++ b/mm/vmscan.c       2007-01-15 17:03:09.000000000 +0800
@@ -1316,6 +1316,7 @@ static int kswapd(void *p)
        order = 0;
        for ( ; ; ) {
                unsigned long new_order;
+               long over_limit;

                try_to_freeze();

@@ -1335,6 +1336,9 @@ static int kswapd(void *p)
                finish_wait(&pgdat->kswapd_wait, &wait);

                balance_pgdat(pgdat, order);
+               over_limit = global_page_state(NR_FILE_PAGES) - pagecache_limit;
+               if (over_limit > 0)
+                       shrink_all_memory(over_limit);
        }
        return 0;
}
@@ -1350,8 +1354,10 @@ void wakeup_kswapd(struct zone *zone, in
                return;

        pgdat = zone->zone_pgdat;
-       if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
-               return;
+       if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) {
+               if (global_page_state(NR_FILE_PAGES) < pagecache_limit)
+                       return;
+       }
        if (pgdat->kswapd_max_order < order)
                pgdat->kswapd_max_order = order;
        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1361,7 +1367,6 @@ void wakeup_kswapd(struct zone *zone, in
        wake_up_interruptible(&pgdat->kswapd_wait);
}

-#ifdef CONFIG_PM
/*
 * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
 * from LRU lists system-wide, for given pass and priority, and returns the
@@ -1510,7 +1515,6 @@ out:

        return ret;
}
-#endif

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to