On Wed, 2007-05-16 at 11:43 -0700, Christoph Lameter wrote:
> On Wed, 16 May 2007, Peter Zijlstra wrote:
> 
> > On Tue, 2007-05-15 at 15:02 -0700, Christoph Lameter wrote:
> > > On Tue, 15 May 2007, Peter Zijlstra wrote:
> > > 
> > > > How about something like this; it seems to sustain a little stress.
> > > 
> > > Argh again mods to kmem_cache.
> > 
> > Hmm, I had not understood you minded that very much; I did stay away
> > from all the fast paths this time.
> 
> Well you added a new locking level and changed the locking hierarchy!
> 
> > The thing is, I wanted to fold all the emergency allocs into a single
> > slab, not a per cpu thing. And once you lose the per cpu thing, you
> > need some extra serialization. Currently the top level lock is
> > slab_lock(page), but that only works because we have interrupts
> > disabled and work per cpu.
> 
> SLUB can only allocate from a per cpu slab. You will have to reserve one
> slab per cpu anyway unless we flush the cpu slab after each access. The
> same thing is true for SLAB. It wants objects in its per cpu queues.
> 
> > Why is it bad to extend kmem_cache a bit?
> 
> Because it is for all practical purposes a heavily accessed read-only
> structure. Modifications only occur to per node and per cpu structures.
> On a 4k-cpu system any write will kick out the kmem_cache cacheline on
> 4k processors.
If this 4k-cpu system ever gets to touch the new lock, it is in far
deeper trouble than a bouncing cacheline. Please look at it more
carefully.

We flag pages allocated below the level where GFP_ATOMIC starts to
fail. Such pages are never installed as per-cpu slabs, so regular
allocations retry the page allocator every time; only
ALLOC_NO_WATERMARKS allocations are served from the ->reserve_slab.
Once a regular slab allocation succeeds again, the ->reserve_slab is
cleaned up and not looked at again until we're in distress once more.
(A condensed sketch of this lifecycle follows the patch.)

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 include/linux/slub_def.h |    2 +
 mm/slub.c                |   85 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 78 insertions(+), 9 deletions(-)

Index: linux-2.6-git/include/linux/slub_def.h
===================================================================
--- linux-2.6-git.orig/include/linux/slub_def.h
+++ linux-2.6-git/include/linux/slub_def.h
@@ -46,6 +46,8 @@ struct kmem_cache {
         struct list_head list;  /* List of slab caches */
         struct kobject kobj;    /* For sysfs */
 
+        struct page *reserve_slab;
+
 #ifdef CONFIG_NUMA
         int defrag_ratio;
         struct kmem_cache_node *node[MAX_NUMNODES];
Index: linux-2.6-git/mm/slub.c
===================================================================
--- linux-2.6-git.orig/mm/slub.c
+++ linux-2.6-git/mm/slub.c
@@ -20,11 +20,13 @@
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/kallsyms.h>
+#include "internal.h"
 
 /*
  * Lock order:
- *   1. slab_lock(page)
- *   2. slab->list_lock
+ *   1. reserve_lock
+ *   2. slab_lock(page)
+ *   3. node->list_lock
  *
  * The slab_lock protects operations on the object of a particular
  * slab and its metadata in the page struct. If the slab lock
@@ -259,6 +261,8 @@ static int sysfs_slab_alias(struct kmem_
 static void sysfs_slab_remove(struct kmem_cache *s) {}
 #endif
 
+static DEFINE_SPINLOCK(reserve_lock);
+
 /********************************************************************
  *                      Core slab cache functions
  *******************************************************************/
@@ -1007,7 +1011,7 @@ static void setup_object(struct kmem_cac
                 s->ctor(object, s, 0);
 }
 
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *rank)
 {
         struct page *page;
         struct kmem_cache_node *n;
@@ -1025,6 +1029,7 @@ static struct page *new_slab(struct kmem
         if (!page)
                 goto out;
 
+        *rank = page->rank;
         n = get_node(s, page_to_nid(page));
         if (n)
                 atomic_long_inc(&n->nr_slabs);
@@ -1311,7 +1316,7 @@ static void unfreeze_slab(struct kmem_ca
 /*
  * Remove the cpu slab
  */
-static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
+static void __deactivate_slab(struct kmem_cache *s, struct page *page)
 {
         /*
          * Merge cpu freelist into freelist. Typically we get here
@@ -1330,10 +1335,15 @@ static void deactivate_slab(struct kmem_
                 page->freelist = object;
                 page->inuse--;
         }
-        s->cpu_slab[cpu] = NULL;
         unfreeze_slab(s, page);
 }
 
+static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
+{
+        __deactivate_slab(s, page);
+        s->cpu_slab[cpu] = NULL;
+}
+
 static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
 {
         slab_lock(page);
@@ -1395,6 +1405,7 @@ static void *__slab_alloc(struct kmem_ca
 {
         void **object;
         int cpu = smp_processor_id();
+        int rank = 0;
 
         if (!page)
                 goto new_slab;
@@ -1424,10 +1435,26 @@ new_slab:
         if (page) {
                 s->cpu_slab[cpu] = page;
                 goto load_freelist;
-        }
+        } else if (unlikely(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
+                goto try_reserve;
 
-        page = new_slab(s, gfpflags, node);
-        if (page) {
+alloc_slab:
+        page = new_slab(s, gfpflags, node, &rank);
+        if (page && rank) {
+                if (unlikely(s->reserve_slab)) {
+                        struct page *reserve;
+
+                        spin_lock(&reserve_lock);
+                        reserve = s->reserve_slab;
+                        s->reserve_slab = NULL;
+                        spin_unlock(&reserve_lock);
+
+                        if (reserve) {
+                                slab_lock(reserve);
+                                __deactivate_slab(s, reserve);
+                                putback_slab(s, reserve);
+                        }
+                }
                 cpu = smp_processor_id();
                 if (s->cpu_slab[cpu]) {
                         /*
@@ -1455,6 +1482,18 @@ new_slab:
                 SetSlabFrozen(page);
                 s->cpu_slab[cpu] = page;
                 goto load_freelist;
+        } else if (page) {
+                spin_lock(&reserve_lock);
+                if (s->reserve_slab) {
+                        discard_slab(s, page);
+                        page = s->reserve_slab;
+                }
+                slab_lock(page);
+                SetPageActive(page);
+                s->reserve_slab = page;
+                spin_unlock(&reserve_lock);
+
+                goto got_reserve;
         }
         return NULL;
 debug:
@@ -1470,6 +1509,31 @@ debug:
         page->freelist = object[page->offset];
         slab_unlock(page);
         return object;
+
+try_reserve:
+        spin_lock(&reserve_lock);
+        page = s->reserve_slab;
+        if (!page) {
+                spin_unlock(&reserve_lock);
+                goto alloc_slab;
+        }
+
+        slab_lock(page);
+        if (!page->freelist) {
+                s->reserve_slab = NULL;
+                spin_unlock(&reserve_lock);
+                __deactivate_slab(s, page);
+                putback_slab(s, page);
+                goto alloc_slab;
+        }
+        spin_unlock(&reserve_lock);
+
+got_reserve:
+        object = page->freelist;
+        page->inuse++;
+        page->freelist = object[page->offset];
+        slab_unlock(page);
+        return object;
 }
 
 /*
@@ -1807,10 +1871,11 @@ static struct kmem_cache_node * __init e
 {
         struct page *page;
         struct kmem_cache_node *n;
+        int rank;
 
         BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-        page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+        page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node, &rank);
 
         /* new_slab() disables interrupts */
         local_irq_enable();
@@ -2018,6 +2083,8 @@ static int kmem_cache_open(struct kmem_c
 #ifdef CONFIG_NUMA
         s->defrag_ratio = 100;
 #endif
+        s->reserve_slab = NULL;
+
         if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
                 return 1;
 error:
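
For illustration, the ->reserve_slab lifecycle above condenses into the
standalone userspace sketch below. It is a simplified model, not the
kernel code: struct cache, grab_page(), under_pressure and reserve_rank
are hypothetical stand-ins for kmem_cache, new_slab(), the failing
watermarks and the page->rank test; per-cpu slabs and object freelists
are elided, so allocation is modeled at whole-slab granularity, and a
pthread mutex stands in for reserve_lock.

/*
 * Sketch of the reserve-slab state machine, under the assumptions
 * stated above. Compiles standalone with -pthread.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct page {
        bool reserve_rank;              /* allocated below the watermarks */
};

struct cache {
        struct page *reserve_slab;      /* single shared emergency slab */
        pthread_mutex_t reserve_lock;   /* serializes reserve_slab */
};

static bool under_pressure;             /* simulated watermark failure */

/* Stand-in for the page allocator. */
static struct page *grab_page(bool no_watermarks)
{
        if (under_pressure && !no_watermarks)
                return NULL;            /* GFP_ATOMIC starts to fail */

        struct page *p = malloc(sizeof(*p));
        if (p)
                p->reserve_rank = under_pressure;
        return p;
}

static struct page *cache_alloc_slab(struct cache *c, bool no_watermarks)
{
        struct page *page;

        /* Emergency path: prefer the shared reserve if one exists. */
        if (no_watermarks) {
                pthread_mutex_lock(&c->reserve_lock);
                page = c->reserve_slab;
                pthread_mutex_unlock(&c->reserve_lock);
                if (page)
                        return page;
        }

        page = grab_page(no_watermarks);
        if (!page)
                return NULL;            /* regular allocation keeps retrying */

        if (!page->reserve_rank) {
                /* Regular allocation succeeded again: flush the reserve. */
                pthread_mutex_lock(&c->reserve_lock);
                struct page *old = c->reserve_slab;
                c->reserve_slab = NULL;
                pthread_mutex_unlock(&c->reserve_lock);
                free(old);
                return page;            /* would become a normal cpu slab */
        }

        /*
         * Below-watermark page: park it as the shared reserve instead
         * of installing it per cpu; only emergency callers may use it.
         */
        pthread_mutex_lock(&c->reserve_lock);
        if (c->reserve_slab) {          /* raced: keep the existing one */
                free(page);
                page = c->reserve_slab;
        } else {
                c->reserve_slab = page;
        }
        pthread_mutex_unlock(&c->reserve_lock);

        return no_watermarks ? page : NULL;
}

int main(void)
{
        struct cache c = { NULL, PTHREAD_MUTEX_INITIALIZER };

        under_pressure = true;
        struct page *p1 = cache_alloc_slab(&c, false); /* fails: NULL */
        struct page *p2 = cache_alloc_slab(&c, true);  /* creates reserve */
        struct page *p3 = cache_alloc_slab(&c, true);  /* reuses reserve */
        bool reused = (p2 == p3);

        under_pressure = false;
        struct page *p4 = cache_alloc_slab(&c, false); /* flushes reserve */

        return !(!p1 && reused && p4 && !c.reserve_slab);
}

The essential property of this shape: a successful regular allocation is
the only event that clears ->reserve_slab, so the emergency slab lives
exactly as long as the distress does, and regular allocations can never
consume it.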