Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 init/Kconfig |    1 
 mm/slub.c    |  260 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 214 insertions(+), 47 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/kallsyms.h>
+#include <linux/pagemap.h>
 
 /*
  * Lock order:
@@ -99,6 +100,8 @@
  *                     the fast path and disables lockless freelists.
  */
 
+#ifndef CONFIG_PREEMPT_RT
+
 #define FROZEN (1 << PG_active)
 
 #ifdef CONFIG_SLUB_DEBUG
@@ -137,6 +140,46 @@ static inline void ClearSlabDebug(struct
        page->flags &= ~SLABDEBUG;
 }
 
+#else /* CONFIG_PREEMPT_RT */
+/*
+ * When the allocator is preemptible these operations can run concurrently
+ * with lock_page(), and hence need atomic bitops.
+ */
+
+#define PG_frozen              PG_active
+#define PG_debug               PG_error
+
+static inline int SlabFrozen(struct page *page)
+{
+       return test_bit(PG_frozen, &page->flags);
+}
+
+static inline void SetSlabFrozen(struct page *page)
+{
+       set_bit(PG_frozen, &page->flags);
+}
+
+static inline void ClearSlabFrozen(struct page *page)
+{
+       clear_bit(PG_frozen, &page->flags);
+}
+
+static inline int SlabDebug(struct page *page)
+{
+       return test_bit(PG_debug, &page->flags);
+}
+
+static inline void SetSlabDebug(struct page *page)
+{
+       set_bit(PG_debug, &page->flags);
+}
+
+static inline void ClearSlabDebug(struct page *page)
+{
+       clear_bit(PG_debug, &page->flags);
+}
+#endif
+
 /*
  * Issues still to be resolved:
  *
@@ -1021,7 +1064,7 @@ static struct page *new_slab(struct kmem
        BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
 
        if (flags & __GFP_WAIT)
-               local_irq_enable();
+               local_irq_enable_nort();
 
        page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
        if (!page)
@@ -1057,7 +1100,7 @@ static struct page *new_slab(struct kmem
        page->inuse = 0;
 out:
        if (flags & __GFP_WAIT)
-               local_irq_disable();
+               local_irq_disable_nort();
        return page;
 }
 
@@ -1117,6 +1160,7 @@ static void discard_slab(struct kmem_cac
 /*
  * Per slab locking using the pagelock
  */
+#ifndef CONFIG_PREEMPT_RT
 static __always_inline void slab_lock(struct page *page)
 {
        bit_spin_lock(PG_locked, &page->flags);
@@ -1134,6 +1178,22 @@ static __always_inline int slab_trylock(
        rc = bit_spin_trylock(PG_locked, &page->flags);
        return rc;
 }
+#else
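+/*
+ * On PREEMPT_RT the slab is protected by the page lock instead of a bit
+ * spinlock, so a task may sleep or be preempted while holding it.
+ */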
+static __always_inline void slab_lock(struct page *page)
+{
+       lock_page(page);
+}
+
+static __always_inline void slab_unlock(struct page *page)
+{
+       unlock_page(page);
+}
+
+static __always_inline int slab_trylock(struct page *page)
+{
+       return !TestSetPageLocked(page);
+}
+#endif
 
 /*
  * Management of partially allocated slabs
@@ -1154,8 +1214,7 @@ static void add_partial(struct kmem_cach
        spin_unlock(&n->list_lock);
 }
 
-static void remove_partial(struct kmem_cache *s,
-                                               struct page *page)
+static void remove_partial(struct kmem_cache *s, struct page *page)
 {
        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 
@@ -1282,6 +1341,7 @@ static void unfreeze_slab(struct kmem_ca
 {
        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 
+       BUG_ON(!SlabFrozen(page));
        ClearSlabFrozen(page);
        if (page->inuse) {
 
@@ -1310,29 +1370,52 @@ static void unfreeze_slab(struct kmem_ca
        }
 }
 
+static void **get_lockless_object(struct page *page)
+{
+       void **object;
+
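+       /*
+        * Atomically pop the head of the lockless freelist; if a preempting
+        * alloc/free on this cpu changed the head between the load and the
+        * cmpxchg, start over with the new head.
+        */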
+again:
+       object = page->lockless_freelist;
+       if (object && __local_cmpxchg(&page->lockless_freelist,
+                               object, object[page->offset]) != object)
+               goto again;
+
+       return object;
+}
+
 /*
  * Remove the cpu slab
  */
 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
 {
        /*
+        * Take the slab page away before merging the lockless free list into
+        * the regular free list to ensure that no new entries are put on the
+        * lockless list between the merge and removal.
+        */
+       BUG_ON(page != s->cpu_slab[cpu]);
+       s->cpu_slab[cpu] = NULL;
+       barrier();
+
+       /*
         * Merge cpu freelist into freelist. Typically we get here
         * because both freelists are empty. So this is unlikely
         * to occur.
         */
-       while (unlikely(page->lockless_freelist)) {
+       for (;;) {
                void **object;
 
                /* Retrieve object from cpu_freelist */
-               object = page->lockless_freelist;
-               page->lockless_freelist = page->lockless_freelist[page->offset];
+               object = get_lockless_object(page);
+               if (likely(!object))
+                       break;
 
                /* And put onto the regular freelist */
                object[page->offset] = page->freelist;
                page->freelist = object;
                page->inuse--;
        }
-       s->cpu_slab[cpu] = NULL;
+
        unfreeze_slab(s, page);
 }
 
@@ -1354,6 +1437,55 @@ static void __flush_cpu_slab(struct kmem
                flush_slab(s, page, cpu);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+struct slab_work_struct {
+       struct work_struct work;
+       struct kmem_cache *s;
+};
+
+static struct workqueue_struct *flush_slab_workqueue;
+static DEFINE_PER_CPU(struct slab_work_struct, slab_works);
+static DEFINE_MUTEX(flush_slab_mutex); /* XXX kill this */
+
+static int __init flush_cpu_slab_init(void)
+{
+       flush_slab_workqueue = create_workqueue("slub_flushd");
+       if (!flush_slab_workqueue)
+               panic("Failed to create slub_flushd\n");
+
+       return 0;
+}
+
+core_initcall(flush_cpu_slab_init);
+
+static void flush_cpu_slab_wq(struct work_struct *work)
+{
+       struct slab_work_struct *sw;
+       int cpu = smp_processor_id();
+
+       sw = container_of(work, struct slab_work_struct, work);
+       __flush_cpu_slab(sw->s, cpu);
+}
+
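+/*
+ * The slab locks may sleep on PREEMPT_RT, so the per-cpu slabs cannot be
+ * flushed from IPI context. Instead, queue a work item on every online cpu
+ * so that each cpu flushes its own slab in task context, then wait for all
+ * of them to complete.
+ */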
+static void flush_all(struct kmem_cache *s)
+{
+       int cpu;
+       struct workqueue_struct *wq = flush_slab_workqueue;
+
+       mutex_lock(&flush_slab_mutex);
+       for_each_online_cpu(cpu) {
+               struct slab_work_struct *sw = &per_cpu(slab_works, cpu);
+
+               INIT_WORK(&sw->work, flush_cpu_slab_wq);
+               sw->s = s;
+               queue_work_cpu(wq, &sw->work, cpu);
+       }
+       flush_workqueue(wq);
+       mutex_unlock(&flush_slab_mutex);
+}
+
+#else
+
 static void flush_cpu_slab(void *d)
 {
        struct kmem_cache *s = d;
@@ -1374,6 +1506,7 @@ static void flush_all(struct kmem_cache 
        local_irq_restore(flags);
 #endif
 }
+#endif
 
 /*
  * Slow path. The lockless freelist is empty or we need to perform
@@ -1396,13 +1529,24 @@ static void *__slab_alloc(struct kmem_ca
                gfp_t gfpflags, int node, void *addr, struct page *page)
 {
        void **object;
+       unsigned long flags;
        int cpu = smp_processor_id();
 
+       local_irq_save_nort(flags);
+
+again:
        if (!page)
                goto new_slab;
 
        slab_lock(page);
-       if (unlikely(node != -1 && page_to_nid(page) != node))
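+       /*
+        * We may have been preempted before taking the slab lock, so the
+        * page could have been deactivated or replaced in the meantime;
+        * restart with whatever the current cpu slab is.
+        */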
+       if (!SlabFrozen(page) || page != s->cpu_slab[cpu]) {
+               slab_unlock(page);
+               page = s->cpu_slab[cpu];
+               goto again;
+       }
+
+       if (unlikely((node != -1 && page_to_nid(page) != node) ||
+                       page->lockless_freelist))  /* validate the need for this check */
                goto another_slab;
 load_freelist:
        object = page->freelist;
@@ -1415,7 +1559,9 @@ load_freelist:
        page->lockless_freelist = object[page->offset];
        page->inuse = s->objects;
        page->freelist = NULL;
+out:
        slab_unlock(page);
+       local_irq_restore_nort(flags);
        return object;
 
 another_slab:
@@ -1424,40 +1570,42 @@ another_slab:
 new_slab:
        page = get_partial(s, gfpflags, node);
        if (page) {
-               s->cpu_slab[cpu] = page;
+               struct page *cur_page;
+
+               cur_page = __local_cmpxchg(&s->cpu_slab[cpu], NULL, page);
+               if (cur_page) {
+                       /*
+                        * Someone else populated the cpu_slab while we were
+                        * preempted. We want the current one since it's
+                        * cache hot.
+                        */
+                       unfreeze_slab(s, page);
+                       page = cur_page;
+                       goto again;
+               }
                goto load_freelist;
        }
 
        page = new_slab(s, gfpflags, node);
        if (page) {
-               cpu = smp_processor_id();
-               if (s->cpu_slab[cpu]) {
+               struct page *cur_page;
+
+               slab_lock(page);
+               SetSlabFrozen(page);
+               cur_page = __local_cmpxchg(&s->cpu_slab[cpu], NULL, page);
+               if (cur_page) {
                        /*
-                        * Someone else populated the cpu_slab while we
-                        * enabled interrupts, or we have gotten scheduled
-                        * on another cpu. The page may not be on the
-                        * requested node even if __GFP_THISNODE was
-                        * specified. So we need to recheck.
+                        * Someone else populated the cpu_slab while we were
+                        * preempted. We want the current one since it's
+                        * cache hot.
                         */
-                       if (node == -1 ||
-                               page_to_nid(s->cpu_slab[cpu]) == node) {
-                               /*
-                                * Current cpuslab is acceptable and we
-                                * want the current one since its cache hot
-                                */
-                               discard_slab(s, page);
-                               page = s->cpu_slab[cpu];
-                               slab_lock(page);
-                               goto load_freelist;
-                       }
-                       /* New slab does not fit our expectations */
-                       flush_slab(s, s->cpu_slab[cpu], cpu);
+                       unfreeze_slab(s, page);
+                       page = cur_page;
+                       goto again;
                }
-               slab_lock(page);
-               SetSlabFrozen(page);
-               s->cpu_slab[cpu] = page;
                goto load_freelist;
        }
+       local_irq_restore_nort(flags);
        return NULL;
 debug:
        object = page->freelist;
@@ -1466,8 +1614,7 @@ debug:
 
        page->inuse++;
        page->freelist = object[page->offset];
-       slab_unlock(page);
-       return object;
+       goto out;
 }
 
 /*
@@ -1487,18 +1634,20 @@ static void __always_inline *slab_alloc(
        void **object;
        unsigned long flags;
 
-       local_irq_save(flags);
+       __local_begin(flags);
        page = s->cpu_slab[smp_processor_id()];
        if (unlikely(!page || !page->lockless_freelist ||
-                       (node != -1 && page_to_nid(page) != node)))
+                       (node != -1 && page_to_nid(page) != node))) {
 
+do_alloc:
                object = __slab_alloc(s, gfpflags, node, addr, page);
 
-       else {
-               object = page->lockless_freelist;
-               page->lockless_freelist = object[page->offset];
+       } else {
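+               /*
+                * A preempting allocation can empty the lockless freelist
+                * between the check above and the pop; fall back to the
+                * slow path in that case.
+                */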
+               object = get_lockless_object(page);
+               if (unlikely(!object))
+                       goto do_alloc;
        }
-       local_irq_restore(flags);
+       __local_end(flags);
        return object;
 }
 
@@ -1529,7 +1678,9 @@ static void __slab_free(struct kmem_cach
 {
        void *prior;
        void **object = (void *)x;
+       unsigned long flags;
 
+       local_irq_save_nort(flags);
        slab_lock(page);
 
        if (unlikely(SlabDebug(page)))
@@ -1555,6 +1706,7 @@ checks_ok:
 
 out_unlock:
        slab_unlock(page);
+       local_irq_restore_nort(flags);
        return;
 
 slab_empty:
@@ -1566,6 +1718,7 @@ slab_empty:
 
        slab_unlock(page);
        discard_slab(s, page);
+       local_irq_restore_nort(flags);
        return;
 
 debug:
@@ -1591,15 +1744,30 @@ static void __always_inline slab_free(st
        void **object = (void *)x;
        unsigned long flags;
 
-       local_irq_save(flags);
+       __local_begin(flags);
+       /*
+        * We have to either take slab_lock(page) or disable preemption while
+        * trying to add to the lockless freelist because we have to guarantee
+        * page == s->cpu_slab[cpu] during the operation.
+        *
+        * Fix this by allowing non-active slabs to have a lockless_freelist?
+        * Cannot do, since Christoph is about to pull lockless_freelist from
+        * the struct page.
+        *
+        * preempt_disable() seems cheapest for these few instructions vs the
+        * atomic ops involved with slab_lock()
+        */
+       preempt_disable();
        if (likely(page == s->cpu_slab[smp_processor_id()] &&
-                                               !SlabDebug(page))) {
+                               !SlabDebug(page))) {
                object[page->offset] = page->lockless_freelist;
                page->lockless_freelist = object;
-       } else
+               preempt_enable();
+       } else {
+               preempt_enable();
                __slab_free(s, page, x, addr);
-
-       local_irq_restore(flags);
+       }
+       __local_end(flags);
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -578,7 +578,6 @@ config SLAB
 
 config SLUB
        bool "SLUB (Unqueued Allocator)"
-       depends on !PREEMPT_RT
        help
           SLUB is a slab allocator that minimizes cache line usage
           instead of managing queues of cached objects (SLAB approach).

--
