When using a large number of threads performing AIO operations the
IOCTX list may get a significant number of entries which will cause
significant overhead. For example, when running this fio script:

rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
blocksize=1024; numjobs=512; thread; loops=100

on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
30% CPU time spent by lookup_ioctx:

 32.51%  [guest.kernel]  [g] lookup_ioctx
  9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
  4.40%  [guest.kernel]  [g] lock_release
  4.19%  [guest.kernel]  [g] sched_clock_local
  3.86%  [guest.kernel]  [g] local_clock
  3.68%  [guest.kernel]  [g] native_sched_clock
  3.08%  [guest.kernel]  [g] sched_clock_cpu
  2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
  2.60%  [guest.kernel]  [g] memcpy
  2.33%  [guest.kernel]  [g] lock_acquired
  2.25%  [guest.kernel]  [g] lock_acquire
  1.84%  [guest.kernel]  [g] do_io_submit

This patchs converts the ioctx list to a radix tree. For a performance
comparison the above FIO script was run on a 2 sockets 8 core
machine. This are the results (average and %rsd of 10 runs) for the
original list based implementation and for the radix tree based
implementation:

cores         1         2         4         8         16        32
list       109376 ms  69119 ms  35682 ms  22671 ms  19724 ms  16408 ms
%rsd         0.69%      1.15%     1.17%     1.21%     1.71%     1.43%
radix       73651 ms  41748 ms  23028 ms  16766 ms  15232 ms   13787 ms
%rsd         1.19%      0.98%     0.69%     1.13%    0.72%      0.75%
% of radix
relative    66.12%     65.59%    66.63%    72.31%   77.26%     83.66%
to list

To consider the impact of the patch on the typical case of having
only one ctx per process the following FIO script was run:

rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
blocksize=1024; numjobs=1; thread; loops=100

on the same system and the results are the following:

list        58892 ms
%rsd         0.91%
radix       59404 ms
%rsd         0.81%
% of radix
relative    100.87%
to list

Cc: Andi Kleen <a...@linux.intel.com>

Signed-off-by: Octavian Purdila <octavian.purd...@intel.com>
---
Changes since v2:
 * rebased on top of next/akpm
 * add %rsd to measurements 

Changes since v1:
 * add performance comparision for the typical case of having only one
   ctx per process
 * use ARRAY_SIZE and drop tracking idx as it is not needed

 arch/s390/mm/pgtable.c   |    4 +--
 fs/aio.c                 |   76 ++++++++++++++++++++++++++++------------------
 include/linux/mm_types.h |    3 +-
 kernel/fork.c            |    2 +-
 4 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2accf71..54186e7 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -894,7 +894,7 @@ int s390_enable_sie(void)
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 #ifdef CONFIG_AIO
-           !hlist_empty(&tsk->mm->ioctx_list) ||
+           tsk->mm->ioctx_rtree.rnode ||
 #endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
@@ -921,7 +921,7 @@ int s390_enable_sie(void)
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 #ifdef CONFIG_AIO
-           !hlist_empty(&tsk->mm->ioctx_list) ||
+           tsk->mm->ioctx_rtree.rnode ||
 #endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
diff --git a/fs/aio.c b/fs/aio.c
index 5b7ed78..4ceed70 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -38,6 +38,7 @@
 #include <linux/blkdev.h>
 #include <linux/compat.h>
 #include <linux/percpu-refcount.h>
+#include <linux/radix-tree.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -69,9 +70,7 @@ struct kioctx_cpu {
 struct kioctx {
        struct percpu_ref       users;
 
-       /* This needs improving */
        unsigned long           user_id;
-       struct hlist_node       list;
 
        struct __percpu kioctx_cpu *cpu;
 
@@ -457,10 +456,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
        aio_nr += ctx->max_reqs;
        spin_unlock(&aio_nr_lock);
 
-       /* now link into global list. */
+       /* now insert into the radix tree */
+       err = radix_tree_preload(GFP_KERNEL);
+       if (err)
+               goto out_cleanup;
        spin_lock(&mm->ioctx_lock);
-       hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+       err = radix_tree_insert(&mm->ioctx_rtree, ctx->user_id, ctx);
        spin_unlock(&mm->ioctx_lock);
+       radix_tree_preload_end();
+       if (err) {
+               WARN_ONCE(1, "aio: insert into ioctx tree failed: %d", err);
+               goto out_cleanup;
+       }
 
        pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
                 ctx, ctx->user_id, mm, ctx->nr_events);
@@ -501,8 +508,8 @@ static void kill_ioctx_rcu(struct rcu_head *head)
 static void kill_ioctx(struct kioctx *ctx)
 {
        if (percpu_ref_kill(&ctx->users)) {
-               hlist_del_rcu(&ctx->list);
-               /* Between hlist_del_rcu() and dropping the initial ref */
+               radix_tree_delete(&current->mm->ioctx_rtree, ctx->user_id);
+               /* Between radix_tree_delete() and dropping the initial ref */
                synchronize_rcu();
 
                /*
@@ -542,25 +549,38 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-       struct kioctx *ctx;
-       struct hlist_node *n;
+       struct kioctx *ctx[16];
+       unsigned long idx = 0;
+       int count;
 
-       hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
-               /*
-                * We don't need to bother with munmap() here -
-                * exit_mmap(mm) is coming and it'll unmap everything.
-                * Since aio_free_ring() uses non-zero ->mmap_size
-                * as indicator that it needs to unmap the area,
-                * just set it to 0; aio_free_ring() is the only
-                * place that uses ->mmap_size, so it's safe.
-                */
-               ctx->mmap_size = 0;
-
-               if (percpu_ref_kill(&ctx->users)) {
-                       hlist_del_rcu(&ctx->list);
-                       call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+       do {
+               int i;
+
+               count = radix_tree_gang_lookup(&mm->ioctx_rtree, (void **)ctx,
+                                              idx, sizeof(ctx)/sizeof(void *));
+               for (i = 0; i < count; i++) {
+                       void *ret;
+
+                       BUG_ON(ctx[i]->user_id < idx);
+                       idx = ctx[i]->user_id;
+
+                       /*
+                        * We don't need to bother with munmap() here -
+                        * exit_mmap(mm) is coming and it'll unmap everything.
+                        * Since aio_free_ring() uses non-zero ->mmap_size
+                        * as indicator that it needs to unmap the area,
+                        * just set it to 0; aio_free_ring() is the only
+                        * place that uses ->mmap_size, so it's safe.
+                        */
+                       ctx[i]->mmap_size = 0;
+
+                       if (percpu_ref_kill(&ctx[i]->users)) {
+                               ret = radix_tree_delete(&mm->ioctx_rtree, idx);
+                               BUG_ON(!ret || ret != ctx[i]);
+                               call_rcu(&ctx[i]->rcu_head, kill_ioctx_rcu);
+                       }
                }
-       }
+       } while (count);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -665,12 +685,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 
        rcu_read_lock();
 
-       hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-               if (ctx->user_id == ctx_id) {
-                       percpu_ref_get(&ctx->users);
-                       ret = ctx;
-                       break;
-               }
+       ctx = radix_tree_lookup(&mm->ioctx_rtree, ctx_id);
+       if (ctx) {
+               percpu_ref_get(&ctx->users);
+               ret = ctx;
        }
 
        rcu_read_unlock();
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fb425aa..1976fa9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/threads.h>
 #include <linux/list.h>
+#include <linux/radix-tree.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
@@ -383,7 +384,7 @@ struct mm_struct {
        struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_AIO
        spinlock_t              ioctx_lock;
-       struct hlist_head       ioctx_list;
+       struct radix_tree_root  ioctx_rtree;
 #endif
 #ifdef CONFIG_MM_OWNER
        /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 6fcc2e2..62e0d0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -522,7 +522,7 @@ static void mm_init_aio(struct mm_struct *mm)
 {
 #ifdef CONFIG_AIO
        spin_lock_init(&mm->ioctx_lock);
-       INIT_HLIST_HEAD(&mm->ioctx_list);
+       INIT_RADIX_TREE(&mm->ioctx_rtree, GFP_KERNEL);
 #endif
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to