Author: kmacy
Date: Tue Mar 16 22:17:21 2010
New Revision: 205231
URL: http://svn.freebsd.org/changeset/base/205231

Log:
  - reduce contention by breaking up ARC state locks into 16 for data
    and 16 for metadata
  - export L2ARC tunables as sysctls
  - add several kstats to track L2ARC state more precisely
  - avoid holding a contended lock when atomically incrementing a
    contended counter (no lock protection needed for atomics)

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c   Tue Mar 16 21:44:21 2010        (r205230)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c   Tue Mar 16 22:17:21 2010        (r205231)
@@ -131,6 +131,7 @@
 #include <sys/kstat.h>
 #include <sys/sdt.h>
 
+#include <sys/ktr.h>
 #include <vm/vm_pageout.h>
 
 static kmutex_t                arc_reclaim_thr_lock;
@@ -186,6 +187,11 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min,
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
     &zfs_mdcomp_disable, 0, "Disable metadata compression");
 
+#ifdef ZIO_USE_UMA
+extern kmem_cache_t    *zio_buf_cache[];
+extern kmem_cache_t    *zio_data_buf_cache[];
+#endif
+
 /*
  * Note that buffers can be in one of 6 states:
  *     ARC_anon        - anonymous (discussed below)
@@ -218,13 +224,31 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_di
  * second level ARC benefit from these fast lookups.
  */
 
+#define        ARCS_LOCK_PAD           128
+struct arcs_lock {
+       kmutex_t        arcs_lock;
+#ifdef _KERNEL
+       unsigned char   pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+/*
+ * must be power of two for mask use to work
+ *
+ */
+#define ARC_BUFC_NUMDATALISTS          16
+#define ARC_BUFC_NUMMETADATALISTS      16
+#define ARC_BUFC_NUMLISTS      
(ARC_BUFC_NUMMETADATALISTS+ARC_BUFC_NUMDATALISTS)
+
 typedef struct arc_state {
-       list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
        uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
        uint64_t arcs_size;     /* total amount of data in this state */
-       kmutex_t arcs_mtx;
+       list_t  arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
+       struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(128);
 } arc_state_t;
 
+#define ARCS_LOCK(s, i) &((s)->arcs_locks[(i)].arcs_lock)
+
 /* The 6 states: */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
@@ -248,7 +272,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_mru_ghost_hits;
        kstat_named_t arcstat_mfu_hits;
        kstat_named_t arcstat_mfu_ghost_hits;
+       kstat_named_t arcstat_allocated;
        kstat_named_t arcstat_deleted;
+       kstat_named_t arcstat_stolen;
        kstat_named_t arcstat_recycle_miss;
        kstat_named_t arcstat_mutex_miss;
        kstat_named_t arcstat_evict_skip;
@@ -280,6 +306,19 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_size;
        kstat_named_t arcstat_l2_hdr_size;
        kstat_named_t arcstat_memory_throttle_count;
+       kstat_named_t arcstat_l2_write_trylock_fail;
+       kstat_named_t arcstat_l2_write_in_l2;
+       kstat_named_t arcstat_l2_write_passed_headroom;
+       kstat_named_t arcstat_l2_write_spa_mismatch;
+       kstat_named_t arcstat_l2_write_hdr_io_in_progress;
+       kstat_named_t arcstat_l2_write_not_cacheable;
+       kstat_named_t arcstat_l2_write_full;
+       kstat_named_t arcstat_l2_write_buffer_iter;
+       kstat_named_t arcstat_l2_write_pios;
+       kstat_named_t arcstat_l2_write_bytes_written;
+       kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
+       kstat_named_t arcstat_l2_write_buffer_list_iter;
+       kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -297,7 +336,9 @@ static arc_stats_t arc_stats = {
        { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
        { "mfu_hits",                   KSTAT_DATA_UINT64 },
        { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
+       { "allocated",                  KSTAT_DATA_UINT64 },
        { "deleted",                    KSTAT_DATA_UINT64 },
+       { "stolen",                     KSTAT_DATA_UINT64 },
        { "recycle_miss",               KSTAT_DATA_UINT64 },
        { "mutex_miss",                 KSTAT_DATA_UINT64 },
        { "evict_skip",                 KSTAT_DATA_UINT64 },
@@ -328,7 +369,20 @@ static arc_stats_t arc_stats = {
        { "l2_io_error",                KSTAT_DATA_UINT64 },
        { "l2_size",                    KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
-       { "memory_throttle_count",      KSTAT_DATA_UINT64 }
+       { "memory_throttle_count",      KSTAT_DATA_UINT64 },
+       { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
+       { "l2_write_in_l2",             KSTAT_DATA_UINT64 },
+       { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
+       { "l2_write_spa_mismatch",      KSTAT_DATA_UINT64 },
+       { "l2_write_io_in_progress",    KSTAT_DATA_UINT64 },
+       { "l2_write_not_cacheable",     KSTAT_DATA_UINT64 },
+       { "l2_write_full",              KSTAT_DATA_UINT64 },
+       { "l2_write_buffer_iter",       KSTAT_DATA_UINT64 },
+       { "l2_write_pios",              KSTAT_DATA_UINT64 },
+       { "l2_write_bytes_written",             KSTAT_DATA_UINT64 },
+       { "l2_write_buffer_bytes_scanned",      KSTAT_DATA_UINT64 },
+       { "l2_write_buffer_list_iter",  KSTAT_DATA_UINT64 },
+       { "l2_write_buffer_list_null_iter",     KSTAT_DATA_UINT64 }
 };
 
 #define        ARCSTAT(stat)   (arc_stats.stat.value.ui64)
@@ -550,9 +604,10 @@ extern kmem_cache_t        *zio_data_buf_cache[
  * Level 2 ARC
  */
 
-#define        L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial 
write max */
-#define        L2ARC_HEADROOM          4               /* num of writes */
+#define        L2ARC_WRITE_SIZE        (64 * 1024 * 1024)      /* initial 
write max */
+#define        L2ARC_HEADROOM          128             /* num of writes */
 #define        L2ARC_FEED_SECS         1               /* caching interval */
+#define        L2ARC_FEED_SECS_SHIFT   1               /* caching interval 
shift */
 
 #define        l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 #define        l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
@@ -564,7 +619,66 @@ uint64_t l2arc_write_max = L2ARC_WRITE_S
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;      /* number of dev writes */
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;    /* interval seconds */
-boolean_t l2arc_noprefetch = B_TRUE;           /* don't cache prefetch bufs */
+uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT;        /* interval 
seconds shift */
+boolean_t l2arc_noprefetch = B_FALSE;          /* don't cache prefetch bufs */
+
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+    &l2arc_write_max, 0, "max write size");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+    &l2arc_write_boost, 0, "extra write during warmup");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+    &l2arc_headroom, 0, "number of dev writes");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+    &l2arc_feed_secs, 0, "interval seconds");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW,
+    &l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
+
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+    &ARC_anon.arcs_size, 0, "size of anonymous state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
+    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
+    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+    &ARC_mru.arcs_size, 0, "size of mru state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
+    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru 
state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
+    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+    "size of metadata in mru ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+    "size of data in mru ghost state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+    &ARC_mfu.arcs_size, 0, "size of mfu state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
+    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu 
state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
+    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+    "size of metadata in mfu ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+    "size of data in mfu ghost state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+    &ARC_l2c_only.arcs_size, 0, "size of mru state");
 
 /*
  * L2ARC Internals
@@ -958,20 +1072,42 @@ arc_buf_freeze(arc_buf_t *buf)
 }
 
 static void
+get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t 
**lock)
+{
+       uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
+
+       if (ab->b_type == ARC_BUFC_METADATA) 
+               buf_hashid &= (ARC_BUFC_NUMMETADATALISTS-1);
+       else {
+               buf_hashid &= (ARC_BUFC_NUMDATALISTS-1);
+               buf_hashid += ARC_BUFC_NUMMETADATALISTS;
+       }
+
+       *list = &state->arcs_lists[buf_hashid];
+       *lock = ARCS_LOCK(state, buf_hashid);
+}
+
+
+static void
 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
+
        ASSERT(MUTEX_HELD(hash_lock));
 
        if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
            (ab->b_state != arc_anon)) {
+               list_t *list;
+               kmutex_t *lock;
                uint64_t delta = ab->b_size * ab->b_datacnt;
-               list_t *list = &ab->b_state->arcs_list[ab->b_type];
                uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
 
-               ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
-               mutex_enter(&ab->b_state->arcs_mtx);
+               get_buf_info(ab, ab->b_state, &list, &lock);
+               ASSERT(!MUTEX_HELD(lock));
+               mutex_enter(lock);
                ASSERT(list_link_active(&ab->b_arc_node));
                list_remove(list, ab);
+               mutex_exit(lock);
+
                if (GHOST_STATE(ab->b_state)) {
                        ASSERT3U(ab->b_datacnt, ==, 0);
                        ASSERT3P(ab->b_buf, ==, NULL);
@@ -980,7 +1116,6 @@ add_reference(arc_buf_hdr_t *ab, kmutex_
                ASSERT(delta > 0);
                ASSERT3U(*size, >=, delta);
                atomic_add_64(size, -delta);
-               mutex_exit(&ab->b_state->arcs_mtx);
                /* remove the prefetch flag if we get a reference */
                if (ab->b_flags & ARC_PREFETCH)
                        ab->b_flags &= ~ARC_PREFETCH;
@@ -999,14 +1134,19 @@ remove_reference(arc_buf_hdr_t *ab, kmut
        if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
            (state != arc_anon)) {
                uint64_t *size = &state->arcs_lsize[ab->b_type];
+               list_t *list;
+               kmutex_t *lock;
+
+               get_buf_info(ab, state, &list, &lock);
 
-               ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-               mutex_enter(&state->arcs_mtx);
+               ASSERT(!MUTEX_HELD(lock));
+               mutex_enter(lock);
                ASSERT(!list_link_active(&ab->b_arc_node));
-               list_insert_head(&state->arcs_list[ab->b_type], ab);
+               list_insert_head(list, ab);
+               mutex_exit(lock);
+
                ASSERT(ab->b_datacnt > 0);
                atomic_add_64(size, ab->b_size * ab->b_datacnt);
-               mutex_exit(&state->arcs_mtx);
        }
        return (cnt);
 }
@@ -1021,6 +1161,8 @@ arc_change_state(arc_state_t *new_state,
        arc_state_t *old_state = ab->b_state;
        int64_t refcnt = refcount_count(&ab->b_refcnt);
        uint64_t from_delta, to_delta;
+       list_t *list;
+       kmutex_t *lock;
 
        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(new_state != old_state);
@@ -1035,14 +1177,17 @@ arc_change_state(arc_state_t *new_state,
         */
        if (refcnt == 0) {
                if (old_state != arc_anon) {
-                       int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+                       int use_mutex;
                        uint64_t *size = &old_state->arcs_lsize[ab->b_type];
 
+                       get_buf_info(ab, old_state, &list, &lock);
+                       use_mutex = !MUTEX_HELD(lock);
+
                        if (use_mutex)
-                               mutex_enter(&old_state->arcs_mtx);
+                               mutex_enter(lock);
 
                        ASSERT(list_link_active(&ab->b_arc_node));
-                       list_remove(&old_state->arcs_list[ab->b_type], ab);
+                       list_remove(list, ab);
 
                        /*
                         * If prefetching out of the ghost cache,
@@ -1057,16 +1202,20 @@ arc_change_state(arc_state_t *new_state,
                        atomic_add_64(size, -from_delta);
 
                        if (use_mutex)
-                               mutex_exit(&old_state->arcs_mtx);
+                               mutex_exit(lock);
                }
                if (new_state != arc_anon) {
-                       int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+                       int use_mutex; 
                        uint64_t *size = &new_state->arcs_lsize[ab->b_type];
 
+                       get_buf_info(ab, new_state, &list, &lock);
+                       use_mutex = !MUTEX_HELD(lock);
+                       
+                       
                        if (use_mutex)
-                               mutex_enter(&new_state->arcs_mtx);
+                               mutex_enter(lock);
 
-                       list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+                       list_insert_head(list, ab);
 
                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
@@ -1077,7 +1226,7 @@ arc_change_state(arc_state_t *new_state,
                        atomic_add_64(size, to_delta);
 
                        if (use_mutex)
-                               mutex_exit(&new_state->arcs_mtx);
+                               mutex_exit(lock);
                }
        }
 
@@ -1467,21 +1616,49 @@ arc_evict(arc_state_t *state, spa_t *spa
 {
        arc_state_t *evicted_state;
        uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+       int64_t bytes_remaining;
        arc_buf_hdr_t *ab, *ab_prev = NULL;
-       list_t *list = &state->arcs_list[type];
+       list_t *evicted_list, *list, *evicted_list_start, *list_start;
+       kmutex_t *lock, *evicted_lock;
        kmutex_t *hash_lock;
        boolean_t have_lock;
        void *stolen = NULL;
+       static int evict_metadata_offset, evict_data_offset;
+       int i, idx, offset, list_count, count;
 
        ASSERT(state == arc_mru || state == arc_mfu);
 
        evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+       
+       if (type == ARC_BUFC_METADATA) {
+               offset = 0;
+               list_count = ARC_BUFC_NUMMETADATALISTS;
+               list_start = &state->arcs_lists[0];
+               evicted_list_start = &evicted_state->arcs_lists[0];
+               idx = evict_metadata_offset;
+       } else {
+               offset = ARC_BUFC_NUMMETADATALISTS;
+
+               list_start = &state->arcs_lists[offset];
+               evicted_list_start = &evicted_state->arcs_lists[offset];
+               list_count = ARC_BUFC_NUMDATALISTS;
+               idx = evict_data_offset;
+       }
+       bytes_remaining = evicted_state->arcs_lsize[type];
+       count = 0;
+       
+evict_start:
+       list = &list_start[idx];
+       evicted_list = &evicted_list_start[idx];
+       lock = ARCS_LOCK(state, (offset + idx));
+       evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 
 
-       mutex_enter(&state->arcs_mtx);
-       mutex_enter(&evicted_state->arcs_mtx);
+       mutex_enter(lock);
+       mutex_enter(evicted_lock);
 
        for (ab = list_tail(list); ab; ab = ab_prev) {
                ab_prev = list_prev(list, ab);
+               bytes_remaining -= (ab->b_size * ab->b_datacnt);
                /* prefetch buffers have a minimum lifespan */
                if (HDR_IO_IN_PROGRESS(ab) ||
                    (spa && ab->b_spa != spa) ||
@@ -1541,18 +1718,36 @@ arc_evict(arc_state_t *state, spa_t *spa
                                mutex_exit(hash_lock);
                        if (bytes >= 0 && bytes_evicted >= bytes)
                                break;
+                       if (bytes_remaining > 0) {
+                               mutex_exit(evicted_lock);
+                               mutex_exit(lock);
+                               idx  = ((idx + 1)&(list_count-1));
+                               count++;
+                               goto evict_start;
+                       }
                } else {
                        missed += 1;
                }
        }
 
-       mutex_exit(&evicted_state->arcs_mtx);
-       mutex_exit(&state->arcs_mtx);
-
-       if (bytes_evicted < bytes)
-               dprintf("only evicted %lld bytes from %x",
-                   (longlong_t)bytes_evicted, state);
+       mutex_exit(evicted_lock);
+       mutex_exit(lock);
+       
+       idx  = ((idx + 1)&(list_count-1));
+       count++;
 
+       if (bytes_evicted < bytes) {
+               if (count < list_count)
+                       goto evict_start;
+               else
+                       dprintf("only evicted %lld bytes from %x",
+                           (longlong_t)bytes_evicted, state);
+       }
+       if (type == ARC_BUFC_METADATA) 
+               evict_metadata_offset = idx;
+       else
+               evict_data_offset = idx;
+               
        if (skipped)
                ARCSTAT_INCR(arcstat_evict_skip, skipped);
 
@@ -1579,6 +1774,8 @@ arc_evict(arc_state_t *state, spa_t *spa
                        arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
                }
        }
+       if (stolen)
+               ARCSTAT_BUMP(arcstat_stolen);
 
        return (stolen);
 }
@@ -1591,14 +1788,28 @@ static void
 arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
 {
        arc_buf_hdr_t *ab, *ab_prev;
-       list_t *list = &state->arcs_list[ARC_BUFC_DATA];
-       kmutex_t *hash_lock;
+       list_t *list, *list_start;
+       kmutex_t *hash_lock, *lock;
        uint64_t bytes_deleted = 0;
        uint64_t bufs_skipped = 0;
+       static int evict_offset;
+       int list_count, idx = evict_offset;
+       int offset, count = 0;
 
        ASSERT(GHOST_STATE(state));
-top:
-       mutex_enter(&state->arcs_mtx);
+
+       /*
+        * data lists come after metadata lists
+        */
+       list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
+       list_count = ARC_BUFC_NUMDATALISTS;
+       offset = ARC_BUFC_NUMMETADATALISTS;
+       
+evict_start:
+       list = &list_start[idx];
+       lock = ARCS_LOCK(state, idx + offset);
+
+       mutex_enter(lock);
        for (ab = list_tail(list); ab; ab = ab_prev) {
                ab_prev = list_prev(list, ab);
                if (spa && ab->b_spa != spa)
@@ -1628,20 +1839,31 @@ top:
                                break;
                } else {
                        if (bytes < 0) {
-                               mutex_exit(&state->arcs_mtx);
+                               /*
+                                * we're draining the ARC, retry
+                                */
+                               mutex_exit(lock);
                                mutex_enter(hash_lock);
                                mutex_exit(hash_lock);
-                               goto top;
+                               goto evict_start;
                        }
                        bufs_skipped += 1;
                }
        }
-       mutex_exit(&state->arcs_mtx);
-
-       if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+       mutex_exit(lock);
+       idx  = ((idx + 1)&(ARC_BUFC_NUMDATALISTS-1));
+       count++;
+       
+       if (count < list_count)
+               goto evict_start;
+       
+       evict_offset = idx;
+       if ((uintptr_t)list > 
(uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
            (bytes < 0 || bytes_deleted < bytes)) {
-               list = &state->arcs_list[ARC_BUFC_METADATA];
-               goto top;
+               list_start = &state->arcs_lists[0];
+               list_count = ARC_BUFC_NUMMETADATALISTS;
+               offset = count = 0;
+               goto evict_start;
        }
 
        if (bufs_skipped) {
@@ -1755,22 +1977,22 @@ restart:        
 void
 arc_flush(spa_t *spa)
 {
-       while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+       while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
                (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
                if (spa)
                        break;
        }
-       while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+       while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
                (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
                if (spa)
                        break;
        }
-       while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+       while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
                (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
                if (spa)
                        break;
        }
-       while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+       while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
                (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
                if (spa)
                        break;
@@ -2206,6 +2428,7 @@ out:
                    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
                        arc_p = MIN(arc_c, arc_p + size);
        }
+       ARCSTAT_BUMP(arcstat_allocated);
 }
 
 /*
@@ -2391,7 +2614,10 @@ arc_read_done(zio_t *zio)
        hdr->b_flags &= ~ARC_L2_EVICTED;
        if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
                hdr->b_flags &= ~ARC_L2CACHE;
-
+#if 0  
+       else if ((hdr->b_flags & ARC_PREFETCH) == 0)
+               hdr->b_flags |= ARC_L2CACHE;
+#endif 
        /* byteswap if necessary */
        callback_list = hdr->b_acb;
        ASSERT(callback_list != NULL);
@@ -2505,6 +2731,7 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_
     uint32_t *arc_flags, const zbookmark_t *zb)
 {
        int err;
+       arc_buf_hdr_t *hdr = pbuf->b_hdr;
 
        ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
        ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@@ -2513,8 +2740,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_
        err = arc_read_nolock(pio, spa, bp, done, private, priority,
            zio_flags, arc_flags, zb);
 
+       ASSERT3P(hdr, ==, pbuf->b_hdr);
        rw_exit(&pbuf->b_lock);
-
        return (err);
 }
 
@@ -2825,7 +3052,9 @@ arc_buf_evict(arc_buf_t *buf)
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
        arc_buf_t **bufp;
-
+       list_t *list, *evicted_list;
+       kmutex_t *lock, *evicted_lock;
+       
        rw_enter(&buf->b_lock, RW_WRITER);
        hdr = buf->b_hdr;
        if (hdr == NULL) {
@@ -2873,16 +3102,18 @@ arc_buf_evict(arc_buf_t *buf)
                evicted_state =
                    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
-               mutex_enter(&old_state->arcs_mtx);
-               mutex_enter(&evicted_state->arcs_mtx);
+               get_buf_info(hdr, old_state, &list, &lock);
+               get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
+               mutex_enter(lock);
+               mutex_enter(evicted_lock);
 
                arc_change_state(evicted_state, hdr, hash_lock);
                ASSERT(HDR_IN_HASH_TABLE(hdr));
                hdr->b_flags |= ARC_IN_HASH_TABLE;
                hdr->b_flags &= ~ARC_BUF_AVAILABLE;
 
-               mutex_exit(&evicted_state->arcs_mtx);
-               mutex_exit(&old_state->arcs_mtx);
+               mutex_exit(evicted_lock);
+               mutex_exit(lock);
        }
        mutex_exit(hash_lock);
        rw_exit(&buf->b_lock);
@@ -3428,7 +3659,8 @@ void
 arc_init(void)
 {
        int prefetch_tunable_set = 0;
-       
+       int i;
+
        mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
        mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -3496,33 +3728,34 @@ arc_init(void)
        arc_l2c_only = &ARC_l2c_only;
        arc_size = 0;
 
-       mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-       mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
-       list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-       list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
-           sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+       for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
+               
+               mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
+                   NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
+                   NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
+                   NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
+                   NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
+                   NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
+                   NULL, MUTEX_DEFAULT, NULL);
+       
+               list_create(&arc_mru->arcs_lists[i],
+                   sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node));
+               list_create(&arc_mru_ghost->arcs_lists[i],
+                   sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node));
+               list_create(&arc_mfu->arcs_lists[i],
+                   sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node));
+               list_create(&arc_mfu_ghost->arcs_lists[i],
+                   sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node));
+               list_create(&arc_mfu_ghost->arcs_lists[i],
+                   sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node));
+               list_create(&arc_l2c_only->arcs_lists[i],
+                   sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node));
+       }
 
        buf_init();
 
@@ -3596,7 +3829,8 @@ arc_init(void)
 void
 arc_fini(void)
 {
-
+       int i;
+       
        mutex_enter(&arc_reclaim_thr_lock);
        arc_thread_exit = 1;
        cv_signal(&arc_reclaim_thr_cv);
@@ -3617,21 +3851,19 @@ arc_fini(void)
        mutex_destroy(&arc_reclaim_thr_lock);
        cv_destroy(&arc_reclaim_thr_cv);
 
-       list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
-       list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
-       list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
-       list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
-       list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
-       mutex_destroy(&arc_anon->arcs_mtx);
-       mutex_destroy(&arc_mru->arcs_mtx);
-       mutex_destroy(&arc_mru_ghost->arcs_mtx);
-       mutex_destroy(&arc_mfu->arcs_mtx);
-       mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-
+       for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
+               list_destroy(&arc_mru->arcs_lists[i]);
+               list_destroy(&arc_mru_ghost->arcs_lists[i]);
+               list_destroy(&arc_mfu->arcs_lists[i]);
+               list_destroy(&arc_mfu_ghost->arcs_lists[i]);
+
+               mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
+               mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
+               mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
+               mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
+               mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
+       }
+       
        mutex_destroy(&zfs_write_limit_lock);
 
        buf_fini();
@@ -4026,28 +4258,31 @@ static list_t *
 l2arc_list_locked(int list_num, kmutex_t **lock)
 {
        list_t *list;
+       int idx;
+       
+       ASSERT(list_num >= 0 && list_num < 2*ARC_BUFC_NUMLISTS);
 
-       ASSERT(list_num >= 0 && list_num <= 3);
-
-       switch (list_num) {
-       case 0:
-               list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
-               *lock = &arc_mfu->arcs_mtx;
-               break;
-       case 1:
-               list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
-               *lock = &arc_mru->arcs_mtx;
-               break;
-       case 2:
-               list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
-               *lock = &arc_mfu->arcs_mtx;
-               break;
-       case 3:
-               list = &arc_mru->arcs_list[ARC_BUFC_DATA];
-               *lock = &arc_mru->arcs_mtx;
-               break;
+       if (list_num < ARC_BUFC_NUMMETADATALISTS) {
+               idx = list_num;
+               list = &arc_mfu->arcs_lists[idx];
+               *lock = ARCS_LOCK(arc_mfu, idx);
+       } else if (list_num < ARC_BUFC_NUMMETADATALISTS*2) {
+               idx = list_num - ARC_BUFC_NUMMETADATALISTS;
+               list = &arc_mru->arcs_lists[idx];
+               *lock = ARCS_LOCK(arc_mru, idx);
+       } else if (list_num < (ARC_BUFC_NUMMETADATALISTS*2 +
+               ARC_BUFC_NUMDATALISTS)) {
+               idx = list_num - ARC_BUFC_NUMMETADATALISTS;
+               list = &arc_mfu->arcs_lists[idx];
+               *lock = ARCS_LOCK(arc_mfu, idx);
+       } else {
+               idx = list_num - ARC_BUFC_NUMLISTS;
+               list = &arc_mru->arcs_lists[idx];
+               *lock = ARCS_LOCK(arc_mru, idx);
        }
 
+       CTR3(KTR_SPARE2, "list=%p list_num=%d idx=%d",
+           list, list_num, idx);
        ASSERT(!(MUTEX_HELD(*lock)));
        mutex_enter(*lock);
        return (list);
@@ -4212,13 +4447,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
        head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
        head->b_flags |= ARC_L2_WRITE_HEAD;
 
+       ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
        /*
         * Copy buffers for L2ARC writing.
         */
        mutex_enter(&l2arc_buflist_mtx);
-       for (try = 0; try <= 3; try++) {
+       for (try = 0; try < 2*ARC_BUFC_NUMLISTS; try++) {
                list = l2arc_list_locked(try, &list_lock);
                passed_sz = 0;
+               ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
 
                /*
                 * L2ARC fast warmup.
@@ -4231,52 +4468,66 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                        ab = list_head(list);
                else
                        ab = list_tail(list);
+               if (ab == NULL) {
+                       ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
+               }
 
                for (; ab; ab = ab_prev) {
                        if (arc_warm == B_FALSE)
                                ab_prev = list_next(list, ab);
                        else
                                ab_prev = list_prev(list, ab);
-
+                       ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
+               
                        hash_lock = HDR_LOCK(ab);
                        have_lock = MUTEX_HELD(hash_lock);
                        if (!have_lock && !mutex_tryenter(hash_lock)) {
+                               ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
                                /*
                                 * Skip this buffer rather than waiting.
                                 */
                                continue;
                        }
 
+                       if (ab->b_l2hdr != NULL) {
+                               /*
+                                * Already in L2ARC.
+                                */
+                               mutex_exit(hash_lock);
+                               ARCSTAT_BUMP(arcstat_l2_write_in_l2);
+                               continue;
+                       }
+
                        passed_sz += ab->b_size;
                        if (passed_sz > headroom) {
                                /*
                                 * Searched too far.
                                 */
                                mutex_exit(hash_lock);
+                               ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
                                break;
                        }
 
                        if (ab->b_spa != spa) {
                                mutex_exit(hash_lock);
+                               ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
                                continue;
                        }
 
-                       if (ab->b_l2hdr != NULL) {
-                               /*
-                                * Already in L2ARC.
-                                */
+                       if (HDR_IO_IN_PROGRESS(ab)) {
                                mutex_exit(hash_lock);
+                               ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
                                continue;
                        }
-
-                       if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+                       if (!HDR_L2CACHE(ab)) {
                                mutex_exit(hash_lock);
+                               ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
                                continue;
                        }
-
                        if ((write_sz + ab->b_size) > target_sz) {
                                full = B_TRUE;
                                mutex_exit(hash_lock);
+                               ARCSTAT_BUMP(arcstat_l2_write_full);
                                break;
                        }
 
@@ -4300,8 +4551,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                                cb->l2wcb_head = head;
                                pio = zio_root(spa, l2arc_write_done, cb,
                                    ZIO_FLAG_CANFAIL);
+                               ARCSTAT_BUMP(arcstat_l2_write_pios);
                        }
 
+                       ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size);
                        /*
                         * Create and add a new L2ARC header.
                         */
@@ -4309,7 +4562,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                        hdrl2->b_dev = dev;
                        hdrl2->b_daddr = dev->l2ad_hand;
 
-                       ab->b_flags |= ARC_L2_WRITING;
                        ab->b_l2hdr = hdrl2;
                        list_insert_head(dev->l2ad_buflist, ab);
                        buf_data = ab->b_buf->b_data;
@@ -4397,7 +4649,7 @@ l2arc_feed_thread(void *dummy __unused)
                 */
                CALLB_CPR_SAFE_BEGIN(&cpr);
                (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-                   hz * l2arc_feed_secs);
+                   hz * l2arc_feed_secs >> l2arc_feed_secs_shift);
                CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 
                /*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h       Tue Mar 16 21:44:21 2010        (r205230)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h       Tue Mar 16 22:17:21 2010        (r205231)
@@ -55,8 +55,8 @@ struct arc_buf {
 };
 
 typedef enum arc_buf_contents {
-       ARC_BUFC_DATA,                          /* buffer contains data */
        ARC_BUFC_METADATA,                      /* buffer contains metadata */
+       ARC_BUFC_DATA,                          /* buffer contains data */
        ARC_BUFC_NUMTYPES
 } arc_buf_contents_t;
 /*
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to