Author: mav
Date: Sat Oct  3 11:11:56 2015
New Revision: 288581
URL: https://svnweb.freebsd.org/changeset/base/288581

Log:
  MFC r286763: 5497 lock contention on arcs_mtx
  
  Reviewed by: George Wilson <george.wil...@delphix.com>
  Reviewed by: Matthew Ahrens <mahr...@delphix.com>
  Reviewed by: Richard Elling <richard.ell...@richardelling.com>
  Approved by: Dan McDonald <dan...@omniti.com>
  Author: Prakash Surya <prakash.su...@delphix.com>
  
  illumos/illumos-gate@244781f10dcd82684fd8163c016540667842f203
  
  This patch attempts to reduce lock contention on the current arc_state_t
  mutexes. These mutexes are used liberally to protect the number of LRU
  lists within the ARC (e.g. ARC_mru, ARC_mfu, etc). The granularity at
  which these locks are acquired has been shown to greatly affect the
  performance of highly concurrent, cached workloads.

Added:
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
     - copied unchanged from r286763, 
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
     - copied unchanged from r286763, 
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
Modified:
  stable/10/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
  stable/10/sys/conf/files
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/Makefile.files    Sat Oct 
 3 11:10:54 2015        (r288580)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/Makefile.files    Sat Oct 
 3 11:11:56 2015        (r288581)
@@ -68,6 +68,7 @@ ZFS_COMMON_OBJS +=            \
        lz4.o                   \
        lzjb.o                  \
        metaslab.o              \
+       multilist.o             \
        range_tree.o            \
        refcount.o              \
        rrwlock.o               \

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Sat Oct 
 3 11:10:54 2015        (r288580)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Sat Oct 
 3 11:11:56 2015        (r288581)
@@ -129,6 +129,7 @@
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
+#include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/dnlc.h>
 #endif
@@ -149,21 +150,39 @@ int arc_procfd;
 #endif
 #endif /* illumos */
 
-static kmutex_t                arc_reclaim_thr_lock;
-static kcondvar_t      arc_reclaim_thr_cv;     /* used to signal reclaim thr */
-static uint8_t         arc_thread_exit;
+static kmutex_t                arc_reclaim_lock;
+static kcondvar_t      arc_reclaim_thread_cv;
+static boolean_t       arc_reclaim_thread_exit;
+static kcondvar_t      arc_reclaim_waiters_cv;
+
+static kmutex_t                arc_user_evicts_lock;
+static kcondvar_t      arc_user_evicts_cv;
+static boolean_t       arc_user_evicts_thread_exit;
 
 uint_t arc_reduce_dnlc_percent = 3;
 
 /*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
  */
-int arc_evict_iterations = 100;
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
 static int             arc_grow_retry = 60;
 
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int            zfs_arc_overflow_shift = 8;
+
 /* shift of arc_c for calculating both min and max arc_p */
 static int             arc_p_min_shift = 4;
 
@@ -319,10 +338,19 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta
  */
 
 typedef struct arc_state {
-       list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
-       uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
-       uint64_t arcs_size;     /* total amount of data in this state */
-       kmutex_t arcs_mtx;
+       /*
+        * list of evictable buffers
+        */
+       multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+       /*
+        * total amount of evictable data in this state
+        */
+       uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+       /*
+        * total amount of data in this state; this includes: evictable,
+        * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+        */
+       uint64_t arcs_size;
 } arc_state_t;
 
 /* The 6 states: */
@@ -350,7 +378,6 @@ typedef struct arc_stats {
        kstat_named_t arcstat_mfu_ghost_hits;
        kstat_named_t arcstat_allocated;
        kstat_named_t arcstat_deleted;
-       kstat_named_t arcstat_recycle_miss;
        /*
         * Number of buffers that could not be evicted because the hash lock
         * was held by another thread.  The lock may not necessarily be held
@@ -364,9 +391,15 @@ typedef struct arc_stats {
         * not from the spa we're trying to evict from.
         */
        kstat_named_t arcstat_evict_skip;
+       /*
+        * Number of times arc_evict_state() was unable to evict enough
+        * buffers to reach its target amount.
+        */
+       kstat_named_t arcstat_evict_not_enough;
        kstat_named_t arcstat_evict_l2_cached;
        kstat_named_t arcstat_evict_l2_eligible;
        kstat_named_t arcstat_evict_l2_ineligible;
+       kstat_named_t arcstat_evict_l2_skip;
        kstat_named_t arcstat_hash_elements;
        kstat_named_t arcstat_hash_elements_max;
        kstat_named_t arcstat_hash_collisions;
@@ -517,7 +550,7 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_writes_sent;
        kstat_named_t arcstat_l2_writes_done;
        kstat_named_t arcstat_l2_writes_error;
-       kstat_named_t arcstat_l2_writes_hdr_miss;
+       kstat_named_t arcstat_l2_writes_lock_retry;
        kstat_named_t arcstat_l2_evict_lock_retry;
        kstat_named_t arcstat_l2_evict_reading;
        kstat_named_t arcstat_l2_evict_l1cached;
@@ -571,12 +604,13 @@ static arc_stats_t arc_stats = {
        { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
        { "allocated",                  KSTAT_DATA_UINT64 },
        { "deleted",                    KSTAT_DATA_UINT64 },
-       { "recycle_miss",               KSTAT_DATA_UINT64 },
        { "mutex_miss",                 KSTAT_DATA_UINT64 },
        { "evict_skip",                 KSTAT_DATA_UINT64 },
+       { "evict_not_enough",           KSTAT_DATA_UINT64 },
        { "evict_l2_cached",            KSTAT_DATA_UINT64 },
        { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
        { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
+       { "evict_l2_skip",              KSTAT_DATA_UINT64 },
        { "hash_elements",              KSTAT_DATA_UINT64 },
        { "hash_elements_max",          KSTAT_DATA_UINT64 },
        { "hash_collisions",            KSTAT_DATA_UINT64 },
@@ -615,7 +649,7 @@ static arc_stats_t arc_stats = {
        { "l2_writes_sent",             KSTAT_DATA_UINT64 },
        { "l2_writes_done",             KSTAT_DATA_UINT64 },
        { "l2_writes_error",            KSTAT_DATA_UINT64 },
-       { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
+       { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
        { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
        { "l2_evict_reading",           KSTAT_DATA_UINT64 },
        { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
@@ -792,7 +826,7 @@ typedef struct l1arc_buf_hdr {
 
        /* protected by arc state mutex */
        arc_state_t             *b_state;
-       list_node_t             b_arc_node;
+       multilist_node_t        b_arc_node;
 
        /* updated atomically */
        clock_t                 b_arc_access;
@@ -863,7 +897,6 @@ sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HAN
 #endif
 
 static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
 
 #define        GHOST_STATE(state)      \
@@ -1092,8 +1125,7 @@ static uint8_t l2arc_thread_exit;
 
 static void arc_get_data_buf(arc_buf_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static int arc_evict_needed(arc_buf_contents_t);
-static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
+static boolean_t arc_is_overflowing();
 static void arc_buf_watch(arc_buf_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
@@ -1274,6 +1306,7 @@ hdr_full_cons(void *vbuf, void *unused, 
        cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
        refcount_create(&hdr->b_l1hdr.b_refcnt);
        mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+       multilist_link_init(&hdr->b_l1hdr.b_arc_node);
        arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 
        return (0);
@@ -1318,6 +1351,7 @@ hdr_full_dest(void *vbuf, void *unused)
        cv_destroy(&hdr->b_l1hdr.b_cv);
        refcount_destroy(&hdr->b_l1hdr.b_refcnt);
        mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+       ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
        arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
@@ -1354,7 +1388,7 @@ hdr_recl(void *unused)
         * which is after we do arc_fini().
         */
        if (!arc_dead)
-               cv_signal(&arc_reclaim_thr_cv);
+               cv_signal(&arc_reclaim_thread_cv);
 }
 
 static void
@@ -1433,18 +1467,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem
                 * l2c_only even though it's about to change.
                 */
                nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+               /* Verify previous threads set to NULL before freeing */
+               ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
        } else {
                ASSERT(hdr->b_l1hdr.b_buf == NULL);
                ASSERT0(hdr->b_l1hdr.b_datacnt);
-               ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
                /*
-                * We might be removing the L1hdr of a buffer which was just
-                * written out to L2ARC. If such a buffer is compressed then we
-                * need to free its b_tmp_cdata before destroying the header.
-                */
-               if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
-                   HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
-                       l2arc_release_cdata_buf(hdr);
+                * If we've reached here, we must have been called from
+                * arc_evict_hdr(), as such we should have already been
+                * removed from any ghost list we were previously on
+                * (which protects us from racing with arc_evict_state),
+                * thus no locking is needed during this check.
+                */
+               ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+               /*
+                * A buffer must not be moved into the arc_l2c_only
+                * state if it's not finished being written out to the
+                * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+                * might try to be accessed, even though it was removed.
+                */
+               VERIFY(!HDR_L2_WRITING(hdr));
+               VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
                nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
        }
        /*
@@ -1677,14 +1724,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex
            (state != arc_anon)) {
                /* We don't use the L2-only state list. */
                if (state != arc_l2c_only) {
+                       arc_buf_contents_t type = arc_buf_type(hdr);
                        uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
-                       list_t *list = &state->arcs_list[arc_buf_type(hdr)];
-                       uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+                       multilist_t *list = &state->arcs_list[type];
+                       uint64_t *size = &state->arcs_lsize[type];
+
+                       multilist_remove(list, hdr);
 
-                       ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-                       mutex_enter(&state->arcs_mtx);
-                       ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
-                       list_remove(list, hdr);
                        if (GHOST_STATE(state)) {
                                ASSERT0(hdr->b_l1hdr.b_datacnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -1693,7 +1739,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex
                        ASSERT(delta > 0);
                        ASSERT3U(*size, >=, delta);
                        atomic_add_64(size, -delta);
-                       mutex_exit(&state->arcs_mtx);
                }
                /* remove the prefetch flag if we get a reference */
                hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -1716,22 +1761,21 @@ remove_reference(arc_buf_hdr_t *hdr, kmu
         */
        if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
            (state != arc_anon)) {
-               uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+               arc_buf_contents_t type = arc_buf_type(hdr);
+               multilist_t *list = &state->arcs_list[type];
+               uint64_t *size = &state->arcs_lsize[type];
+
+               multilist_insert(list, hdr);
 
-               ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-               mutex_enter(&state->arcs_mtx);
-               ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
-               list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
                ASSERT(hdr->b_l1hdr.b_datacnt > 0);
                atomic_add_64(size, hdr->b_size *
                    hdr->b_l1hdr.b_datacnt);
-               mutex_exit(&state->arcs_mtx);
        }
        return (cnt);
 }
 
 /*
- * Move the supplied buffer to the indicated state.  The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
@@ -1775,15 +1819,10 @@ arc_change_state(arc_state_t *new_state,
         */
        if (refcnt == 0) {
                if (old_state != arc_anon && old_state != arc_l2c_only) {
-                       int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
                        uint64_t *size = &old_state->arcs_lsize[buftype];
 
-                       if (use_mutex)
-                               mutex_enter(&old_state->arcs_mtx);
-
                        ASSERT(HDR_HAS_L1HDR(hdr));
-                       ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
-                       list_remove(&old_state->arcs_list[buftype], hdr);
+                       multilist_remove(&old_state->arcs_list[buftype], hdr);
 
                        /*
                         * If prefetching out of the ghost cache,
@@ -1796,12 +1835,8 @@ arc_change_state(arc_state_t *new_state,
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);
-
-                       if (use_mutex)
-                               mutex_exit(&old_state->arcs_mtx);
                }
                if (new_state != arc_anon && new_state != arc_l2c_only) {
-                       int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
                        uint64_t *size = &new_state->arcs_lsize[buftype];
 
                        /*
@@ -1811,10 +1846,7 @@ arc_change_state(arc_state_t *new_state,
                         * beforehand.
                         */
                        ASSERT(HDR_HAS_L1HDR(hdr));
-                       if (use_mutex)
-                               mutex_enter(&new_state->arcs_mtx);
-
-                       list_insert_head(&new_state->arcs_list[buftype], hdr);
+                       multilist_insert(&new_state->arcs_list[buftype], hdr);
 
                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
@@ -1823,9 +1855,6 @@ arc_change_state(arc_state_t *new_state,
                                to_delta = hdr->b_size;
                        }
                        atomic_add_64(size, to_delta);
-
-                       if (use_mutex)
-                               mutex_exit(&new_state->arcs_mtx);
                }
        }
 
@@ -1847,8 +1876,8 @@ arc_change_state(arc_state_t *new_state,
         * L2 headers should never be on the L2 state list since they don't
         * have L1 headers allocated.
         */
-       ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
-           list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+       ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+           multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 void
@@ -1941,6 +1970,7 @@ arc_buf_alloc(spa_t *spa, int32_t size, 
        hdr->b_l1hdr.b_state = arc_anon;
        hdr->b_l1hdr.b_arc_access = 0;
        hdr->b_l1hdr.b_datacnt = 1;
+       hdr->b_l1hdr.b_tmp_cdata = NULL;
 
        arc_get_data_buf(buf);
        ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -2076,7 +2106,7 @@ arc_buf_free_on_write(void *data, size_t
 {
        l2arc_data_free_t *df;
 
-       df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+       df = kmem_alloc(sizeof (*df), KM_SLEEP);
        df->l2df_data = data;
        df->l2df_size = size;
        df->l2df_func = free_func;
@@ -2120,19 +2150,49 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr
        if (!HDR_HAS_L1HDR(hdr))
                return;
 
-       if (hdr->b_l1hdr.b_tmp_cdata == NULL)
+       /*
+        * The header isn't being written to the l2arc device, thus it
+        * shouldn't have a b_tmp_cdata to free.
+        */
+       if (!HDR_L2_WRITING(hdr)) {
+               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+               return;
+       }
+
+       /*
+        * The header does not have compression enabled. This can be due
+        * to the buffer not being compressible, or because we're
+        * freeing the buffer before the second phase of
+        * l2arc_write_buffer() has started (which does the compression
+        * step). In either case, b_tmp_cdata does not point to a
+        * separately compressed buffer, so there's nothing to free (it
+        * points to the same buffer as the arc_buf_t's b_data field).
+        */
+       if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+               hdr->b_l1hdr.b_tmp_cdata = NULL;
+               return;
+       }
+
+       /*
+        * There's nothing to free since the buffer was all zero's and
+        * compressed to a zero length buffer.
+        */
+       if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
                return;
+       }
+
+       ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
 
-       ASSERT(HDR_L2_WRITING(hdr));
-       arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
-           zio_data_buf_free);
+       arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+           hdr->b_size, zio_data_buf_free);
 
        ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
        hdr->b_l1hdr.b_tmp_cdata = NULL;
 }
 
 static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
 {
        arc_buf_t **bufp;
 
@@ -2147,17 +2207,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
                arc_buf_unwatch(buf);
 #endif /* illumos */
 
-               if (!recycle) {
-                       if (type == ARC_BUFC_METADATA) {
-                               arc_buf_data_free(buf, zio_buf_free);
-                               arc_space_return(size, ARC_SPACE_META);
-                       } else {
-                               ASSERT(type == ARC_BUFC_DATA);
-                               arc_buf_data_free(buf, zio_data_buf_free);
-                               arc_space_return(size, ARC_SPACE_DATA);
-                       }
+               if (type == ARC_BUFC_METADATA) {
+                       arc_buf_data_free(buf, zio_buf_free);
+                       arc_space_return(size, ARC_SPACE_META);
+               } else {
+                       ASSERT(type == ARC_BUFC_DATA);
+                       arc_buf_data_free(buf, zio_data_buf_free);
+                       arc_space_return(size, ARC_SPACE_DATA);
                }
-               if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
+
+               /* protected by hash lock, if in the hash table */
+               if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
                        uint64_t *cnt = &state->arcs_lsize[type];
 
                        ASSERT(refcount_is_zero(
@@ -2305,20 +2365,19 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
                        arc_buf_t *buf = hdr->b_l1hdr.b_buf;
 
                        if (buf->b_efunc != NULL) {
-                               mutex_enter(&arc_eviction_mtx);
+                               mutex_enter(&arc_user_evicts_lock);
                                mutex_enter(&buf->b_evict_lock);
                                ASSERT(buf->b_hdr != NULL);
-                               arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
-                                   FALSE);
+                               arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
                                hdr->b_l1hdr.b_buf = buf->b_next;
                                buf->b_hdr = &arc_eviction_hdr;
                                buf->b_next = arc_eviction_list;
                                arc_eviction_list = buf;
                                mutex_exit(&buf->b_evict_lock);
-                               mutex_exit(&arc_eviction_mtx);
+                               cv_signal(&arc_user_evicts_cv);
+                               mutex_exit(&arc_user_evicts_lock);
                        } else {
-                               arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
-                                   TRUE);
+                               arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
                        }
                }
 #ifdef ZFS_DEBUG
@@ -2331,7 +2390,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 
        ASSERT3P(hdr->b_hash_next, ==, NULL);
        if (HDR_HAS_L1HDR(hdr)) {
-               ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+               ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
                ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
                kmem_cache_free(hdr_full_cache, hdr);
        } else {
@@ -2357,7 +2416,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 
                (void) remove_reference(hdr, hash_lock, tag);
                if (hdr->b_l1hdr.b_datacnt > 1) {
-                       arc_buf_destroy(buf, FALSE, TRUE);
+                       arc_buf_destroy(buf, TRUE);
                } else {
                        ASSERT(buf == hdr->b_l1hdr.b_buf);
                        ASSERT(buf->b_efunc == NULL);
@@ -2371,16 +2430,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
                 * this buffer unless the write completes before we finish
                 * decrementing the reference count.
                 */
-               mutex_enter(&arc_eviction_mtx);
+               mutex_enter(&arc_user_evicts_lock);
                (void) remove_reference(hdr, NULL, tag);
                ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
                destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-               mutex_exit(&arc_eviction_mtx);
+               mutex_exit(&arc_user_evicts_lock);
                if (destroy_hdr)
                        arc_hdr_destroy(hdr);
        } else {
                if (remove_reference(hdr, NULL, tag) > 0)
-                       arc_buf_destroy(buf, FALSE, TRUE);
+                       arc_buf_destroy(buf, TRUE);
                else
                        arc_hdr_destroy(hdr);
        }
@@ -2409,7 +2468,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void*
        (void) remove_reference(hdr, hash_lock, tag);
        if (hdr->b_l1hdr.b_datacnt > 1) {
                if (no_callback)
-                       arc_buf_destroy(buf, FALSE, TRUE);
+                       arc_buf_destroy(buf, TRUE);
        } else if (no_callback) {
                ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
                ASSERT(buf->b_efunc == NULL);
@@ -2470,418 +2529,675 @@ arc_buf_eviction_needed(arc_buf_t *buf)
 }
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
  *
- * This function makes a "best effort".  It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ *    - arc_mru -> arc_mru_ghost
+ *    - arc_mfu -> arc_mfu_ghost
+ *    - arc_mru_ghost -> arc_l2c_only
+ *    - arc_mru_ghost -> deleted
+ *    - arc_mfu_ghost -> arc_l2c_only
+ *    - arc_mfu_ghost -> deleted
  */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
-       arc_state_t *evicted_state;
-       uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-       arc_buf_hdr_t *hdr, *hdr_prev = NULL;
-       kmutex_t *hash_lock;
-       boolean_t have_lock;
-       void *stolen = NULL;
-       arc_buf_hdr_t marker = { 0 };
-       int count = 0;
+       arc_state_t *evicted_state, *state;
+       int64_t bytes_evicted = 0;
 
-       ASSERT(state == arc_mru || state == arc_mfu);
-
-       evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+       ASSERT(MUTEX_HELD(hash_lock));
+       ASSERT(HDR_HAS_L1HDR(hdr));
 
-       /*
-        * The ghost list lock must be acquired first in order to prevent
-        * a 3 party deadlock:
-        *
-        *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
-        *    l2ad_mtx in arc_hdr_realloc
-        *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
-        *  - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
-        *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
-        *
-        * This situation is avoided by acquiring the ghost list lock first.
-        */
-       mutex_enter(&evicted_state->arcs_mtx);
-       mutex_enter(&state->arcs_mtx);
+       state = hdr->b_l1hdr.b_state;
+       if (GHOST_STATE(state)) {
+               ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+               ASSERT(hdr->b_l1hdr.b_buf == NULL);
 
-       /*
-        * Decide which "type" (data vs metadata) to recycle from.
-        *
-        * If we are over the metadata limit, recycle from metadata.
-        * If we are under the metadata minimum, recycle from data.
-        * Otherwise, recycle from whichever type has the oldest (least
-        * recently accessed) header.
-        */
-       if (recycle) {
-               arc_buf_hdr_t *data_hdr =
-                   list_tail(&state->arcs_list[ARC_BUFC_DATA]);
-               arc_buf_hdr_t *metadata_hdr =
-                   list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
-               arc_buf_contents_t realtype;
-
-               if (data_hdr == NULL) {
-                       realtype = ARC_BUFC_METADATA;
-               } else if (metadata_hdr == NULL) {
-                       realtype = ARC_BUFC_DATA;
-               } else if (arc_meta_used >= arc_meta_limit) {
-                       realtype = ARC_BUFC_METADATA;
-               } else if (arc_meta_used <= arc_meta_min) {
-                       realtype = ARC_BUFC_DATA;
-               } else if (HDR_HAS_L1HDR(data_hdr) &&
-                   HDR_HAS_L1HDR(metadata_hdr) &&
-                   data_hdr->b_l1hdr.b_arc_access <
-                   metadata_hdr->b_l1hdr.b_arc_access) {
-                       realtype = ARC_BUFC_DATA;
-               } else {
-                       realtype = ARC_BUFC_METADATA;
+               /*
+                * l2arc_write_buffers() relies on a header's L1 portion
+                * (i.e. its b_tmp_cdata field) during its write phase.
+                * Thus, we cannot push a header onto the arc_l2c_only
+                * state (removing its L1 piece) until the header is
+                * done being written to the l2arc.
+                */
+               if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+                       ARCSTAT_BUMP(arcstat_evict_l2_skip);
+                       return (bytes_evicted);
                }
-               if (realtype != type) {
+
+               ARCSTAT_BUMP(arcstat_deleted);
+               bytes_evicted += hdr->b_size;
+
+               DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+               if (HDR_HAS_L2HDR(hdr)) {
                        /*
-                        * If we want to evict from a different list,
-                        * we can not recycle, because DATA vs METADATA
-                        * buffers are segregated into different kmem
-                        * caches (and vmem arenas).
+                        * This buffer is cached on the 2nd Level ARC;
+                        * don't destroy the header.
                         */
-                       type = realtype;
-                       recycle = B_FALSE;
+                       arc_change_state(arc_l2c_only, hdr, hash_lock);
+                       /*
+                        * dropping from L1+L2 cached to L2-only,
+                        * realloc to remove the L1 header.
+                        */
+                       hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+                           hdr_l2only_cache);
+               } else {
+                       arc_change_state(arc_anon, hdr, hash_lock);
+                       arc_hdr_destroy(hdr);
                }
+               return (bytes_evicted);
        }
 
-       list_t *list = &state->arcs_list[type];
+       ASSERT(state == arc_mru || state == arc_mfu);
+       evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
-       for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
-               hdr_prev = list_prev(list, hdr);
-               /* prefetch buffers have a minimum lifespan */
-               if (HDR_IO_IN_PROGRESS(hdr) ||
-                   (spa && hdr->b_spa != spa) ||
-                   ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-                   ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-                   arc_min_prefetch_lifespan)) {
-                       skipped++;
-                       continue;
+       /* prefetch buffers have a minimum lifespan */
+       if (HDR_IO_IN_PROGRESS(hdr) ||
+           ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+           arc_min_prefetch_lifespan)) {
+               ARCSTAT_BUMP(arcstat_evict_skip);
+               return (bytes_evicted);
+       }
+
+       ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+       ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+       while (hdr->b_l1hdr.b_buf) {
+               arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+               if (!mutex_tryenter(&buf->b_evict_lock)) {
+                       ARCSTAT_BUMP(arcstat_mutex_miss);
+                       break;
                }
-               /* "lookahead" for better eviction candidate */
-               if (recycle && hdr->b_size != bytes &&
-                   hdr_prev && hdr_prev->b_size == bytes)
-                       continue;
+               if (buf->b_data != NULL)
+                       bytes_evicted += hdr->b_size;
+               if (buf->b_efunc != NULL) {
+                       mutex_enter(&arc_user_evicts_lock);
+                       arc_buf_destroy(buf, FALSE);
+                       hdr->b_l1hdr.b_buf = buf->b_next;
+                       buf->b_hdr = &arc_eviction_hdr;
+                       buf->b_next = arc_eviction_list;
+                       arc_eviction_list = buf;
+                       cv_signal(&arc_user_evicts_cv);
+                       mutex_exit(&arc_user_evicts_lock);
+                       mutex_exit(&buf->b_evict_lock);
+               } else {
+                       mutex_exit(&buf->b_evict_lock);
+                       arc_buf_destroy(buf, TRUE);
+               }
+       }
 
-               /* ignore markers */
-               if (hdr->b_spa == 0)
-                       continue;
+       if (HDR_HAS_L2HDR(hdr)) {
+               ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+       } else {
+               if (l2arc_write_eligible(hdr->b_spa, hdr))
+                       ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+               else
+                       ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+       }
+
+       if (hdr->b_l1hdr.b_datacnt == 0) {
+               arc_change_state(evicted_state, hdr, hash_lock);
+               ASSERT(HDR_IN_HASH_TABLE(hdr));
+               hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+               hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+               DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+       }
+
+       return (bytes_evicted);
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+    uint64_t spa, int64_t bytes)
+{
+       multilist_sublist_t *mls;
+       uint64_t bytes_evicted = 0;
+       arc_buf_hdr_t *hdr;
+       kmutex_t *hash_lock;
+       int evict_count = 0;
+
+       ASSERT3P(marker, !=, NULL);
+       IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+       mls = multilist_sublist_lock(ml, idx);
+
+       for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+           hdr = multilist_sublist_prev(mls, marker)) {
+               if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+                   (evict_count >= zfs_arc_evict_batch_limit))
+                       break;
 
                /*
-                * It may take a long time to evict all the bufs requested.
-                * To avoid blocking all arc activity, periodically drop
-                * the arcs_mtx and give other threads a chance to run
-                * before reacquiring the lock.
-                *
-                * If we are looking for a buffer to recycle, we are in
-                * the hot code path, so don't sleep.
+                * To keep our iteration location, move the marker
+                * forward. Since we're not holding hdr's hash lock, we
+                * must be very careful and not remove 'hdr' from the
+                * sublist. Otherwise, other consumers might mistake the
+                * 'hdr' as not being on a sublist when they call the
+                * multilist_link_active() function (they all rely on
+                * the hash lock protecting concurrent insertions and
+                * removals). multilist_sublist_move_forward() was
+                * specifically implemented to ensure this is the case
+                * (only 'marker' will be removed and re-inserted).
+                */
+               multilist_sublist_move_forward(mls, marker);
+
+               /*
+                * The only case where the b_spa field should ever be
+                * zero, is the marker headers inserted by
+                * arc_evict_state(). It's possible for multiple threads
+                * to be calling arc_evict_state() concurrently (e.g.
+                * dsl_pool_close() and zio_inject_fault()), so we must
+                * skip any markers we see from these other threads.
                 */
-               if (!recycle && count++ > arc_evict_iterations) {
-                       list_insert_after(list, hdr, &marker);
-                       mutex_exit(&state->arcs_mtx);
-                       mutex_exit(&evicted_state->arcs_mtx);
-                       kpreempt(KPREEMPT_SYNC);
-                       mutex_enter(&evicted_state->arcs_mtx);
-                       mutex_enter(&state->arcs_mtx);
-                       hdr_prev = list_prev(list, &marker);
-                       list_remove(list, &marker);
-                       count = 0;
+               if (hdr->b_spa == 0)
+                       continue;
+
+               /* we're only interested in evicting buffers of a certain spa */
+               if (spa != 0 && hdr->b_spa != spa) {
+                       ARCSTAT_BUMP(arcstat_evict_skip);
                        continue;
                }
 
                hash_lock = HDR_LOCK(hdr);
-               have_lock = MUTEX_HELD(hash_lock);
-               if (have_lock || mutex_tryenter(hash_lock)) {
-                       ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
-                       ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
-                       while (hdr->b_l1hdr.b_buf) {
-                               arc_buf_t *buf = hdr->b_l1hdr.b_buf;
-                               if (!mutex_tryenter(&buf->b_evict_lock)) {
-                                       missed += 1;
-                                       break;
-                               }
-                               if (buf->b_data != NULL) {
-                                       bytes_evicted += hdr->b_size;
-                                       if (recycle &&
-                                           arc_buf_type(hdr) == type &&
-                                           hdr->b_size == bytes &&
-                                           !HDR_L2_WRITING(hdr)) {
-                                               stolen = buf->b_data;
-                                               recycle = FALSE;
-                                       }
-                               }
-                               if (buf->b_efunc != NULL) {
-                                       mutex_enter(&arc_eviction_mtx);
-                                       arc_buf_destroy(buf,
-                                           buf->b_data == stolen, FALSE);
-                                       hdr->b_l1hdr.b_buf = buf->b_next;
-                                       buf->b_hdr = &arc_eviction_hdr;
-                                       buf->b_next = arc_eviction_list;
-                                       arc_eviction_list = buf;
-                                       mutex_exit(&arc_eviction_mtx);
-                                       mutex_exit(&buf->b_evict_lock);
-                               } else {
-                                       mutex_exit(&buf->b_evict_lock);
-                                       arc_buf_destroy(buf,
-                                           buf->b_data == stolen, TRUE);
-                               }
-                       }
 
-                       if (HDR_HAS_L2HDR(hdr)) {
-                               ARCSTAT_INCR(arcstat_evict_l2_cached,
-                                   hdr->b_size);
-                       } else {
-                               if (l2arc_write_eligible(hdr->b_spa, hdr)) {
-                                       ARCSTAT_INCR(arcstat_evict_l2_eligible,
-                                           hdr->b_size);
-                               } else {
-                                       ARCSTAT_INCR(
-                                           arcstat_evict_l2_ineligible,
-                                           hdr->b_size);
-                               }
-                       }
+               /*
+                * We aren't calling this function from any code path
+                * that would already be holding a hash lock, so we're
+                * asserting on this assumption to be defensive in case
+                * this ever changes. Without this check, it would be
+                * possible to incorrectly increment arcstat_mutex_miss
+                * below (e.g. if the code changed such that we called
+                * this function with a hash lock held).
+                */
+               ASSERT(!MUTEX_HELD(hash_lock));
 
-                       if (hdr->b_l1hdr.b_datacnt == 0) {
-                               arc_change_state(evicted_state, hdr, hash_lock);
-                               ASSERT(HDR_IN_HASH_TABLE(hdr));
-                               hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
-                               hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
-                               DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
-                       }
-                       if (!have_lock)
-                               mutex_exit(hash_lock);
-                       if (bytes >= 0 && bytes_evicted >= bytes)
-                               break;
+               if (mutex_tryenter(hash_lock)) {
+                       uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+                       mutex_exit(hash_lock);
+
+                       bytes_evicted += evicted;
+
+                       /*
+                        * If evicted is zero, arc_evict_hdr() must have
+                        * decided to skip this header, don't increment
+                        * evict_count in this case.
+                        */
+                       if (evicted != 0)
+                               evict_count++;
+
+                       /*
+                        * If arc_size isn't overflowing, signal any
+                        * threads that might happen to be waiting.
+                        *
+                        * For each header evicted, we wake up a single
+                        * thread. If we used cv_broadcast, we could
+                        * wake up "too many" threads causing arc_size
+                        * to significantly overflow arc_c; since
+                        * arc_get_data_buf() doesn't check for overflow
+                        * when it's woken up (it doesn't because it's
+                        * possible for the ARC to be overflowing while
+                        * full of un-evictable buffers, and the
+                        * function should proceed in this case).
+                        *
+                        * If threads are left sleeping, due to not
+                        * using cv_broadcast, they will be woken up
+                        * just before arc_reclaim_thread() sleeps.
+                        */
+                       mutex_enter(&arc_reclaim_lock);
+                       if (!arc_is_overflowing())
+                               cv_signal(&arc_reclaim_waiters_cv);
+                       mutex_exit(&arc_reclaim_lock);
                } else {
-                       missed += 1;
+                       ARCSTAT_BUMP(arcstat_mutex_miss);
                }
        }
 
-       mutex_exit(&state->arcs_mtx);
-       mutex_exit(&evicted_state->arcs_mtx);
+       multilist_sublist_unlock(mls);
 
-       if (bytes_evicted < bytes)
-               dprintf("only evicted %lld bytes from %x",
-                   (longlong_t)bytes_evicted, state);
+       return (bytes_evicted);
+}
+
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
+{
+       uint64_t total_evicted = 0;
+       multilist_t *ml = &state->arcs_list[type];
+       int num_sublists;
+       arc_buf_hdr_t **markers;
 
-       if (skipped)
-               ARCSTAT_INCR(arcstat_evict_skip, skipped);
+       IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
 
-       if (missed)
-               ARCSTAT_INCR(arcstat_mutex_miss, missed);
+       num_sublists = multilist_get_num_sublists(ml);
 
        /*
-        * Note: we have just evicted some data into the ghost state,
-        * potentially putting the ghost size over the desired size.  Rather
-        * that evicting from the ghost list in this hot code path, leave
-        * this chore to the arc_reclaim_thread().
+        * If we've tried to evict from each sublist, made some
+        * progress, but still have not hit the target number of bytes
+        * to evict, we want to keep trying. The markers allow us to
+        * pick up where we left off for each individual sublist, rather
+        * than starting from the tail each time.
         */
+       markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+       for (int i = 0; i < num_sublists; i++) {
+               markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
 
-       return (stolen);
-}
+               /*
+                * A b_spa of 0 is used to indicate that this header is
+                * a marker. This fact is used in arc_adjust_type() and
+                * arc_evict_state_impl().
+                */
+               markers[i]->b_spa = 0;
 
-/*
- * Remove buffers from list until we've removed the specified number of
- * bytes.  Destroy the buffers that are removed.
- */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-stable-10@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-stable-10
To unsubscribe, send any mail to "svn-src-stable-10-unsubscr...@freebsd.org"

Reply via email to