Author: mav Date: Sat Oct 3 07:27:58 2015 New Revision: 288547 URL: https://svnweb.freebsd.org/changeset/base/288547
Log: MFC r286570: 5408 managing ZFS cache devices requires lots of RAM Reviewed by: Christopher Siden <christopher.si...@delphix.com> Reviewed by: George Wilson <george.wil...@delphix.com> Reviewed by: Matthew Ahrens <mahr...@delphix.com> Reviewed by: Don Brady <dev.fs....@gmail.com> Reviewed by: Josef 'Jeff' Sipek <josef.si...@nexenta.com> Approved by: Garrett D'Amore <garr...@damore.org> Author: Chris Williamson <chris.william...@delphix.com> illumos/illumos-gate@89c86e32293a30cdd7af530c38b2073fee01411c Currently, every buffer cached in the L2ARC is accompanied by a 240-byte header in memory, leading to very high memory consumption when using very large cache devices. These changes significantly reduce this overhead. Currently: L1-only header = 176 bytes L1 + L2 or L2-only header = 176 bytes + 32 byte checksum + 32 byte l2hdr = 240 bytes Memory-optimized: L1-only header = 176 bytes L1 + L2 header = 176 bytes + 32 byte checksum = 208 bytes L2-only header = 96 bytes + 32 byte checksum = 128 bytes So overall: Trunk Optimized +-----------------+ L1-only | 176 B | 176 B | (same) +-----------------+ L1 & L2 | 240 B | 208 B | (saved 32 bytes) +-----------------+ L2-only | 240 B | 128 B | (saved 116 bytes) +-----------------+ For an average blocksize of 8KB, this means that for the L2ARC, the ratio of metadata to data has gone down from about 2.92% to 1.56%. For a 'storage optimized' EC2 instance with 1600GB of SSD and 60GB of RAM, this means that we expect a completely full L2ARC to use (1600 GB * 0.0156) / 60GB = 41% of the available memory, down from 78%. Relnotes: yes Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Sat Oct 3 07:26:48 2015 (r288546) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Sat Oct 3 07:27:58 2015 (r288547) @@ -111,7 +111,7 @@ * Note that the majority of the performance stats are manipulated * with atomic operations. * - * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction @@ -402,6 +402,7 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; @@ -484,6 +485,7 @@ static arc_stats_t arc_stats = { { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, @@ -588,8 +590,6 @@ static int arc_no_grow; /* Don't try to static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -610,29 +610,53 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * used for debugging wtih kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. + */ void *b_thawed; +#endif - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; - /* protected by arc state mutex */ arc_state_t *b_state; list_node_t b_arc_node; @@ -643,8 +667,46 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; +} l1arc_buf_hdr_t; + +typedef struct l2arc_dev l2arc_dev_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + int32_t b_asize; + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; }; #ifdef _KERNEL @@ -681,22 +743,38 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) \ - ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) + #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 24 +#define HDR_COMPRESS_NBITS 7 + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) + /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines @@ -820,7 +898,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_onl /* * L2ARC Internals */ -typedef struct l2arc_dev { +struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ @@ -829,15 +907,15 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ -} l2arc_dev_t; +}; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -857,18 +935,6 @@ typedef struct l2arc_write_callback { arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - /* compression applied to buffer data */ - enum zio_compress b_compress; - /* real alloc'd buffer size depending on b_compress applied */ - int b_asize; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; -}; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; @@ -887,12 +953,13 @@ static int arc_evict_needed(arc_buf_cont static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); static void arc_buf_watch(arc_buf_t *); +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); + static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); @@ -915,8 +982,7 @@ buf_hash(uint64_t spa, const dva_t *dva, #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_cksum0 == 0) + (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -929,7 +995,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr) hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; - hdr->b_cksum0 = 0; } static arc_buf_hdr_t * @@ -959,6 +1024,7 @@ buf_hash_find(uint64_t spa, const blkptr * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) @@ -971,8 +1037,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmut ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - *lockp = hash_lock; - mutex_enter(hash_lock); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) @@ -1027,7 +1099,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. */ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void @@ -1039,7 +1112,8 @@ buf_fini(void) (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1049,15 +1123,27 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, sizeof (arc_buf_hdr_t)); - refcount_create(&hdr->b_refcnt); - cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -1081,15 +1167,25 @@ buf_cons(void *vbuf, void *unused, int k */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); - refcount_destroy(&hdr->b_refcnt); - cv_destroy(&hdr->b_cv); - mutex_destroy(&hdr->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + ASSERT(BUF_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ @@ -1143,8 +1239,11 @@ retry: goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -1158,6 +1257,81 @@ retry: } } +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. + */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + } else { + ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* + * We might be removing the L1hdr of a buffer which was just + * written out to L2ARC. If such a buffer is compressed then we + * need to free its b_tmp_cdata before destroying the header. + */ + if (hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + buf_discard_identity(hdr); + hdr->b_freeze_cksum = NULL; + kmem_cache_free(old, hdr); + + return (nhdr); +} + + #define ARC_MINTIME (hz>>4) /* 62 ms */ static void @@ -1168,16 +1342,15 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int @@ -1186,10 +1359,10 @@ arc_cksum_equal(arc_buf_t *buf) zio_cksum_t zc; int equal; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } @@ -1200,15 +1373,15 @@ arc_cksum_compute(arc_buf_t *buf, boolea if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif /* illumos */ @@ -1259,30 +1432,58 @@ arc_buf_watch(arc_buf_t *buf) } #endif /* illumos */ +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + if (HDR_ISTYPE_METADATA(hdr)) { + return (ARC_BUFC_METADATA); + } else { + return (ARC_BUFC_DATA); + } +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) + if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } +#ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_thawed) - kmem_free(buf->b_hdr->b_thawed, 1); - buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + if (buf->b_hdr->b_l1hdr.b_thawed != NULL) + kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); + buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } +#endif - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); @@ -1301,7 +1502,7 @@ arc_buf_freeze(arc_buf_t *buf) mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); + buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); @@ -1312,7 +1513,7 @@ get_buf_info(arc_buf_hdr_t *hdr, arc_sta { uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - if (hdr->b_type == ARC_BUFC_METADATA) + if (arc_buf_type(hdr) == ARC_BUFC_METADATA) buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); else { buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); @@ -1327,32 +1528,36 @@ get_buf_info(arc_buf_hdr_t *hdr, arc_sta static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); + arc_state_t *state = hdr->b_l1hdr.b_state; - if ((refcount_add(&hdr->b_refcnt, tag) == 1) && - (hdr->b_state != arc_anon)) { - uint64_t delta = hdr->b_size * hdr->b_datacnt; - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; - list_t *list; - kmutex_t *lock; + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + list_t *list; + kmutex_t *lock; - get_buf_info(hdr, hdr->b_state, &list, &lock); - ASSERT(!MUTEX_HELD(lock)); - mutex_enter(lock); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(list, hdr); - if (GHOST_STATE(hdr->b_state)) { - ASSERT0(hdr->b_datacnt); - ASSERT3P(hdr->b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(lock); + get_buf_info(hdr, state, &list, &lock); + ASSERT(!MUTEX_HELD(lock)); + mutex_enter(lock); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + delta = hdr->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(lock); + } /* remove the prefetch flag if we get a reference */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } @@ -1360,24 +1565,30 @@ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = hdr->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. + */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[hdr->b_type]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; list_t *list; kmutex_t *lock; get_buf_info(hdr, state, &list, &lock); ASSERT(!MUTEX_HELD(lock)); mutex_enter(lock); - ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); list_insert_head(list, hdr); - ASSERT(hdr->b_datacnt > 0); - atomic_add_64(size, hdr->b_size * hdr->b_datacnt); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); + atomic_add_64(size, hdr->b_size * + hdr->b_l1hdr.b_datacnt); mutex_exit(lock); } return (cnt); @@ -1391,44 +1602,64 @@ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *old_state = hdr->b_state; - int64_t refcnt = refcount_count(&hdr->b_refcnt); + arc_state_t *old_state; + int64_t refcnt; + uint32_t datacnt; uint64_t from_delta, to_delta; + arc_buf_contents_t buftype = arc_buf_type(hdr); list_t *list; kmutex_t *lock; + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + datacnt = hdr->b_l1hdr.b_datacnt; + } else { + old_state = arc_l2c_only; + refcnt = 0; + datacnt = 0; + } + ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || hdr->b_datacnt > 0); - ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || datacnt > 0); + ASSERT(!GHOST_STATE(new_state) || datacnt == 0); + ASSERT(old_state != arc_anon || datacnt <= 1); - from_delta = to_delta = hdr->b_datacnt * hdr->b_size; + from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { - if (old_state != arc_anon) { + if (old_state != arc_anon && old_state != arc_l2c_only) { int use_mutex; - uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &old_state->arcs_lsize[buftype]; get_buf_info(hdr, old_state, &list, &lock); use_mutex = !MUTEX_HELD(lock); if (use_mutex) mutex_enter(lock); - ASSERT(list_link_active(&hdr->b_arc_node)); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); list_remove(list, hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. */ - if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { + if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(hdr->b_buf == NULL); + ASSERT(hdr->b_l1hdr.b_buf == NULL); from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); @@ -1437,10 +1668,17 @@ arc_change_state(arc_state_t *new_state, if (use_mutex) mutex_exit(lock); } - if (new_state != arc_anon) { + if (new_state != arc_anon && new_state != arc_l2c_only) { int use_mutex; - uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &new_state->arcs_lsize[buftype]; + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); get_buf_info(hdr, new_state, &list, &lock); use_mutex = !MUTEX_HELD(lock); if (use_mutex) @@ -1450,8 +1688,8 @@ arc_change_state(arc_state_t *new_state, /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(hdr->b_datacnt == 0); - ASSERT(hdr->b_buf == NULL); + ASSERT(datacnt == 0); + ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1465,20 +1703,22 @@ arc_change_state(arc_state_t *new_state, if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - /* adjust state sizes */ - if (to_delta) + /* adjust state sizes (ignore arc_l2c_only) */ + if (to_delta && new_state != arc_l2c_only) atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { + if (from_delta && old_state != arc_l2c_only) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - hdr->b_state = new_state; + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if (old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. + */ + ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1534,31 +1774,36 @@ arc_space_return(uint64_t space, arc_spa } arc_buf_t * -arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); + ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; - hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; + + hdr->b_flags = arc_bufc_to_flags(type); + hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_datacnt = 1; + arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } @@ -1591,8 +1836,9 @@ arc_return_buf(arc_buf_t *buf, void *tag arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } @@ -1601,12 +1847,12 @@ arc_return_buf(arc_buf_t *buf, void *tag void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; @@ -1620,15 +1866,16 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - ASSERT(hdr->b_state != arc_anon); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); @@ -1638,11 +1885,11 @@ arc_buf_clone(arc_buf_t *from) * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. */ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } - hdr->b_datacnt += 1; + hdr->b_l1hdr.b_datacnt += 1; return (buf); } @@ -1665,17 +1912,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } @@ -1718,18 +1968,26 @@ arc_buf_data_free(arc_buf_t *buf, void ( static void arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); *** DIFF OUTPUT TRUNCATED AT 1000 LINES *** _______________________________________________ svn-src-stable-10@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-stable-10 To unsubscribe, send any mail to "svn-src-stable-10-unsubscr...@freebsd.org"