Rebased, but that is not the first time.  What it really needs is reviews.

On 17.11.2016 14:22, Steven Hartland wrote:
> Thanks, looks like the PR needs a rebase before it can be merged.
> 
> On 17/11/2016 22:11, Alexander Motin wrote:
>> It is in OpenZFS review queue now:
>> https://github.com/openzfs/openzfs/pull/219  Welcome to comment there to
>> speed up the process.
>>
>> On 17.11.2016 13:43, Steven Hartland wrote:
>>> Is this something that should be upstreamed?
>>>
>>> On 17/11/2016 21:01, Alexander Motin wrote:
>>>> Author: mav
>>>> Date: Thu Nov 17 21:01:27 2016
>>>> New Revision: 308782
>>>> URL: https://svnweb.freebsd.org/changeset/base/308782
>>>>
>>>> Log:
>>>>   After some ZIL changes 6 years ago zil_slog_limit got partially broken
>>>>   due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
>>>>   Actually because of other changes about that time zl_itx_list_sz is not
>>>>   really required to implement the functionality, so this patch removes
>>>>   some unneeded broken code and variables.
>>>>   
>>>>   Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
>>>>   single heavy logger, that increased latency for other (more latency 
>>>> critical)
>>>>   loggers, by pushing heavy log out into the main pool instead of SLOG. 
>>>> Beside
>>>>   huge latency increase for heavy writers, this implementation caused 
>>>> double
>>>>   write of all data, since the log records were explicitly prepared for 
>>>> SLOG.
>>>>   Since we now have I/O scheduler, I've found it can be much more efficient
>>>>   to reduce priority of heavy logger SLOG writes from 
>>>> ZIO_PRIORITY_SYNC_WRITE
>>>>   to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
>>>>   
>>>>   Existing ZIL implementation had problem with space efficiency when it
>>>>   has to write large chunks of data into log blocks of limited size. In 
>>>> some
>>>> cases efficiency dropped to almost as low as 50%. In case of ZIL stored 
>>>> on
>>>>   spinning rust, that also reduced log write speed in half, since head had 
>>>> to
>>>>   uselessly fly over allocated but not written areas. This change improves
>>>>   the situation by offloading problematic operations from z*_log_write() to
>>>>   zil_lwb_commit(), which knows real situation of log blocks allocation and
>>>>   can split large requests into pieces much more efficiently. Also as side
>>>>   effect it removes one of two data copy operations done by ZIL code 
>>>> WR_COPIED
>>>>   case.
>>>>   
>>>>   While there, untangle and unify code of z*_log_write() functions.
>>>>   Also zfs_log_write(), like zvol_log_write(), can now handle writes 
>>>> crossing
>>>>   block boundary, that may also improve efficiency if ZPL is made to do 
>>>> that.
>>>>   
>>>>   Sponsored by:    iXsystems, Inc.
>>>>
>>>> Modified:
>>>>   head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
>>>>   head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
>>>>
>>>> Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
>>>> ==============================================================================
>>>> --- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c        Thu Nov 17 
>>>> 20:44:51 2016        (r308781)
>>>> +++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c        Thu Nov 17 
>>>> 21:01:27 2016        (r308782)
>>>> @@ -1371,7 +1371,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t
>>>>    itx->itx_private = zd;
>>>>    itx->itx_wr_state = write_state;
>>>>    itx->itx_sync = (ztest_random(8) == 0);
>>>> -  itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
>>>>  
>>>>    bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
>>>>        sizeof (*lr) - sizeof (lr_t));
>>>>
>>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h  Thu Nov 
>>>> 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h  Thu Nov 
>>>> 17 21:01:27 2016        (r308782)
>>>> @@ -369,7 +369,6 @@ typedef struct itx {
>>>>    void            *itx_private;   /* type-specific opaque data */
>>>>    itx_wr_state_t  itx_wr_state;   /* write state */
>>>>    uint8_t         itx_sync;       /* synchronous transaction */
>>>> -  uint64_t        itx_sod;        /* record size on disk */
>>>>    uint64_t        itx_oid;        /* object id */
>>>>    lr_t            itx_lr;         /* common part of log record */
>>>>    /* followed by type-specific part of lr_xx_t and its immediate data */
>>>>
>>>> Modified: 
>>>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h     
>>>> Thu Nov 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h     
>>>> Thu Nov 17 21:01:27 2016        (r308782)
>>>> @@ -42,6 +42,7 @@ extern "C" {
>>>>  typedef struct lwb {
>>>>    zilog_t         *lwb_zilog;     /* back pointer to log struct */
>>>>    blkptr_t        lwb_blk;        /* on disk address of this log blk */
>>>> +  boolean_t       lwb_slog;       /* lwb_blk is on SLOG device */
>>>>    int             lwb_nused;      /* # used bytes in buffer */
>>>>    int             lwb_sz;         /* size of block and buffer */
>>>>    char            *lwb_buf;       /* log write buffer */
>>>> @@ -62,7 +63,6 @@ typedef struct itxs {
>>>>  typedef struct itxg {
>>>>    kmutex_t        itxg_lock;      /* lock for this structure */
>>>>    uint64_t        itxg_txg;       /* txg for this chain */
>>>> -  uint64_t        itxg_sod;       /* total size on disk for this txg */
>>>>    itxs_t          *itxg_itxs;     /* sync and async itxs */
>>>>  } itxg_t;
>>>>  
>>>> @@ -120,7 +120,6 @@ struct zilog {
>>>>    kcondvar_t      zl_cv_batch[2]; /* batch condition variables */
>>>>    itxg_t          zl_itxg[TXG_SIZE]; /* intent log txg chains */
>>>>    list_t          zl_itx_commit_list; /* itx list to be committed */
>>>> -  uint64_t        zl_itx_list_sz; /* total size of records on list */
>>>>    uint64_t        zl_cur_used;    /* current commit log size used */
>>>>    list_t          zl_lwb_list;    /* in-flight log write list */
>>>>    kmutex_t        zl_vdev_lock;   /* protects zl_vdev_tree */
>>>> @@ -142,6 +141,8 @@ typedef struct zil_bp_node {
>>>>  
>>>>  #define   ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - 
>>>> \
>>>>      sizeof (lr_write_t))
>>>> +#define   ZIL_MAX_COPIED_DATA \
>>>> +    ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof 
>>>> (lr_write_t))
>>>>  
>>>>  #ifdef    __cplusplus
>>>>  }
>>>>
>>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  Thu Nov 
>>>> 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  Thu Nov 
>>>> 17 21:01:27 2016        (r308782)
>>>> @@ -547,7 +547,7 @@ extern zio_t *zio_free_sync(zio_t *pio, 
>>>>      const blkptr_t *bp, uint64_t size, enum zio_flag flags);
>>>>  
>>>>  extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
>>>> -    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
>>>> +    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
>>>>  extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
>>>>  extern void zio_flush(zio_t *zio, vdev_t *vd);
>>>>  extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t 
>>>> offset,
>>>>
>>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c  Thu Nov 
>>>> 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c  Thu Nov 
>>>> 17 21:01:27 2016        (r308782)
>>>> @@ -464,20 +464,17 @@ void
>>>>  zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
>>>>      znode_t *zp, offset_t off, ssize_t resid, int ioflag)
>>>>  {
>>>> +  uint32_t blocksize = zp->z_blksz;
>>>>    itx_wr_state_t write_state;
>>>> -  boolean_t slogging;
>>>>    uintptr_t fsync_cnt;
>>>> -  ssize_t immediate_write_sz;
>>>>  
>>>>    if (zil_replaying(zilog, tx) || zp->z_unlinked)
>>>>            return;
>>>>  
>>>> -  immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>>> -      ? 0 : zfs_immediate_write_sz;
>>>> -
>>>> -  slogging = spa_has_slogs(zilog->zl_spa) &&
>>>> -      (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>>>> -  if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
>>>> +  if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>>> +          write_state = WR_INDIRECT;
>>>> +  else if (!spa_has_slogs(zilog->zl_spa) &&
>>>> +      resid >= zfs_immediate_write_sz)
>>>>            write_state = WR_INDIRECT;
>>>>    else if (ioflag & (FSYNC | FDSYNC))
>>>>            write_state = WR_COPIED;
>>>> @@ -491,30 +488,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
>>>>    while (resid) {
>>>>            itx_t *itx;
>>>>            lr_write_t *lr;
>>>> -          ssize_t len;
>>>> +          itx_wr_state_t wr_state = write_state;
>>>> +          ssize_t len = resid;
>>>>  
>>>> -          /*
>>>> -           * If the write would overflow the largest block then split it.
>>>> -           */
>>>> -          if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
>>>> -                  len = SPA_OLD_MAXBLOCKSIZE >> 1;
>>>> -          else
>>>> -                  len = resid;
>>>> +          if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
>>>> +                  wr_state = WR_NEED_COPY;
>>>> +          else if (wr_state == WR_INDIRECT)
>>>> +                  len = MIN(blocksize - P2PHASE(off, blocksize), resid);
>>>>  
>>>>            itx = zil_itx_create(txtype, sizeof (*lr) +
>>>> -              (write_state == WR_COPIED ? len : 0));
>>>> +              (wr_state == WR_COPIED ? len : 0));
>>>>            lr = (lr_write_t *)&itx->itx_lr;
>>>> -          if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
>>>> +          if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
>>>>                zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
>>>>                    zil_itx_destroy(itx);
>>>>                    itx = zil_itx_create(txtype, sizeof (*lr));
>>>>                    lr = (lr_write_t *)&itx->itx_lr;
>>>> -                  write_state = WR_NEED_COPY;
>>>> +                  wr_state = WR_NEED_COPY;
>>>>            }
>>>>  
>>>> -          itx->itx_wr_state = write_state;
>>>> -          if (write_state == WR_NEED_COPY)
>>>> -                  itx->itx_sod += len;
>>>> +          itx->itx_wr_state = wr_state;
>>>>            lr->lr_foid = zp->z_id;
>>>>            lr->lr_offset = off;
>>>>            lr->lr_length = len;
>>>>
>>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c      Thu Nov 
>>>> 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c      Thu Nov 
>>>> 17 21:01:27 2016        (r308782)
>>>> @@ -88,6 +88,15 @@ SYSCTL_DECL(_vfs_zfs_trim);
>>>>  SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, 
>>>> &zfs_trim_enabled, 0,
>>>>      "Enable ZFS TRIM");
>>>>  
>>>> +/*
>>>> + * Limit SLOG write size per commit executed with synchronous priority.
>>>> + * Any writes above that executed with lower (asynchronous) priority to
>>>> + * limit potential SLOG device abuse by single active ZIL writer.
>>>> + */
>>>> +uint64_t zil_slog_limit = 768 * 1024;
>>>> +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
>>>> +    &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
>>>> +
>>>>  static kmem_cache_t *zil_lwb_cache;
>>>>  
>>>>  #define   LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
>>>> @@ -447,13 +456,14 @@ zil_free_log_record(zilog_t *zilog, lr_t
>>>>  }
>>>>  
>>>>  static lwb_t *
>>>> -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
>>>> +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
>>>>  {
>>>>    lwb_t *lwb;
>>>>  
>>>>    lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
>>>>    lwb->lwb_zilog = zilog;
>>>>    lwb->lwb_blk = *bp;
>>>> +  lwb->lwb_slog = slog;
>>>>    lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
>>>>    lwb->lwb_max_txg = txg;
>>>>    lwb->lwb_zio = NULL;
>>>> @@ -516,6 +526,7 @@ zil_create(zilog_t *zilog)
>>>>    dmu_tx_t *tx = NULL;
>>>>    blkptr_t blk;
>>>>    int error = 0;
>>>> +  boolean_t slog = FALSE;
>>>>  
>>>>    /*
>>>>     * Wait for any previous destroy to complete.
>>>> @@ -544,7 +555,7 @@ zil_create(zilog_t *zilog)
>>>>            }
>>>>  
>>>>            error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
>>>> -              ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>>>> +              ZIL_MIN_BLKSZ, &slog);
>>>>  
>>>>            if (error == 0)
>>>>                    zil_init_log_chain(zilog, &blk);
>>>> @@ -554,7 +565,7 @@ zil_create(zilog_t *zilog)
>>>>     * Allocate a log write buffer (lwb) for the first log block.
>>>>     */
>>>>    if (error == 0)
>>>> -          lwb = zil_alloc_lwb(zilog, &blk, txg);
>>>> +          lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
>>>>  
>>>>    /*
>>>>     * If we just allocated the first log block, commit our transaction
>>>> @@ -885,6 +896,7 @@ static void
>>>>  zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
>>>>  {
>>>>    zbookmark_phys_t zb;
>>>> +  zio_priority_t prio;
>>>>  
>>>>    SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
>>>>        ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
>>>> @@ -895,9 +907,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t
>>>>                ZIO_FLAG_CANFAIL);
>>>>    }
>>>>    if (lwb->lwb_zio == NULL) {
>>>> +          if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
>>>> +                  prio = ZIO_PRIORITY_SYNC_WRITE;
>>>> +          else
>>>> +                  prio = ZIO_PRIORITY_ASYNC_WRITE;
>>>>            lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
>>>>                0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
>>>> -              zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
>>>> +              zil_lwb_write_done, lwb, prio,
>>>>                ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
>>>>    }
>>>>  }
>>>> @@ -917,18 +933,6 @@ uint64_t zil_block_buckets[] = {
>>>>  };
>>>>  
>>>>  /*
>>>> - * Use the slog as long as the logbias is 'latency' and the current 
>>>> commit size
>>>> - * is less than the limit or the total list size is less than 2X the 
>>>> limit.
>>>> - * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
>>>> - */
>>>> -uint64_t zil_slog_limit = 1024 * 1024;
>>>> -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
>>>> -    &zil_slog_limit, 0, "Maximal commit size to use SLOG");
>>>> -#define   USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) 
>>>> && \
>>>> -  (((zilog)->zl_cur_used < zil_slog_limit) || \
>>>> -  ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
>>>> -
>>>> -/*
>>>>   * Start a log block write and advance to the next log block.
>>>>   * Calls are serialized.
>>>>   */
>>>> @@ -943,6 +947,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>>    uint64_t txg;
>>>>    uint64_t zil_blksz, wsz;
>>>>    int i, error;
>>>> +  boolean_t slog;
>>>>  
>>>>    if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
>>>>            zilc = (zil_chain_t *)lwb->lwb_buf;
>>>> @@ -999,8 +1004,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>>  
>>>>    BP_ZERO(bp);
>>>>    /* pass the old blkptr in order to spread log blocks across devs */
>>>> -  error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
>>>> -      USE_SLOG(zilog));
>>>> +  error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
>>>>    if (error == 0) {
>>>>            ASSERT3U(bp->blk_birth, ==, txg);
>>>>            bp->blk_cksum = lwb->lwb_blk.blk_cksum;
>>>> @@ -1009,7 +1013,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>>            /*
>>>>             * Allocate a new log write buffer (lwb).
>>>>             */
>>>> -          nlwb = zil_alloc_lwb(zilog, bp, txg);
>>>> +          nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
>>>>  
>>>>            /* Record the block for later vdev flushing */
>>>>            zil_add_block(zilog, &lwb->lwb_blk);
>>>> @@ -1046,12 +1050,13 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>>>  static lwb_t *
>>>>  zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
>>>>  {
>>>> -  lr_t *lrc = &itx->itx_lr; /* common log record */
>>>> -  lr_write_t *lrw = (lr_write_t *)lrc;
>>>> +  lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
>>>> +  lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
>>>>    char *lr_buf;
>>>>    uint64_t txg = lrc->lrc_txg;
>>>>    uint64_t reclen = lrc->lrc_reclen;
>>>>    uint64_t dlen = 0;
>>>> +  uint64_t dnow, lwb_sp;
>>>>  
>>>>    if (lwb == NULL)
>>>>            return (NULL);
>>>> @@ -1068,25 +1073,30 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>>>  
>>>>    zil_lwb_write_init(zilog, lwb);
>>>>  
>>>> +cont:
>>>>    /*
>>>>     * If this record won't fit in the current log block, start a new one.
>>>> +   * For WR_NEED_COPY optimize layout for minimal number of chunks, but
>>>> +   * try to keep wasted space withing reasonable range (12%).
>>>>     */
>>>> -  if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
>>>> +  lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
>>>> +  if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
>>>> +      lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
>>>> +      lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
>>>>            lwb = zil_lwb_write_start(zilog, lwb);
>>>>            if (lwb == NULL)
>>>>                    return (NULL);
>>>>            zil_lwb_write_init(zilog, lwb);
>>>>            ASSERT(LWB_EMPTY(lwb));
>>>> -          if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
>>>> -                  txg_wait_synced(zilog->zl_dmu_pool, txg);
>>>> -                  return (lwb);
>>>> -          }
>>>> +          lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
>>>> +          ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
>>>>    }
>>>>  
>>>> +  dnow = MIN(dlen, lwb_sp - reclen);
>>>>    lr_buf = lwb->lwb_buf + lwb->lwb_nused;
>>>>    bcopy(lrc, lr_buf, reclen);
>>>> -  lrc = (lr_t *)lr_buf;
>>>> -  lrw = (lr_write_t *)lrc;
>>>> +  lrcb = (lr_t *)lr_buf;
>>>> +  lrwb = (lr_write_t *)lrcb;
>>>>  
>>>>    /*
>>>>     * If it's a write, fetch the data or get its blkptr as appropriate.
>>>> @@ -1098,16 +1108,19 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>>>                    char *dbuf;
>>>>                    int error;
>>>>  
>>>> -                  if (dlen) {
>>>> -                          ASSERT(itx->itx_wr_state == WR_NEED_COPY);
>>>> +                  if (itx->itx_wr_state == WR_NEED_COPY) {
>>>>                            dbuf = lr_buf + reclen;
>>>> -                          lrw->lr_common.lrc_reclen += dlen;
>>>> +                          lrcb->lrc_reclen += dnow;
>>>> +                          if (lrwb->lr_length > dnow)
>>>> +                                  lrwb->lr_length = dnow;
>>>> +                          lrw->lr_offset += dnow;
>>>> +                          lrw->lr_length -= dnow;
>>>>                    } else {
>>>>                            ASSERT(itx->itx_wr_state == WR_INDIRECT);
>>>>                            dbuf = NULL;
>>>>                    }
>>>>                    error = zilog->zl_get_data(
>>>> -                      itx->itx_private, lrw, dbuf, lwb->lwb_zio);
>>>> +                      itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
>>>>                    if (error == EIO) {
>>>>                            txg_wait_synced(zilog->zl_dmu_pool, txg);
>>>>                            return (lwb);
>>>> @@ -1126,12 +1139,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>>>     * equal to the itx sequence number because not all transactions
>>>>     * are synchronous, and sometimes spa_sync() gets there first.
>>>>     */
>>>> -  lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
>>>> -  lwb->lwb_nused += reclen + dlen;
>>>> +  lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
>>>> +  lwb->lwb_nused += reclen + dnow;
>>>>    lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
>>>>    ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
>>>>    ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
>>>>  
>>>> +  dlen -= dnow;
>>>> +  if (dlen > 0) {
>>>> +          zilog->zl_cur_used += reclen;
>>>> +          goto cont;
>>>> +  }
>>>> +
>>>>    return (lwb);
>>>>  }
>>>>  
>>>> @@ -1145,7 +1164,6 @@ zil_itx_create(uint64_t txtype, size_t l
>>>>    itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
>>>>    itx->itx_lr.lrc_txtype = txtype;
>>>>    itx->itx_lr.lrc_reclen = lrsize;
>>>> -  itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
>>>>    itx->itx_lr.lrc_seq = 0;        /* defensive */
>>>>    itx->itx_sync = B_TRUE;         /* default is synchronous */
>>>>  
>>>> @@ -1294,11 +1312,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *it
>>>>                     * this itxg. Save the itxs for release below.
>>>>                     * This should be rare.
>>>>                     */
>>>> -                  atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
>>>> -                  itxg->itxg_sod = 0;
>>>>                    clean = itxg->itxg_itxs;
>>>>            }
>>>> -          ASSERT(itxg->itxg_sod == 0);
>>>>            itxg->itxg_txg = txg;
>>>>            itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
>>>>  
>>>> @@ -1310,8 +1325,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *it
>>>>    }
>>>>    if (itx->itx_sync) {
>>>>            list_insert_tail(&itxs->i_sync_list, itx);
>>>> -          atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
>>>> -          itxg->itxg_sod += itx->itx_sod;
>>>>    } else {
>>>>            avl_tree_t *t = &itxs->i_async_tree;
>>>>            uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
>>>> @@ -1359,8 +1372,6 @@ zil_clean(zilog_t *zilog, uint64_t synce
>>>>    ASSERT3U(itxg->itxg_txg, <=, synced_txg);
>>>>    ASSERT(itxg->itxg_txg != 0);
>>>>    ASSERT(zilog->zl_clean_taskq != NULL);
>>>> -  atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
>>>> -  itxg->itxg_sod = 0;
>>>>    clean_me = itxg->itxg_itxs;
>>>>    itxg->itxg_itxs = NULL;
>>>>    itxg->itxg_txg = 0;
>>>> @@ -1384,7 +1395,6 @@ zil_get_commit_list(zilog_t *zilog)
>>>>  {
>>>>    uint64_t otxg, txg;
>>>>    list_t *commit_list = &zilog->zl_itx_commit_list;
>>>> -  uint64_t push_sod = 0;
>>>>  
>>>>    if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
>>>>            otxg = ZILTEST_TXG;
>>>> @@ -1401,12 +1411,9 @@ zil_get_commit_list(zilog_t *zilog)
>>>>            }
>>>>  
>>>>            list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
>>>> -          push_sod += itxg->itxg_sod;
>>>> -          itxg->itxg_sod = 0;
>>>>  
>>>>            mutex_exit(&itxg->itxg_lock);
>>>>    }
>>>> -  atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
>>>>  }
>>>>  
>>>>  /*
>>>>
>>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c      Thu Nov 
>>>> 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c      Thu Nov 
>>>> 17 21:01:27 2016        (r308782)
>>>> @@ -2908,20 +2908,21 @@ zio_dva_unallocate(zio_t *zio, zio_gang_
>>>>   */
>>>>  int
>>>>  zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t 
>>>> *old_bp,
>>>> -    uint64_t size, boolean_t use_slog)
>>>> +    uint64_t size, boolean_t *slog)
>>>>  {
>>>>    int error = 1;
>>>>  
>>>>    ASSERT(txg > spa_syncing_txg(spa));
>>>>  
>>>> -  if (use_slog) {
>>>> -          error = metaslab_alloc(spa, spa_log_class(spa), size,
>>>> -              new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>>>> -  }
>>>> -
>>>> -  if (error) {
>>>> +  error = metaslab_alloc(spa, spa_log_class(spa), size,
>>>> +      new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>>>> +  if (error == 0) {
>>>> +          *slog = TRUE;
>>>> +  } else {
>>>>            error = metaslab_alloc(spa, spa_normal_class(spa), size,
>>>>                new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>>>> +          if (error == 0)
>>>> +                  *slog = FALSE;
>>>>    }
>>>>  
>>>>    if (error == 0) {
>>>>
>>>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
>>>> ==============================================================================
>>>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c     Thu Nov 
>>>> 17 20:44:51 2016        (r308781)
>>>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c     Thu Nov 
>>>> 17 21:01:27 2016        (r308782)
>>>> @@ -1387,54 +1387,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_
>>>>  {
>>>>    uint32_t blocksize = zv->zv_volblocksize;
>>>>    zilog_t *zilog = zv->zv_zilog;
>>>> -  boolean_t slogging;
>>>> -  ssize_t immediate_write_sz;
>>>> +  itx_wr_state_t write_state;
>>>>  
>>>>    if (zil_replaying(zilog, tx))
>>>>            return;
>>>>  
>>>> -  immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>>> -      ? 0 : zvol_immediate_write_sz;
>>>> -
>>>> -  slogging = spa_has_slogs(zilog->zl_spa) &&
>>>> -      (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>>>> +  if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>>>> +          write_state = WR_INDIRECT;
>>>> +  else if (!spa_has_slogs(zilog->zl_spa) &&
>>>> +      resid >= blocksize && blocksize > zvol_immediate_write_sz)
>>>> +          write_state = WR_INDIRECT;
>>>> +  else if (sync)
>>>> +          write_state = WR_COPIED;
>>>> +  else
>>>> +          write_state = WR_NEED_COPY;
>>>>  
>>>>    while (resid) {
>>>>            itx_t *itx;
>>>>            lr_write_t *lr;
>>>> -          ssize_t len;
>>>> -          itx_wr_state_t write_state;
>>>> +          itx_wr_state_t wr_state = write_state;
>>>> +          ssize_t len = resid;
>>>>  
>>>> -          /*
>>>> -           * Unlike zfs_log_write() we can be called with
>>>> -           * upto DMU_MAX_ACCESS/2 (5MB) writes.
>>>> -           */
>>>> -          if (blocksize > immediate_write_sz && !slogging &&
>>>> -              resid >= blocksize && off % blocksize == 0) {
>>>> -                  write_state = WR_INDIRECT; /* uses dmu_sync */
>>>> -                  len = blocksize;
>>>> -          } else if (sync) {
>>>> -                  write_state = WR_COPIED;
>>>> -                  len = MIN(ZIL_MAX_LOG_DATA, resid);
>>>> -          } else {
>>>> -                  write_state = WR_NEED_COPY;
>>>> -                  len = MIN(ZIL_MAX_LOG_DATA, resid);
>>>> -          }
>>>> +          if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
>>>> +                  wr_state = WR_NEED_COPY;
>>>> +          else if (wr_state == WR_INDIRECT)
>>>> +                  len = MIN(blocksize - P2PHASE(off, blocksize), resid);
>>>>  
>>>>            itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
>>>> -              (write_state == WR_COPIED ? len : 0));
>>>> +              (wr_state == WR_COPIED ? len : 0));
>>>>            lr = (lr_write_t *)&itx->itx_lr;
>>>> -          if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
>>>> +          if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
>>>>                ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
>>>>                    zil_itx_destroy(itx);
>>>>                    itx = zil_itx_create(TX_WRITE, sizeof (*lr));
>>>>                    lr = (lr_write_t *)&itx->itx_lr;
>>>> -                  write_state = WR_NEED_COPY;
>>>> +                  wr_state = WR_NEED_COPY;
>>>>            }
>>>>  
>>>> -          itx->itx_wr_state = write_state;
>>>> -          if (write_state == WR_NEED_COPY)
>>>> -                  itx->itx_sod += len;
>>>> +          itx->itx_wr_state = wr_state;
>>>>            lr->lr_foid = ZVOL_OBJ;
>>>>            lr->lr_offset = off;
>>>>            lr->lr_length = len;
>>>>
> 

-- 
Alexander Motin
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to