On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone.
s/writes is pointing to the write pointer of that zone/the call may point at any position within the zone to which the data is being appended/ > Upon completion the device will > respond with the position the data s/position the data/position where the data/ > has been written in the zone. > > Signed-off-by: Sam Li <faithilike...@gmail.com> With nits above, Reviewed-by: Dmitry Fomichev <dmitry.fomic...@wdc.com> > --- > block/block-backend.c | 60 +++++++++++++++++++++++++++++++ > block/file-posix.c | 54 +++++++++++++++++++++++++--- > block/io.c | 21 +++++++++++ > block/io_uring.c | 4 +++ > block/linux-aio.c | 3 ++ > block/raw-format.c | 8 +++++ > include/block/block-io.h | 4 +++ > include/block/block_int-common.h | 5 +++ > include/block/raw-aio.h | 4 ++- > include/sysemu/block-backend-io.h | 9 +++++ > 10 files changed, 166 insertions(+), 6 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index f70b08e3f6..28e8f5d778 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1888,6 +1888,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_append(rwco->blk, &acb->bytes, > + rwco->iobuf, rwco->flags); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .ret = NOT_DONE, > + .flags = flags, > + .iobuf = qiov, > + }; > + acb->bytes = *offset; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > + 
aio_co_enter(blk_get_aio_context(blk), co); > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1939,6 +1978,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 61ed769ac8..2ba9174778 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -160,6 +160,7 @@ typedef struct BDRVRawState { > bool has_write_zeroes:1; > bool use_linux_aio:1; > bool use_linux_io_uring:1; > + int64_t *offset; /* offset of zone append operation */ > int page_cache_inconsistent; /* errno from fdatasync failure */ > bool has_fallocate; > bool needs_alignment; > @@ -1672,7 +1673,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > *aiocb) > ssize_t len; > > len = RETRY_ON_EINTR( > - (aiocb->aio_type & QEMU_AIO_WRITE) ? > + (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ? 
> qemu_pwritev(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1701,7 +1702,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > ssize_t len; > > while (offset < aiocb->aio_nbytes) { > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > len = pwrite(aiocb->aio_fildes, > (const char *)buf + offset, > aiocb->aio_nbytes - offset, > @@ -1794,7 +1795,7 @@ static int handle_aiocb_rw(void *opaque) > } > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > char *p = buf; > size_t count = aiocb->aio_nbytes, copy; > int i; > @@ -2431,6 +2432,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState > *bs, > uint64_t offset, > #if defined(CONFIG_BLKZONED) > if (bs->bl.wps) { > qemu_co_mutex_lock(&bs->bl.wps->colock); > + if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) { > + int index = offset / bs->bl.zone_size; > + offset = bs->bl.wps->wp[index]; > + } > } > #endif > > @@ -2478,9 +2483,13 @@ out: > #if defined(CONFIG_BLKZONED) > BlockZoneWps *wps = bs->bl.wps; > if (ret == 0) { > - if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) { > + if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > + && wps && bs->bl.zone_size) { > int index = offset / bs->bl.zone_size; > if (!BDRV_ZT_IS_CONV(wps->wp[index])) { > + if (type & QEMU_AIO_ZONE_APPEND) { > + *s->offset = wps->wp[index]; > + } > /* Advance the wp if needed */ > if (offset + bytes > wps->wp[index]) { > wps->wp[index] = offset + bytes; > @@ -2488,7 +2497,7 @@ out: > } > } > } else { > - if (type & QEMU_AIO_WRITE) { > + if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > update_zones_wp(s->fd, bs->bl.wps, 0, 1); > } > } > @@ -3498,6 +3507,40 @@ out: > } > #endif > > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector 
*qiov, > + BdrvRequestFlags flags) { > + assert(flags == 0); > + int64_t zone_size_mask = bs->bl.zone_size - 1; > + int64_t iov_len = 0; > + int64_t len = 0; > + BDRVRawState *s = bs->opaque; > + s->offset = offset; > + > + if (*offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > + return -EINVAL; > + } > + > + int64_t wg = bs->bl.write_granularity; > + int64_t wg_mask = wg - 1; > + for (int i = 0; i < qiov->niov; i++) { > + iov_len = qiov->iov[i].iov_len; > + if (iov_len & wg_mask) { > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " > + "block size %" PRId64 "", i, iov_len, wg); > + return -EINVAL; > + } > + len += iov_len; > + } > + > + return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -4259,6 +4302,7 @@ static BlockDriver bdrv_host_device = { > /* zone management operations */ > .bdrv_co_zone_report = raw_co_zone_report, > .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > + .bdrv_co_zone_append = raw_co_zone_append, > #endif > }; > > diff --git a/block/io.c b/block/io.c > index 5dbf1e50f2..fe9cabaaf6 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3152,6 +3152,27 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState *bs, size_t size) > { > 
IO_CODE(); > diff --git a/block/io_uring.c b/block/io_uring.c > index 973e15d876..f7488c241a 100644 > --- a/block/io_uring.c > +++ b/block/io_uring.c > @@ -345,6 +345,10 @@ static int luring_do_submit(int fd, LuringAIOCB > *luringcb, > LuringState *s, > io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, > luringcb->qiov->niov, offset); > break; > + case QEMU_AIO_ZONE_APPEND: > + io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, > + luringcb->qiov->niov, offset); > + break; > case QEMU_AIO_READ: > io_uring_prep_readv(sqes, fd, luringcb->qiov->iov, > luringcb->qiov->niov, offset); > diff --git a/block/linux-aio.c b/block/linux-aio.c > index d2cfb7f523..1959834156 100644 > --- a/block/linux-aio.c > +++ b/block/linux-aio.c > @@ -389,6 +389,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb > *laiocb, off_t offset, > case QEMU_AIO_WRITE: > io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); > break; > + case QEMU_AIO_ZONE_APPEND: > + io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); > + break; > case QEMU_AIO_READ: > io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); > break; > diff --git a/block/raw-format.c b/block/raw-format.c > index 72e23e7b55..64e7d48d04 100644 > --- a/block/raw-format.c > +++ b/block/raw-format.c > @@ -332,6 +332,13 @@ raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); > } > > +static int coroutine_fn GRAPH_RDLOCK > +raw_co_zone_append(BlockDriverState *bs,int64_t *offset, QEMUIOVector *qiov, > + BdrvRequestFlags flags) > +{ > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); > +} > + > static int64_t coroutine_fn GRAPH_RDLOCK > raw_co_getlength(BlockDriverState *bs) > { > @@ -635,6 +642,7 @@ BlockDriver bdrv_raw = { > .bdrv_co_pdiscard = &raw_co_pdiscard, > .bdrv_co_zone_report = &raw_co_zone_report, > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, > + .bdrv_co_zone_append = &raw_co_zone_append, > .bdrv_co_block_status = &raw_co_block_status, 
> .bdrv_co_copy_range_from = &raw_co_copy_range_from, > .bdrv_co_copy_range_to = &raw_co_copy_range_to, > diff --git a/include/block/block-io.h b/include/block/block-io.h > index 19d1fad9cf..55fca02991 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -120,6 +120,10 @@ int coroutine_fn GRAPH_RDLOCK > bdrv_co_zone_report(BlockDriverState *bs, > int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs, > BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > diff --git a/include/block/block_int-common.h b/include/block/block_int- > common.h > index 19915b34af..ccd8811919 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -724,6 +724,9 @@ struct BlockDriver { > BlockZoneDescriptor *zones); > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp > op, > int64_t offset, int64_t len); > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, > + int64_t *offset, QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > /* removable device specific */ > bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)( > @@ -887,6 +890,8 @@ typedef struct BlockLimits { > > /* array of write pointers' location of each zone in the zoned device. 
*/ > BlockZoneWps *wps; > + > + int64_t write_granularity; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index eda6a7a253..fb9c9f5a01 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -30,6 +30,7 @@ > #define QEMU_AIO_TRUNCATE 0x0080 > #define QEMU_AIO_ZONE_REPORT 0x0100 > #define QEMU_AIO_ZONE_MGMT 0x0200 > +#define QEMU_AIO_ZONE_APPEND 0x0400 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -40,7 +41,8 @@ > QEMU_AIO_COPY_RANGE | \ > QEMU_AIO_TRUNCATE | \ > QEMU_AIO_ZONE_REPORT | \ > - QEMU_AIO_ZONE_MGMT) > + QEMU_AIO_ZONE_MGMT | \ > + QEMU_AIO_ZONE_APPEND) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend- > io.h > index f575ab5b6b..e716591a1a 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -53,6 +53,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t > offset, > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t > bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -201,6 +204,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > int64_t offset, int64_t len); > int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > +int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + 
BdrvRequestFlags flags); > > int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes);