On Tue, Oct 04, 2022 at 08:23:15AM +0900, Damien Le Moal wrote: > On 2022/10/04 2:47, Stefan Hajnoczi wrote: > > On Thu, Sep 29, 2022 at 04:36:27PM +0800, Sam Li wrote: > >> Add a new zoned_host_device BlockDriver. The zoned_host_device option > >> accepts only zoned host block devices. By adding zone management > >> operations in this new BlockDriver, users can use the new block > >> layer APIs including Report Zone and four zone management operations > >> (open, close, finish, reset). > >> > >> Qemu-io uses the new APIs to perform zoned storage commands of the device: > >> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > >> zone_finish(zf). > >> > >> For example, to test zone_report, use following command: > >> $ ./build/qemu-io --image-opts -n driver=zoned_host_device, > >> filename=/dev/nullb0 > >> -c "zrp offset nr_zones" > >> > >> Signed-off-by: Sam Li <faithilike...@gmail.com> > >> Reviewed-by: Hannes Reinecke <h...@suse.de> > >> --- > >> block/block-backend.c | 146 +++++++++++++ > >> block/file-posix.c | 340 +++++++++++++++++++++++++++++- > >> block/io.c | 41 ++++ > >> include/block/block-common.h | 4 + > >> include/block/block-io.h | 7 + > >> include/block/block_int-common.h | 24 +++ > >> include/block/raw-aio.h | 6 +- > >> include/sysemu/block-backend-io.h | 17 ++ > >> meson.build | 4 + > >> qapi/block-core.json | 8 +- > >> qemu-io-cmds.c | 148 +++++++++++++ > >> 11 files changed, 741 insertions(+), 4 deletions(-) > >> > >> diff --git a/block/block-backend.c b/block/block-backend.c > >> index d4a5df2ac2..f7f7acd6f4 100644 > >> --- a/block/block-backend.c > >> +++ b/block/block-backend.c > >> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > >> void *iobuf; > >> int ret; > >> BdrvRequestFlags flags; > >> + union { > >> + struct { > >> + unsigned int *nr_zones; > >> + BlockZoneDescriptor *zones; > >> + } zone_report; > >> + struct { > >> + BlockZoneOp op; > >> + } zone_mgmt; > >> + }; > >> } BlkRwCo; > >> > >> int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > >> @@ -1775,6 +1784,143 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > >> return ret; > >> } > >> > >> +static void blk_aio_zone_report_entry(void *opaque) { > > > > > > The coroutine_fn annotation is missing: > > > > static void coroutine_fn blk_aio_zone_report_entry(void *opaque) { > > > >> + BlkAioEmAIOCB *acb = opaque; > >> + BlkRwCo *rwco = &acb->rwco; > >> + > >> + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, > >> + rwco->zone_report.nr_zones, > >> + rwco->zone_report.zones); > >> + blk_aio_complete(acb); > >> +} > >> + > >> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > >> + unsigned int *nr_zones, > >> + BlockZoneDescriptor *zones, > >> + BlockCompletionFunc *cb, void *opaque) > >> +{ > >> + BlkAioEmAIOCB *acb; > >> + Coroutine *co; > >> + IO_CODE(); > >> + > >> + blk_inc_in_flight(blk); > >> + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > >> + acb->rwco = (BlkRwCo) { > >> + .blk = blk, > >> + .offset = offset, > >> + .ret = NOT_DONE, > >> + .zone_report = { > >> + .zones = zones, > >> + .nr_zones = nr_zones, > >> + }, > >> + }; > >> + acb->has_returned = false; > >> + > >> + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > >> + bdrv_coroutine_enter(blk_bs(blk), co); > >> + > >> + acb->has_returned = true; > >> + if (acb->rwco.ret != NOT_DONE) { > >> + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > >> + blk_aio_complete_bh, acb); > >> + } > >> + > >> + return &acb->common; > >> +} > >> + > >> +static void blk_aio_zone_mgmt_entry(void *opaque) { > > > > coroutine_fn is missing here. > > > >> + BlkAioEmAIOCB *acb = opaque; > >> + BlkRwCo *rwco = &acb->rwco; > >> + > >> + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > >> + rwco->offset, acb->bytes); > >> + blk_aio_complete(acb); > >> +} > >> + > >> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > >> + int64_t offset, int64_t len, > >> + BlockCompletionFunc *cb, void *opaque) { > >> + BlkAioEmAIOCB *acb; > >> + Coroutine *co; > >> + IO_CODE(); > >> + > >> + blk_inc_in_flight(blk); > >> + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > >> + acb->rwco = (BlkRwCo) { > >> + .blk = blk, > >> + .offset = offset, > >> + .ret = NOT_DONE, > >> + .zone_mgmt = { > >> + .op = op, > >> + }, > >> + }; > >> + acb->bytes = len; > >> + acb->has_returned = false; > >> + > >> + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > >> + bdrv_coroutine_enter(blk_bs(blk), co); > >> + > >> + acb->has_returned = true; > >> + if (acb->rwco.ret != NOT_DONE) { > >> + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > >> + blk_aio_complete_bh, acb); > >> + } > >> + > >> + return &acb->common; > >> +} > >> + > >> +/* > >> + * Send a zone_report command. > >> + * offset is a byte offset from the start of the device. No alignment > >> + * required for offset. > >> + * nr_zones represents IN maximum and OUT actual. > >> + */ > >> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > >> + unsigned int *nr_zones, > >> + BlockZoneDescriptor *zones) > >> +{ > >> + int ret; > >> + IO_CODE(); > >> + > >> + blk_inc_in_flight(blk); /* increase before waiting */ > >> + blk_wait_while_drained(blk); > >> + if (!blk_is_available(blk)) { > >> + blk_dec_in_flight(blk); > >> + return -ENOMEDIUM; > >> + } > >> + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > >> + blk_dec_in_flight(blk); > >> + return ret; > >> +} > >> + > >> +/* > >> + * Send a zone_management command. > >> + * op is the zone operation; > >> + * offset is the byte offset from the start of the zoned device; > >> + * len is the maximum number of bytes the command should operate on. It > >> + * should be aligned with the device zone size. > >> + */ > >> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > >> + int64_t offset, int64_t len) > >> +{ > >> + int ret; > >> + IO_CODE(); > >> + > >> + > >> + blk_inc_in_flight(blk); > >> + blk_wait_while_drained(blk); > >> + > >> + ret = blk_check_byte_request(blk, offset, len); > >> + if (ret < 0) { > >> + blk_dec_in_flight(blk); > >> + return ret; > >> + } > >> + > >> + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > >> + blk_dec_in_flight(blk); > >> + return ret; > >> +} > >> + > >> void blk_drain(BlockBackend *blk) > >> { > >> BlockDriverState *bs = blk_bs(blk); > >> diff --git a/block/file-posix.c b/block/file-posix.c > >> index 0a8b4b426e..0a6c781201 100644 > >> --- a/block/file-posix.c > >> +++ b/block/file-posix.c > >> @@ -67,6 +67,9 @@ > >> #include <sys/param.h> > >> #include <sys/syscall.h> > >> #include <sys/vfs.h> > >> +#if defined(CONFIG_BLKZONED) > >> +#include <linux/blkzoned.h> > >> +#endif > >> #include <linux/cdrom.h> > >> #include <linux/fd.h> > >> #include <linux/fs.h> > >> @@ -216,6 +219,15 @@ typedef struct RawPosixAIOData { > >> PreallocMode prealloc; > >> Error **errp; > >> } truncate; > >> + struct { > >> + unsigned int *nr_zones; > >> + BlockZoneDescriptor *zones; > >> + } zone_report; > >> + struct { > >> + unsigned long zone_op; > >> + const char *zone_op_name; > >> + bool all; > > > > Please remove this field if it is unused. > > > >> + } zone_mgmt; > >> }; > >> } RawPosixAIOData; > >> > >> @@ -1339,7 +1351,7 @@ static void raw_refresh_limits(BlockDriverState *bs, > >> Error **errp) > >> #endif > >> > >> if (bs->sg || S_ISBLK(st.st_mode)) { > >> - int ret = hdev_get_max_hw_transfer(s->fd, &st); > >> + ret = hdev_get_max_hw_transfer(s->fd, &st); > >> > >> if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { > >> bs->bl.max_hw_transfer = ret; > >> @@ -1356,6 +1368,41 @@ static void raw_refresh_limits(BlockDriverState > >> *bs, Error **errp) > >> zoned = BLK_Z_NONE; > >> } > >> bs->bl.zoned = zoned; > >> + if (zoned != BLK_Z_NONE) { > >> + ret = get_sysfs_long_val(&st, "chunk_sectors"); > >> + if (ret <= 0) { > >> + error_report("Invalid zone size %" PRId32 " sectors ", ret); > >> + bs->bl.zoned = BLK_Z_NONE; > >> + return; > >> + } > >> + bs->bl.zone_size = ret * 512; > >> + > >> + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > >> + if (ret > 0) { > >> + bs->bl.max_append_sectors = ret / 512; > >> + } > >> + > >> + ret = get_sysfs_long_val(&st, "max_open_zones"); > >> + if (ret >= 0) { > >> + bs->bl.max_open_zones = ret; > >> + } > >> + > >> + ret = get_sysfs_long_val(&st, "max_active_zones"); > >> + if (ret >= 0) { > >> + bs->bl.max_active_zones = ret; > >> + } > >> + > >> + ret = get_sysfs_long_val(&st, "nr_zones"); > >> + if (ret >= 0) { > >> + bs->bl.nr_zones = ret; > >> + } > >> + > >> + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); > >> + if (ret != 0) { > >> + error_report("Invalid device capacity %" PRId64 " bytes ", > >> bs->bl.capacity); > >> + return; > >> + } > > > > The QEMU block layer already knows the capacity of the device. Can > > bdrv_getlength() be used instead of introducing a new > > BlockLimits.capacity field? > > > >> + } > >> } > >> > >> static int check_for_dasd(int fd) > >> @@ -1850,6 +1897,147 @@ static off_t copy_file_range(int in_fd, off_t > >> *in_off, int out_fd, > >> } > >> #endif > >> > >> +/* > >> + * parse_zone - Fill a zone descriptor > >> + */ > >> +#if defined(CONFIG_BLKZONED) > >> +static inline void parse_zone(struct BlockZoneDescriptor *zone, > >> + const struct blk_zone *blkz, > >> + const struct blk_zone_report *rep) { > >> + zone->start = blkz->start << BDRV_SECTOR_BITS; > >> + zone->length = blkz->len << BDRV_SECTOR_BITS; > >> + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > >> + > >> + if (rep->flags & BLK_ZONE_REP_CAPACITY) { > >> + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > > > > #ifdef HAVE_BLK_ZONE_REP_CAPACITY is needed since the rep->flags and > > blkz->capacity fields are missing and would cause a compilation error > > when HAVE_BLK_ZONE_REP_CAPACITY is not defined: > > > > zone->cap = blkz->len << BDRV_SECTOR_BITS; > > #ifdef HAVE_BLK_ZONE_REP_CAPACITY > > /* Replace with the dedicated field on newer kernels */ > > if (rep->flags & BLK_ZONE_REP_CAPACITY) { > > zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > > } > > #endif > > It would be a lot cleaner to do something like this: > > in the block common header file, add: > > #ifdef HAVE_BLK_ZONE_REP_CAPACITY > > #define BLK_ZONE_REP_CAPACITY (1 << 0) > > struct blk_zone_v2 { > __u64 start; /* Zone start sector */ > __u64 len; /* Zone length in number of sectors */ > __u64 wp; /* Zone write pointer position */ > __u8 type; /* Zone type */ > __u8 cond; /* Zone condition */ > __u8 non_seq; /* Non-sequential write resources active */ > __u8 reset; /* Reset write pointer recommended */ > __u8 resv[4]; > __u64 capacity; /* Zone capacity in number of sectors */ > __u8 reserved[24]; > }; > #define blk_zone blk_zone_v2 > > struct blk_zone_report_v2 { > __u64 sector; > __u32 nr_zones; > __u32 flags; > struct blk_zone zones[0]; > }; > #define blk_zone_report blk_zone_report_v2 > > #endif > > Then the above code becomes: > > if (rep->flags & BLK_ZONE_REP_CAPACITY) { > zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > } else { > zone->cap = blkz->len << BDRV_SECTOR_BITS; > } > > No #ifdef in the C code, only in the header and that compiles and works for > all > host kernel versions.
The ->flags field doesn't exist in old versions of the header. How will "if (rep->flags ..." compile on old systems? Stefan
signature.asc
Description: PGP signature