Stefan Hajnoczi <stefa...@redhat.com> 于2022年12月21日周三 05:37写道: > > On Mon, Dec 19, 2022 at 04:16:43PM +0800, Sam Li wrote: > > This patch extends virtio-blk emulation to handle zoned device commands > > by calling the new block layer APIs to perform zoned device I/O on > > behalf of the guest. It supports Report Zone, four zone oparations (open, > > close, finish, reset), and Append Zone. > > > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does > > support zoned block devices. Regular block devices(conventional zones) > > will not be set. > > > > The guest os can use blktests, fio to test those commands on zoned devices. > > Furthermore, using zonefs to test zone append write is also supported. > > > > Signed-off-by: Sam Li <faithilike...@gmail.com> > > --- > > hw/block/virtio-blk-common.c | 2 + > > hw/block/virtio-blk.c | 383 +++++++++++++++++++++++++++++++++++ > > 2 files changed, 385 insertions(+) > > > > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c > > index ac52d7c176..e2f8e2f6da 100644 > > --- a/hw/block/virtio-blk-common.c > > +++ b/hw/block/virtio-blk-common.c > > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = { > > .end = endof(struct virtio_blk_config, discard_sector_alignment)}, > > {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES, > > .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)}, > > + {.flags = 1ULL << VIRTIO_BLK_F_ZONED, > > + .end = endof(struct virtio_blk_config, zoned)}, > > {} > > }; > > > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c > > index f717550fdc..2157157e7d 100644 > > --- a/hw/block/virtio-blk.c > > +++ b/hw/block/virtio-blk.c > > @@ -601,6 +601,331 @@ err: > > return err_status; > > } > > > > +typedef struct ZoneCmdData { > > + VirtIOBlockReq *req; > > + union { > > + struct { > > + unsigned int nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report_data; > > + struct { > > + int64_t offset; > > + } zone_append_data; > > + }; > > +} ZoneCmdData; > > + > > +/* > > + * check zoned_request: error checking before issuing requests. If all > > checks > > + * passed, return true. > > + * append: true if only zone append requests issued. > > + */ > > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t > > len, > > + bool append, uint8_t *status) { > > + BlockDriverState *bs = blk_bs(s->blk); > > + int index = offset / bs->bl.zone_size; > > Is bs->bl.zone_size 0 when !virtio_has_feature(s->host_features, > VIRTIO_BLK_F_ZONED)? > > In that case there is a division by zero and the QEMU process crashes > with a SIGFPE signal. > > The VIRTIO_BLK_F_ZONED check needs to be performed before calculating > index. > > > + > > + if (offset < 0 || len < 0 || len > (bs->total_sectors << > > BDRV_SECTOR_BITS) > > + || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) { > > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + return false; > > + } > > + > > + if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) { > > + *status = VIRTIO_BLK_S_UNSUPP; > > + return false; > > + } > > + > > + if (append) { > > + if (bs->bl.write_granularity) { > > + if ((offset % bs->bl.write_granularity) != 0) { > > + *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP; > > + return false; > > + } > > + } > > + > > + if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) { > > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + return false; > > + } > > + > > + if (len / 512 > bs->bl.max_append_sectors) { > > + if (bs->bl.max_append_sectors == 0) { > > + *status = VIRTIO_BLK_S_UNSUPP; > > + } else { > > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + } > > + return false; > > + } > > + } > > + return true; > > +} > > + > > +static void virtio_blk_zone_report_complete(void *opaque, int ret) > > +{ > > + ZoneCmdData *data = opaque; > > + VirtIOBlockReq *req = data->req; > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); > > + struct iovec *in_iov = req->elem.in_sg; > > + unsigned in_num = req->elem.in_num; > > in_iov/in_num must be passed in from virtio_blk_handle_request() (e.g. > via ZoneCmdData). req->elem.in_num is incorrect after > iov_discard_back_undoable() when the guest driver submits a single-byte > iovec as the final element. > > > + int64_t zrp_size, n, j = 0; > > + int64_t nz = data->zone_report_data.nr_zones; > > + int8_t err_status = VIRTIO_BLK_S_OK; > > + > > + if (ret) { > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + > > + struct virtio_blk_zone_report zrp_hdr = (struct > > virtio_blk_zone_report) { > > + .nr_zones = cpu_to_le64(nz), > > + }; > > + zrp_size = sizeof(struct virtio_blk_zone_report) > > + + sizeof(struct virtio_blk_zone_descriptor) * nz; > > + n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr)); > > + if (n != sizeof(zrp_hdr)) { > > + virtio_error(vdev, "Driver provided intput buffer that is too > > small!"); > > s/intput/input/ > > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + > > + for (size_t i = sizeof(zrp_hdr); i < zrp_size; > > + i += sizeof(struct virtio_blk_zone_descriptor), ++j) { > > + struct virtio_blk_zone_descriptor desc = > > + (struct virtio_blk_zone_descriptor) { > > + .z_start = > > cpu_to_le64(data->zone_report_data.zones[j].start > > + >> BDRV_SECTOR_BITS), > > + .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap > > + >> BDRV_SECTOR_BITS), > > + .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp > > + >> BDRV_SECTOR_BITS), > > + }; > > + > > + switch (data->zone_report_data.zones[j].type) { > > + case BLK_ZT_CONV: > > + desc.z_type = VIRTIO_BLK_ZT_CONV; > > + break; > > + case BLK_ZT_SWR: > > + desc.z_type = VIRTIO_BLK_ZT_SWR; > > + break; > > + case BLK_ZT_SWP: > > + desc.z_type = VIRTIO_BLK_ZT_SWP; > > + break; > > + default: > > + g_assert_not_reached(); > > + } > > + > > + switch (data->zone_report_data.zones[j].state) { > > + case BLK_ZS_RDONLY: > > + desc.z_state = VIRTIO_BLK_ZS_RDONLY; > > + break; > > + case BLK_ZS_OFFLINE: > > + desc.z_state = VIRTIO_BLK_ZS_OFFLINE; > > + break; > > + case BLK_ZS_EMPTY: > > + desc.z_state = VIRTIO_BLK_ZS_EMPTY; > > + break; > > + case BLK_ZS_CLOSED: > > + desc.z_state = VIRTIO_BLK_ZS_CLOSED; > > + break; > > + case BLK_ZS_FULL: > > + desc.z_state = VIRTIO_BLK_ZS_FULL; > > + break; > > + case BLK_ZS_EOPEN: > > + desc.z_state = VIRTIO_BLK_ZS_EOPEN; > > + break; > > + case BLK_ZS_IOPEN: > > + desc.z_state = VIRTIO_BLK_ZS_IOPEN; > > + break; > > + case BLK_ZS_NOT_WP: > > + desc.z_state = VIRTIO_BLK_ZS_NOT_WP; > > + break; > > + default: > > + g_assert_not_reached(); > > + } > > + > > + /* TODO: it takes O(n^2) time complexity. Optimizations required. > > */ > > + n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc)); > > + if (n != sizeof(desc)) { > > + virtio_error(vdev, "Driver provided input buffer " > > + "for descriptors that is too small!"); > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + } > > + } > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + g_free(data->zone_report_data.zones); > > + g_free(data); > > +} > > + > > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) > > +{ > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + unsigned int nr_zones; > > + ZoneCmdData *data; > > + int64_t zone_size, offset; > > + uint8_t err_status; > > + > > + if (req->in_len < sizeof(struct virtio_blk_inhdr) + > > + sizeof(struct virtio_blk_zone_report) + > > + sizeof(struct virtio_blk_zone_descriptor)) { > > + virtio_error(vdev, "in buffer too small for zone report"); > > + return -1; > > + } > > + > > + /* start byte offset of the zone report */ > > + offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS; > > + if (!check_zoned_request(s, offset, 0, false, &err_status)) { > > + goto out; > > + } > > + nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) - > > + sizeof(struct virtio_blk_zone_report)) / > > + sizeof(struct virtio_blk_zone_descriptor); > > + > > + zone_size = sizeof(BlockZoneDescriptor) * nr_zones; > > + data = g_malloc(sizeof(ZoneCmdData)); > > + data->req = req; > > + data->zone_report_data.nr_zones = nr_zones; > > + data->zone_report_data.zones = g_malloc(zone_size), > > + > > + blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones, > > + data->zone_report_data.zones, > > + virtio_blk_zone_report_complete, data); > > + return 0; > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + return err_status; > > +} > > + > > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) > > +{ > > + VirtIOBlockReq *req = opaque; > > + VirtIOBlock *s = req->dev; > > + int8_t err_status = VIRTIO_BLK_S_OK; > > + > > + if (ret) { > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + } > > + > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > +} > > + > > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) > > +{ > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + BlockDriverState *bs = blk_bs(s->blk); > > + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << > > BDRV_SECTOR_BITS; > > + uint64_t len; > > + uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; > > + uint8_t err_status = VIRTIO_BLK_S_OK; > > + > > + uint32_t type = virtio_ldl_p(vdev, &req->out.type); > > + if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) { > > + /* Entire drive capacity */ > > + offset = 0; > > + len = capacity; > > + } else { > > + if (bs->bl.zone_size > capacity - offset) { > > + /* The zoned device allows the last smaller zone. */ > > + len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1); > > + } else { > > + len = bs->bl.zone_size; > > + } > > + } > > + > > + if (!check_zoned_request(s, offset, len, false, &err_status)) { > > + goto out; > > + } > > + > > + blk_aio_zone_mgmt(s->blk, op, offset, len, > > + virtio_blk_zone_mgmt_complete, req); > > + > > + return 0; > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + return err_status; > > +} > > + > > +static void virtio_blk_zone_append_complete(void *opaque, int ret) > > +{ > > + ZoneCmdData *data = opaque; > > + VirtIOBlockReq *req = data->req; > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); > > + int64_t append_sector, n; > > + struct iovec *in_iov = req->elem.in_sg; > > + unsigned in_num = req->elem.in_num; > > Same as for report zones. in_iov/in_num must be passed in from > virtio_blk_handle_request(). > > > + uint8_t err_status = VIRTIO_BLK_S_OK; > > + > > + if (ret) { > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + > > + virtio_stl_p(vdev, &append_sector, > > + data->zone_append_data.offset >> BDRV_SECTOR_BITS); > > This must be stq because offset is 64 bits.
Will change it to virtio_ldq_p(). > > > + n = iov_from_buf(in_iov, in_num, 0, &append_sector, > > sizeof(append_sector)); > > + if (n != sizeof(append_sector)) { > > + virtio_error(vdev, "Driver provided input buffer less than size of > > " > > + "append_sector"); > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + g_free(data); > > +} > > + > > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req, > > + struct iovec *out_iov, > > + uint64_t niov) { > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + uint8_t err_status = VIRTIO_BLK_S_OK; > > + > > + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << > > BDRV_SECTOR_BITS; > > + int64_t len = iov_size(out_iov, niov); > > + > > + if (!check_zoned_request(s, offset, len, true, &err_status)) { > > + goto out; > > + } > > + > > + ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); > > + data->req = req; > > + data->zone_append_data.offset = offset; > > + qemu_iovec_init_external(&req->qiov, out_iov, niov); > > + blk_aio_zone_append(s->blk, &data->zone_append_data.offset, > > &req->qiov, 0, > > + virtio_blk_zone_append_complete, data); > > Accounting is missing. Other I/O request types (read, write, flush) are > tracked (see BLOCK_ACCT_*). I think a new BLOCK_ACCT_APPEND enum > constant should be introduced. > > This reminds me that QEMU's I/O throttling code doesn't know about > append operations either. Guests can bypass I/O throttling by submitting > append operations instead of writes. I think this can be left as a > separate task though. It's not directly related to the virtio-blk > emulation code. Thanks! I'll see to it. > > > + return 0; > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + return err_status; > > +} > > + > > static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer > > *mrb) > > { > > uint32_t type; > > @@ -687,6 +1012,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq > > *req, MultiReqBuffer *mrb) > > case VIRTIO_BLK_T_FLUSH: > > virtio_blk_handle_flush(req, mrb); > > break; > > + case VIRTIO_BLK_T_ZONE_REPORT: > > + virtio_blk_handle_zone_report(req); > > + break; > > + case VIRTIO_BLK_T_ZONE_OPEN: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN); > > + break; > > + case VIRTIO_BLK_T_ZONE_CLOSE: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE); > > + break; > > + case VIRTIO_BLK_T_ZONE_FINISH: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH); > > + break; > > + case VIRTIO_BLK_T_ZONE_RESET: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); > > + break; > > + case VIRTIO_BLK_T_ZONE_RESET_ALL: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); > > + break; > > case VIRTIO_BLK_T_SCSI_CMD: > > virtio_blk_handle_scsi(req); > > break; > > @@ -705,6 +1048,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq > > *req, MultiReqBuffer *mrb) > > virtio_blk_free_request(req); > > break; > > } > > + case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT: > > + /* > > + * It is not safe to access req->elem.out_sg directly because it > > + * may be modified by virtio_blk_handle_request(). > > + */ > > + virtio_blk_handle_zone_append(req, out_iov, out_num); > > + break; > > /* > > * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with > > * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch > > statement, > > @@ -904,6 +1254,7 @@ static void virtio_blk_update_config(VirtIODevice > > *vdev, uint8_t *config) > > { > > VirtIOBlock *s = VIRTIO_BLK(vdev); > > BlockConf *conf = &s->conf.conf; > > + BlockDriverState *bs = blk_bs(s->blk); > > struct virtio_blk_config blkcfg; > > uint64_t capacity; > > int64_t length; > > @@ -963,6 +1314,30 @@ static void virtio_blk_update_config(VirtIODevice > > *vdev, uint8_t *config) > > blkcfg.write_zeroes_may_unmap = 1; > > virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1); > > } > > + if (bs->bl.zoned != BLK_Z_NONE) { > > + switch (bs->bl.zoned) { > > + case BLK_Z_HM: > > + blkcfg.zoned.model = VIRTIO_BLK_Z_HM; > > + break; > > + case BLK_Z_HA: > > + blkcfg.zoned.model = VIRTIO_BLK_Z_HA; > > + break; > > + default: > > + g_assert_not_reached(); > > + } > > + > > + virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors, > > + bs->bl.zone_size / 512); > > + virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones, > > + bs->bl.max_active_zones); > > + virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones, > > + bs->bl.max_open_zones); > > + virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size); > > + virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors, > > + bs->bl.max_append_sectors); > > + } else { > > + blkcfg.zoned.model = VIRTIO_BLK_Z_NONE; > > + } > > memcpy(config, &blkcfg, s->config_size); > > } > > > > @@ -1127,6 +1502,7 @@ static void virtio_blk_device_realize(DeviceState > > *dev, Error **errp) > > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > > VirtIOBlock *s = VIRTIO_BLK(dev); > > VirtIOBlkConf *conf = &s->conf; > > + BlockDriverState *bs = blk_bs(conf->conf.blk); > > Error *err = NULL; > > unsigned i; > > > > @@ -1172,6 +1548,13 @@ static void virtio_blk_device_realize(DeviceState > > *dev, Error **errp) > > return; > > } > > > > + if (bs->bl.zoned != BLK_Z_NONE) { > > + virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED); > > + if (bs->bl.zoned == BLK_Z_HM) { > > + virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD); > > + } > > + } > > + > > if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) && > > (!conf->max_discard_sectors || > > conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) { > > -- > > 2.38.1 > >