Stefan Hajnoczi <stefa...@redhat.com> 于2022年9月20日周二 05:31写道: > > On Sat, Sep 10, 2022 at 02:50:57PM +0800, Sam Li wrote: > > This patch extends virtio-blk emulation to handle zoned device commands > > by calling the new block layer APIs to perform zoned device I/O on > > behalf of the guest. It supports Report Zone, four zone oparations (open, > > close, finish, reset), and Append Zone. > > > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does > > support zoned block devices. Regular block devices(conventional zones) > > will not be set. > > > > The guest os having zoned device support can use blkzone(8) to test those > > commands. Furthermore, using zonefs to test zone append write is also > > supported. > > > > Signed-off-by: Sam Li <faithilike...@gmail.com> > > --- > > hw/block/virtio-blk.c | 326 ++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 326 insertions(+) > > > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c > > index e9ba752f6b..3ef74c01db 100644 > > --- a/hw/block/virtio-blk.c > > +++ b/hw/block/virtio-blk.c > > @@ -46,6 +46,8 @@ static const VirtIOFeature feature_sizes[] = { > > .end = endof(struct virtio_blk_config, discard_sector_alignment)}, > > {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES, > > .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)}, > > + {.flags = 1ULL << VIRTIO_BLK_F_ZONED, > > + .end = endof(struct virtio_blk_config, zoned)}, > > {} > > }; > > > > @@ -614,6 +616,273 @@ err: > > return err_status; > > } > > > > +typedef struct ZoneCmdData { > > + VirtIOBlockReq *req; > > + union { > > + struct { > > + unsigned int nr_zones; > > + BlockZoneDescriptor *zones; > > + } ZoneReportData; > > + struct { > > + int64_t append_sector; > > + } ZoneAppendData; > > Field names should be lowercase: > > struct { > unsigned int nr_zones; > BlockZoneDescriptor *zones; > } zone_report_data; > struct { > int64_t append_sector; > } zone_append_data; > > > + }; > > +} ZoneCmdData; > > + > > +/* > > + * check zone_model: error checking before issuing requests. If all checks > > Maybe rename it to check_zoned_request()? It does more than check the > model. > > > + * passed, return true. > > + * append: true if only zone append request issued. > > + */ > > +static bool check_zone_model(VirtIOBlock *s, int64_t sector, int64_t > > nr_sector, > > + bool append, uint8_t *status) { > > + BlockDriverState *bs = blk_bs(s->blk); > > + BlockZoneDescriptor *zone = &bs->bl.zones[sector / > > bs->bl.zone_sectors]; > > Inputs from the guest driver are untrusted and must be validated before > using them. sector could have any value here, including invalid values. > Please check that sector is less than the device capacity and also that > it is positive. > > > + int64_t max_append_sector = bs->bl.max_append_sectors; > > + > > + if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) { > > + *status = VIRTIO_BLK_S_UNSUPP; > > + return false; > > + } > > + > > + if (zone->cond == BLK_ZS_OFFLINE) { > > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + return false; > > + } > > + > > + if (append) { > > + if ((zone->type != BLK_ZT_SWR) || (zone->cond == BLK_ZS_RDONLY) || > > + (sector + nr_sector > (*(zone + 1)).start)) { > > + /* the end sector of the request exceeds to next zone */ > > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + return false; > > + } > > + > > + if (nr_sector > max_append_sector) { > > + if (max_append_sector == 0) { > > + *status = VIRTIO_BLK_S_UNSUPP; > > + } else { > > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + } > > + return false; > > + } > > + } > > + return true; > > +} > > + > > +static void virtio_blk_zone_report_complete(void *opaque, int ret) > > +{ > > + ZoneCmdData *data = opaque; > > + VirtIOBlockReq *req = data->req; > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); > > + struct iovec *in_iov = req->elem.in_sg; > > + unsigned in_num = req->elem.in_num; > > + int64_t zrp_size, nz, n, j = 0; > > + int8_t err_status = VIRTIO_BLK_S_OK; > > + > > + nz = data->ZoneReportData.nr_zones; > > + struct virtio_blk_zone_report zrp_hdr = (struct > > virtio_blk_zone_report) { > > + .nr_zones = cpu_to_le64(nz), > > + }; > > + > > + zrp_size = sizeof(struct virtio_blk_zone_report) > > + + sizeof(struct virtio_blk_zone_descriptor) * nz; > > + n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr)); > > + if (n != sizeof(zrp_hdr)) { > > + virtio_error(vdev, "Driver provided intput buffer that is too > > small!"); > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + > > + for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct > > virtio_blk_zone_descriptor), ++j) { > > + struct virtio_blk_zone_descriptor desc = > > + (struct virtio_blk_zone_descriptor) { > > + .z_start = > > cpu_to_le64(data->ZoneReportData.zones[j].start), > > + .z_cap = > > cpu_to_le64(data->ZoneReportData.zones[j].cap), > > + .z_wp = > > cpu_to_le64(data->ZoneReportData.zones[j].wp), > > If the QEMU block layer uses byte constants it will be necessary to > convert the above fields to sectors. I think the code is correct right > now because the QEMU block layer patches still use sectors, but using > bytes would be consistent with the QEMU block layer conventions. > > > + .z_type = data->ZoneReportData.zones[j].type, > > + .z_state = data->ZoneReportData.zones[j].cond, > > The QEMU type and cond field constants might not match the virtio-blk > constants. Please convert them explicitly (e.g. with a switch > statement in a helper function) so there is no assumption about the > values of the constants. > > > + }; > > + n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc)); > > This is O(n^2) time complexity since it's called from the loop, but > nevermind for now... Maybe add a TODO comment so anyone trying to > optimize this code will immediately see the expensive part. > > > + if (n != sizeof(desc)) { > > + virtio_error(vdev, "Driver provided input buffer " > > + "for descriptors that is too small!"); > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + } > > + goto out; > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + g_free(data->ZoneReportData.zones); > > + g_free(data); > > +} > > + > > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) { > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + unsigned int nr_zones; > > + ZoneCmdData *data; > > + int64_t zone_size, offset; > > + uint8_t err_status; > > + > > + if (req->in_len <= sizeof(struct virtio_blk_inhdr) + > > + sizeof(struct virtio_blk_zone_report)) { > > + virtio_error(vdev, "in buffer too small for zone report"); > > + return -1; > > + } > > + > > + /* start byte offset of the zone report */ > > + offset = virtio_ldq_p(vdev, &req->out.sector) * 512; > > + if (!check_zone_model(s, offset / 512, 0, false, &err_status)) { > > + goto out; > > + } > > + > > + nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) - > > + sizeof(struct virtio_blk_zone_report)) / > > + sizeof(struct virtio_blk_zone_descriptor); > > + > > + zone_size = sizeof(BlockZoneDescriptor) * nr_zones; > > + data = g_malloc(sizeof(ZoneCmdData)); > > + data->req = req; > > + data->ZoneReportData.nr_zones = nr_zones; > > + data->ZoneReportData.zones = g_malloc(zone_size), > > nr_zones = 0 and zones = NULL is possible when in_len is sizeof(struct > virtio_blk_inhdr) + sizeof(struct virtio_blk_zone_report) + 1. Maybe the > code handles it okay without dereferencing a NULL pointer, but it would > be safer to change the check above like this: > > if (req->in_len < sizeof(struct virtio_blk_inhdr) + > sizeof(struct virtio_blk_zone_report) + > sizeof(struct virtio_blk_zone_descriptor)) { > virtio_error(vdev, "in buffer too small for zone report"); > return -1; > } > > > + > > + blk_aio_zone_report(s->blk, offset, &data->ZoneReportData.nr_zones, > > + data->ZoneReportData.zones, > > + virtio_blk_zone_report_complete, data); > > + return 0; > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + return err_status; > > +} > > + > > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) { > > + ZoneCmdData *data = opaque; > > + VirtIOBlockReq *req = data->req; > > + VirtIOBlock *s = req->dev; > > Missing ret < 0 error handling. > > > + > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + g_free(data); > > +} > > + > > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp > > op) { > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + BlockDriverState *bs = blk_bs(s->blk); > > + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512; > > + uint64_t len; > > + uint32_t type; > > + uint8_t err_status = VIRTIO_BLK_S_OK; > > + > > + if (!check_zone_model(s, offset / 512, 0, false, &err_status)) { > > + goto out; > > + } > > + > > + ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); > > + data->req = req; > > + > > + type = virtio_ldl_p(vdev, &req->out.type); > > + if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) { > > + /* Entire drive capacity */ > > + offset = 0; > > + blk_get_geometry(s->blk, &len); > > + len *= 512; > > + } else { > > + len = bs->bl.zone_sectors * 512; > > Is this correct when accessing the last zone of the device? > > > + } > > + > > + blk_aio_zone_mgmt(s->blk, op, offset, len, > > + virtio_blk_zone_mgmt_complete, data); > > + > > + return 0; > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + return err_status; > > +} > > + > > +static void virtio_blk_zone_append_complete(void *opaque, int ret) { > > + ZoneCmdData *data = opaque; > > + VirtIOBlockReq *req = data->req; > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); > > + int64_t append_sector, n; > > + struct iovec *out_iov = req->elem.out_sg; > > + unsigned out_num = req->elem.out_num; > > + uint8_t err_status = VIRTIO_BLK_S_OK; > > Error handling code for the ret < 0 case is missing. > > > + > > + append_sector = data->ZoneAppendData.append_sector; > > + int write_granularity = s->conf.conf.logical_block_size; > > + if ((append_sector * 512 % write_granularity) != 0) { > > + err_status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP; > > + goto out; > > + } > > This looks like input validation. Why is it performed after the I/O > request has completed?
UNALIGNED_WP should be checked in both input and output validation. By checking if the starting/ending sector of the request data is aligned with write_granularity(value of physical block size), the device sets the UNALIGNED_WP bit and completes the request. According to spec here: +VIRTIO_BLK_S_ZONE_UNALIGNED_WP is set by the device when the request received +from the driver attempts to perform a write to an SWR zone and at least one of +the following conditions is met: + +\begin{itemize} +\item the starting sector of the request is not equal to the current value of + the zone write pointer. + +\item the ending sector of the request data multiplied by 512 is not a multiple + of the value reported by the device in the field \field{write_granularity} + in the device configuration space. +\end{itemize} > > I guess the intent is to check the value that the guest driver provided, > not the value produced by the device after the I/O request completed? > > > + n = iov_to_buf(out_iov, out_num, 0, &append_sector, > > sizeof(append_sector)); > > Please use virtio_stq_p() on append_sector first to ensure that the > endianness is correct. > > > + if (n != sizeof(append_sector)) { > > + virtio_error(vdev, "Driver provided input buffer less than size of > > " > > + "append_sector"); > > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > > + goto out; > > + } > > + goto out; > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + g_free(data); > > +} > > + > > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) { > > The return value is not used. Please change the return type to void. The return type should be int actually. Otherwise, QEMU will terminate when the zone_append request is issued from the guest. It is the one cause that failed some of the zonefs tests. After coredump, backtrace indicates address_space_unmap: Assertion `mr ! = NULL` failed rooted from virtio_blk_zone_append_complete(). I haven't figured out exactly why but my guess is virtio_blk_zone_op_complete() needs the return value of virtio_blk_zone_op() ...... > > > + VirtIOBlock *s = req->dev; > > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > > + uint64_t niov = req->elem.out_num; > > + struct iovec *out_iov = req->elem.out_sg; > > + uint8_t err_status = VIRTIO_BLK_S_OK; > > + > > + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512; > > + int64_t len = 0; > > + for (int i = 1; i < niov; ++i) { > > + len += out_iov[i].iov_len; > > + } > > Why is the first iovec skipped? Because the first iovec is normally headers, and the rest is buffer that out_iov needs. > > > + > > + if (!check_zone_model(s, offset / 512, len / 512, true, &err_status)) { > > + goto out; > > + } > > + > > + ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); > > + data->req = req; > > + data->ZoneAppendData.append_sector = offset; > > + qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1); > > + blk_aio_zone_append(s->blk, &data->ZoneAppendData.append_sector, > > &req->qiov, 0, > > + virtio_blk_zone_append_complete, data); > > + > > + return 0; > > + > > +out: > > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > > + virtio_blk_req_complete(req, err_status); > > + virtio_blk_free_request(req); > > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > > + return err_status; > > +} > > + > > static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer > > *mrb) > > { > > uint32_t type; > > @@ -700,6 +969,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq > > *req, MultiReqBuffer *mrb) > > case VIRTIO_BLK_T_FLUSH: > > virtio_blk_handle_flush(req, mrb); > > break; > > + case VIRTIO_BLK_T_ZONE_REPORT: > > + virtio_blk_handle_zone_report(req); > > + break; > > + case VIRTIO_BLK_T_ZONE_OPEN: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN); > > + break; > > + case VIRTIO_BLK_T_ZONE_CLOSE: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE); > > + break; > > + case VIRTIO_BLK_T_ZONE_FINISH: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH); > > + break; > > + case VIRTIO_BLK_T_ZONE_RESET: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); > > + break; > > + case VIRTIO_BLK_T_ZONE_RESET_ALL: > > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL); > > + break; > > case VIRTIO_BLK_T_SCSI_CMD: > > virtio_blk_handle_scsi(req); > > break; > > @@ -718,6 +1005,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq > > *req, MultiReqBuffer *mrb) > > virtio_blk_free_request(req); > > break; > > } > > + case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT: > > + virtio_blk_handle_zone_append(req); > > + break; > > /* > > * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with > > * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch > > statement, > > @@ -917,6 +1207,7 @@ static void virtio_blk_update_config(VirtIODevice > > *vdev, uint8_t *config) > > { > > VirtIOBlock *s = VIRTIO_BLK(vdev); > > BlockConf *conf = &s->conf.conf; > > + BlockDriverState *state = blk_bs(s->blk); > > Usually the variable is called "bs". Please use that name here and > elsewhere in the patch. > > > struct virtio_blk_config blkcfg; > > uint64_t capacity; > > int64_t length; > > @@ -976,6 +1267,31 @@ static void virtio_blk_update_config(VirtIODevice > > *vdev, uint8_t *config) > > blkcfg.write_zeroes_may_unmap = 1; > > virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1); > > } > > +#ifdef CONFIG_BLKZONED > > Is this ifdef appropriate? I think bs->bl.zoned should always be > available, even when <blkzoned.h> is missing (e.g. non-Linux system). In > the future there may be an emulated zoned BlockDriver. virtio-blk > should be able to use the emulated zoned BlockDriver even on non-Linux > systems. > Ok, you are right. > I think CONFIG_BLKZONED should be limited to block/file-posix.c. I'm not sure. There may be some places where virtio-blk needs this config. Like in transforming blk_zone_descriptor to virtio_blk_zone_descriptor, it needs to use Linux's constants to ensure nothing go wrong here. > > > + if (state->bl.zoned != BLK_Z_NONE) { > > + switch (state->bl.zoned) { > > + case BLK_Z_HM: > > + blkcfg.zoned.model = VIRTIO_BLK_Z_HM; > > + virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors, > > + state->bl.zone_sectors); > > + virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones, > > + state->bl.max_active_zones); > > + virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones, > > + state->bl.max_open_zones); > > + virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size); > > + virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors, > > + state->bl.max_append_sectors); > > + break; > > + case BLK_Z_HA: > > + blkcfg.zoned.model = VIRTIO_BLK_Z_HA; > > The block limits aren't relevant to host-aware zoned devices? Yes, the HA devices are seen as non-zoned device and the driver just ignore all other fields in zoned. +\item if the driver that can not support zoned devices reads + VIRTIO_BLK_Z_HA from the \field{model} field of \field{zoned}, the driver + MAY handle the device as a non-zoned device. In this case, the + driver SHOULD ignore all other fields in \field{zoned}. +\end{itemize} > > > + break; > > + default: > > + blkcfg.zoned.model = VIRTIO_BLK_Z_NONE; > > + virtio_error(vdev, "Invalid zoned model %x \n", > > (int)state->bl.zoned); > > + break; > > + } > > + } > > +#endif > > memcpy(config, &blkcfg, s->config_size); > > } > > > > @@ -995,6 +1311,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice > > *vdev, uint64_t features, > > Error **errp) > > { > > VirtIOBlock *s = VIRTIO_BLK(vdev); > > + BlockDriverState *state = blk_bs(s->blk); > > > > /* Firstly sync all virtio-blk possible supported features */ > > features |= s->host_features; > > @@ -1003,6 +1320,12 @@ static uint64_t virtio_blk_get_features(VirtIODevice > > *vdev, uint64_t features, > > virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY); > > virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY); > > virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE); > > + if (state->bl.zoned != BLK_Z_NONE) { > > + virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED); > > + if (state->bl.zoned == BLK_Z_HM) { > > + virtio_clear_feature(&features, VIRTIO_BLK_F_DISCARD); > > Why is features modified here but s->host_features is modified in the > line above? Because F_DISCARD should not be offered by the driver when the device offers F_ZONED with the HM model. > > > + } > > + } > > This detects VIRTIO_BLK_F_ZONED based on the BlockDriverState... (This part can be removed.) > > > if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) { > > if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) { > > error_setg(errp, "Please set scsi=off for virtio-blk devices > > in order to use virtio 1.0"); > > @@ -1286,6 +1609,9 @@ static Property virtio_blk_properties[] = { > > #ifdef __linux__ > > DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features, > > VIRTIO_BLK_F_SCSI, false), > > +#endif > > +#ifdef CONFIG_BLKZONED > > + DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, > > VIRTIO_BLK_F_ZONED, true), > > #endif > > ...but this allows users to enable/disable the flag from the > command-line. > > I think DEFINE_PROP_BIT64() can be removed, but please check that the > config space size is still correct. It may be necessary to move the > bs->bl.zoned check into virtio_blk_device_realize() and set > the VIRTIO_BLK_F_ZONED s->host_features bit there instead. That will > allow the s->config_size calculation to work correctly. Ok, it works. Thanks! Additionally, the DEFINE_PROP_BIT here is to declare that the supports zones. Then the virtio-blk driver will not need to look at that feature. So the former part detecting the F_ZONED feature based on BlockDriverState can be removed. If removing this macro, then the virio-blk driver must set the F_ZONED feature by itself.