On 6/27/22 02:19, Sam Li wrote:
By adding zone management operations in BlockDriver, storage
controller emulation can use the new block layer APIs including
zone_report and zone_mgmt(open, close, finish, reset).
---
block/block-backend.c | 56 ++++++++
block/coroutines.h | 5 +
block/file-posix.c | 238 +++++++++++++++++++++++++++++++
include/block/block-common.h | 43 +++++-
include/block/block_int-common.h | 20 +++
5 files changed, 361 insertions(+), 1 deletion(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index e0e1aff4b1..786f964d02 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
return ret;
}
+/*
+ * Return zone_report from BlockDriver. Offset can be any number within
+ * the zone size. No alignment for offset and len.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+ int64_t len, int64_t *nr_zones,
+ BlockZoneDescriptor *zones)
+{
+ int ret;
+ BlockDriverState *bs;
+ IO_CODE();
+
+ blk_inc_in_flight(blk); /* increase before waiting */
+ blk_wait_while_drained(blk);
+ bs = blk_bs(blk);
+
+ ret = blk_check_byte_request(blk, offset, len);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_co_zone_report(blk->root->bs, offset, len,
+ nr_zones, zones);
+ bdrv_dec_in_flight(bs);
+ blk_dec_in_flight(blk);
+ return ret;
+}
+
+/*
+ * Return zone_mgmt from BlockDriver.
+ * Offset is the start of a zone and len is aligned to zones.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+ int64_t offset, int64_t len)
+{
+ int ret;
+ BlockDriverState *bs;
+ IO_CODE();
+
+ blk_inc_in_flight(blk);
+ blk_wait_while_drained(blk);
+ bs = blk_bs(blk);
+
+ ret = blk_check_byte_request(blk, offset, len);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
+ bdrv_dec_in_flight(bs);
+ blk_dec_in_flight(blk);
+ return ret;
+}
+
void blk_drain(BlockBackend *blk)
{
BlockDriverState *bs = blk_bs(blk);
diff --git a/block/coroutines.h b/block/coroutines.h
index 830ecaa733..a114d7bc30 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -80,6 +80,11 @@ int coroutine_fn
blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+ int64_t len, int64_t *nr_zones,
+ BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+ int64_t offset, int64_t len);
/*
diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..1b8b0d351f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,7 @@
#include <sys/param.h>
#include <sys/syscall.h>
#include <sys/vfs.h>
+#include <linux/blkzoned.h>
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <linux/fs.h>
@@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
PreallocMode prealloc;
Error **errp;
} truncate;
+ struct {
+ int64_t *nr_zones;
+ BlockZoneDescriptor *zones;
+ } zone_report;
+ zone_op op;
};
} RawPosixAIOData;
@@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
}
#endif
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+ struct blk_zone *blkz) {
+ zone->start = blkz->start;
+ zone->length = blkz->len;
+ zone->cap = blkz->capacity;
+ zone->wp = blkz->wp - blkz->start;
+ zone->type = blkz->type;
+ zone->cond = blkz->cond;
+}
+
+static int handle_aiocb_zone_report(void *opaque) {
+ RawPosixAIOData *aiocb = opaque;
+ int fd = aiocb->aio_fildes;
+ int64_t *nr_zones = aiocb->zone_report.nr_zones;
+ BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+ int64_t offset = aiocb->aio_offset;
+ int64_t len = aiocb->aio_nbytes;
+
+ struct blk_zone *blkz;
+ int64_t rep_size, nrz;
+ int ret, n = 0, i = 0;
+
+ nrz = *nr_zones;
+ if (len == -1) {
+ return -errno;
+ }
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+ g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report,
nrz);
+ offset = offset / 512; /* get the unit of the start sector: sector size is
512 bytes. */
+ printf("start to report zone with offset: 0x%lx\n", offset);
+
+ blkz = (struct blk_zone *)(rep + 1);
+ while (n < nrz) {
+ memset(rep, 0, rep_size);
+ rep->sector = offset;
+ rep->nr_zones = nrz;
+
+ ret = ioctl(fd, BLKREPORTZONE, rep);
+ if (ret != 0) {
+ ret = -errno;
+ error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
+ fd, offset, errno);
+ return ret;
+ }
+
+ if (!rep->nr_zones) {
+ break;
+ }
+
+ for (i = 0; i < rep->nr_zones; i++, n++) {
+ parse_zone(&zones[n], &blkz[i]);
+ /* The next report should start after the last zone reported */
+ offset = blkz[i].start + blkz[i].len;
+ }
Where do you increase 'n' such that the loop can make forward progress?
Wouldn't it be better to use a for() loop here?
+ }
+
+ *nr_zones = n;
+ return 0;
+}
+
+static int handle_aiocb_zone_mgmt(void *opaque) {
+ RawPosixAIOData *aiocb = opaque;
+ int fd = aiocb->aio_fildes;
+ int64_t offset = aiocb->aio_offset;
+ int64_t len = aiocb->aio_nbytes;
+ zone_op op = aiocb->op;
+
+ struct blk_zone_range range;
+ const char *ioctl_name;
+ unsigned long ioctl_op;
+ int64_t zone_size;
+ int64_t zone_size_mask;
+ int ret;
+
Shouldn't we add a check here if 'fd' points to a zoned device?
ioctl errors are not _that_ helpful here, as you might get a variety
of errors and it's not quite obvious which of those errors indicate
an unsupported feature.
+ ret = ioctl(fd, BLKGETZONESZ, &zone_size);
+ if (ret) {
+ return -1;
+ }
+
+ zone_size_mask = zone_size - 1;
+ if (offset & zone_size_mask) {
+ error_report("offset is not the start of a zone");
+ return -1;
+ }
+
+ if (len & zone_size_mask) {
+ error_report("len is not aligned to zones");
+ return -1;
+ }
+
+ switch (op) {
+ case zone_open:
+ ioctl_name = "BLKOPENZONE";
+ ioctl_op = BLKOPENZONE;
+ break;
+ case zone_close:
+ ioctl_name = "BLKCLOSEZONE";
+ ioctl_op = BLKCLOSEZONE;
+ break;
+ case zone_finish:
+ ioctl_name = "BLKFINISHZONE";
+ ioctl_op = BLKFINISHZONE;
+ break;
+ case zone_reset:
+ ioctl_name = "BLKRESETZONE";
+ ioctl_op = BLKRESETZONE;
+ break;
+ default:
+ error_report("Invalid zone operation 0x%x", op);
+ errno = -EINVAL;
+ return -1;
+ }
+
+ /* Execute the operation */
+ range.sector = offset;
+ range.nr_sectors = len;
+ ret = ioctl(fd, ioctl_op, &range);
+ if (ret != 0) {
+ error_report("ioctl %s failed %d",
+ ioctl_name, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
static int handle_aiocb_copy_range(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
@@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s,
uint64_t nbytes, int ret)
}
}
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: an array of zone descriptors to hold zone
+ * information on reply
+ * @param offset: offset can be any byte within the zone size.
+ * @param len: (not sure yet.
+ * @return 0 on success, -1 on failure
+ */
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t
offset,
+ int64_t len, int64_t *nr_zones,
+ BlockZoneDescriptor *zones) {
+ BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_IOCTL,
+ .aio_offset = offset,
+ .aio_nbytes = len,
+ .zone_report = {
+ .nr_zones = nr_zones,
+ .zones = zones,
+ },
+ };
+
+ return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
+}
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
+ int64_t offset, int64_t len) {
+ BDRVRawState *s = bs->opaque;
+ RawPosixAIOData acb;
+
+ acb = (RawPosixAIOData) {
+ .bs = bs,
+ .aio_fildes = s->fd,
+ .aio_type = QEMU_AIO_IOCTL,
+ .aio_offset = offset,
+ .aio_nbytes = len,
+ .op = op,
+ };
+
+ return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
+}
+
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
bool blkdev)
@@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
.bdrv_abort_perm_update = raw_abort_perm_update,
.create_opts = &raw_create_opts,
.mutable_opts = mutable_opts,
+
+ .bdrv_co_zone_report = raw_co_zone_report,
+ .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
};
/***********************************************/
@@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
#endif
};
+static BlockDriver bdrv_zoned_host_device = {
+ .format_name = "zoned_host_device",
+ .protocol_name = "zoned_host_device",
+ .instance_size = sizeof(BDRVRawState),
+ .bdrv_needs_filename = true,
+ .bdrv_probe_device = hdev_probe_device,
+ .bdrv_parse_filename = hdev_parse_filename,
+ .bdrv_file_open = hdev_open,
+ .bdrv_close = raw_close,
+ .bdrv_reopen_prepare = raw_reopen_prepare,
+ .bdrv_reopen_commit = raw_reopen_commit,
+ .bdrv_reopen_abort = raw_reopen_abort,
+ .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+ .create_opts = &bdrv_create_opts_simple,
+ .mutable_opts = mutable_opts,
+ .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+ .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+ .bdrv_co_preadv = raw_co_preadv,
+ .bdrv_co_pwritev = raw_co_pwritev,
+ .bdrv_co_flush_to_disk = raw_co_flush_to_disk,
+ .bdrv_co_pdiscard = hdev_co_pdiscard,
+ .bdrv_co_copy_range_from = raw_co_copy_range_from,
+ .bdrv_co_copy_range_to = raw_co_copy_range_to,
+ .bdrv_refresh_limits = raw_refresh_limits,
+ .bdrv_io_plug = raw_aio_plug,
+ .bdrv_io_unplug = raw_aio_unplug,
+ .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+ .bdrv_co_truncate = raw_co_truncate,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_info = raw_get_info,
+ .bdrv_get_allocated_file_size
+ = raw_get_allocated_file_size,
+ .bdrv_get_specific_stats = hdev_get_specific_stats,
+ .bdrv_check_perm = raw_check_perm,
+ .bdrv_set_perm = raw_set_perm,
+ .bdrv_abort_perm_update = raw_abort_perm_update,
+ .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+ .bdrv_probe_geometry = hdev_probe_geometry,
+ .bdrv_co_ioctl = hdev_co_ioctl,
+
+ /* zone management operations */
+ .bdrv_co_zone_report = raw_co_zone_report,
+ .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
static void cdrom_parse_filename(const char *filename, QDict *options,
Error **errp)
@@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
#if defined(HAVE_HOST_BLOCK_DEVICE)
bdrv_register(&bdrv_host_device);
#ifdef __linux__
+ bdrv_register(&bdrv_zoned_host_device);
bdrv_register(&bdrv_host_cdrom);
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..78cddeeda5 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -23,7 +23,6 @@
*/
#ifndef BLOCK_COMMON_H
#define BLOCK_COMMON_H
-
#include "block/aio.h"
#include "block/aio-wait.h"
#include "qemu/iov.h"
@@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
+typedef enum zone_op {
+ zone_open,
+ zone_close,
+ zone_finish,
+ zone_reset,
+} zone_op;
+
+typedef enum zone_model {
+ BLK_Z_HM,
+ BLK_Z_HA,
+} zone_model;
+
+typedef enum BlkZoneCondition {
+ BLK_ZS_NOT_WP = 0x0,
+ BLK_ZS_EMPTY = 0x1,
+ BLK_ZS_IOPEN = 0x2,
+ BLK_ZS_EOPEN = 0x3,
+ BLK_ZS_CLOSED = 0x4,
+ BLK_ZS_RDONLY = 0xD,
+ BLK_ZS_FULL = 0xE,
+ BLK_ZS_OFFLINE = 0xF,
+} BlkZoneCondition;
+
+typedef enum BlkZoneType {
+ BLK_ZT_CONV = 0x1,
+ BLK_ZT_SWR = 0x2,
+ BLK_ZT_SWP = 0x3,
+} BlkZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provide information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+ uint64_t start;
+ uint64_t length;
+ uint64_t cap;
+ uint64_t wp;
+ BlkZoneType type;
+ BlkZoneCondition cond;
+} BlockZoneDescriptor;
+
typedef struct BlockDriverInfo {
/* in bytes, 0 if irrelevant */
int cluster_size;
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..b9ea9db6dc 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
struct BdrvTrackedRequest *waiting_for;
} BdrvTrackedRequest;
+/**
+ * Zone device information data structure.
+ * Provide information on a device.
+ */
+typedef struct zbd_dev {
+ uint32_t zone_size;
+ zone_model model;
+ uint32_t block_size;
+ uint32_t write_granularity;
+ uint32_t nr_zones;
+ struct BlockZoneDescriptor *zones; /* array of zones */
+ uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
+ uint32_t max_nr_active_zones;
+} zbd_dev;
struct BlockDriver {
/*
@@ -691,6 +705,12 @@ struct BlockDriver {
QEMUIOVector *qiov,
int64_t pos);
+ int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+ int64_t offset, int64_t len, int64_t *nr_zones,
+ BlockZoneDescriptor *zones);
+ int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op
op,
+ int64_t offset, int64_t len);
+
/* removable device specific */
bool (*bdrv_is_inserted)(BlockDriverState *bs);
void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
Other than that: Well done!
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
h...@suse.de +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer