On Tue, Oct 04, 2022 at 08:23:15AM +0900, Damien Le Moal wrote:
> On 2022/10/04 2:47, Stefan Hajnoczi wrote:
> > On Thu, Sep 29, 2022 at 04:36:27PM +0800, Sam Li wrote:
> >> Add a new zoned_host_device BlockDriver. The zoned_host_device option
> >> accepts only zoned host block devices. By adding zone management
> >> operations in this new BlockDriver, users can use the new block
> >> layer APIs including Report Zone and four zone management operations
> >> (open, close, finish, reset).
> >>
> >> Qemu-io uses the new APIs to perform zoned storage commands of the device:
> >> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
> >> zone_finish(zf).
> >>
> >> For example, to test zone_report, use following command:
> >> $ ./build/qemu-io --image-opts -n driver=zoned_host_device, 
> >> filename=/dev/nullb0
> >> -c "zrp offset nr_zones"
> >>
> >> Signed-off-by: Sam Li <faithilike...@gmail.com>
> >> Reviewed-by: Hannes Reinecke <h...@suse.de>
> >> ---
> >>  block/block-backend.c             | 146 +++++++++++++
> >>  block/file-posix.c                | 340 +++++++++++++++++++++++++++++-
> >>  block/io.c                        |  41 ++++
> >>  include/block/block-common.h      |   4 +
> >>  include/block/block-io.h          |   7 +
> >>  include/block/block_int-common.h  |  24 +++
> >>  include/block/raw-aio.h           |   6 +-
> >>  include/sysemu/block-backend-io.h |  17 ++
> >>  meson.build                       |   4 +
> >>  qapi/block-core.json              |   8 +-
> >>  qemu-io-cmds.c                    | 148 +++++++++++++
> >>  11 files changed, 741 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/block/block-backend.c b/block/block-backend.c
> >> index d4a5df2ac2..f7f7acd6f4 100644
> >> --- a/block/block-backend.c
> >> +++ b/block/block-backend.c
> >> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
> >>      void *iobuf;
> >>      int ret;
> >>      BdrvRequestFlags flags;
> >> +    union {
> >> +        struct {
> >> +            unsigned int *nr_zones;
> >> +            BlockZoneDescriptor *zones;
> >> +        } zone_report;
> >> +        struct {
> >> +            BlockZoneOp op;
> >> +        } zone_mgmt;
> >> +    };
> >>  } BlkRwCo;
> >>  
> >>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
> >> @@ -1775,6 +1784,143 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
> >>      return ret;
> >>  }
> >>  
> >> +static void blk_aio_zone_report_entry(void *opaque) {
> > 
> > 
> > The coroutine_fn annotation is missing:
> > 
> >   static void coroutine_fn blk_aio_zone_report_entry(void *opaque) {
> > 
> >> +    BlkAioEmAIOCB *acb = opaque;
> >> +    BlkRwCo *rwco = &acb->rwco;
> >> +
> >> +    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> >> +                                   rwco->zone_report.nr_zones,
> >> +                                   rwco->zone_report.zones);
> >> +    blk_aio_complete(acb);
> >> +}
> >> +
> >> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> >> +                                unsigned int *nr_zones,
> >> +                                BlockZoneDescriptor  *zones,
> >> +                                BlockCompletionFunc *cb, void *opaque)
> >> +{
> >> +    BlkAioEmAIOCB *acb;
> >> +    Coroutine *co;
> >> +    IO_CODE();
> >> +
> >> +    blk_inc_in_flight(blk);
> >> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> >> +    acb->rwco = (BlkRwCo) {
> >> +        .blk    = blk,
> >> +        .offset = offset,
> >> +        .ret    = NOT_DONE,
> >> +        .zone_report = {
> >> +            .zones = zones,
> >> +            .nr_zones = nr_zones,
> >> +        },
> >> +    };
> >> +    acb->has_returned = false;
> >> +
> >> +    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> >> +    bdrv_coroutine_enter(blk_bs(blk), co);
> >> +
> >> +    acb->has_returned = true;
> >> +    if (acb->rwco.ret != NOT_DONE) {
> >> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> >> +                                         blk_aio_complete_bh, acb);
> >> +    }
> >> +
> >> +    return &acb->common;
> >> +}
> >> +
> >> +static void blk_aio_zone_mgmt_entry(void *opaque) {
> > 
> > coroutine_fn is missing here.
> > 
> >> +    BlkAioEmAIOCB *acb = opaque;
> >> +    BlkRwCo *rwco = &acb->rwco;
> >> +
> >> +    rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
> >> +                                 rwco->offset, acb->bytes);
> >> +    blk_aio_complete(acb);
> >> +}
> >> +
> >> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >> +                              int64_t offset, int64_t len,
> >> +                              BlockCompletionFunc *cb, void *opaque) {
> >> +    BlkAioEmAIOCB *acb;
> >> +    Coroutine *co;
> >> +    IO_CODE();
> >> +
> >> +    blk_inc_in_flight(blk);
> >> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> >> +    acb->rwco = (BlkRwCo) {
> >> +        .blk    = blk,
> >> +        .offset = offset,
> >> +        .ret    = NOT_DONE,
> >> +        .zone_mgmt = {
> >> +            .op = op,
> >> +        },
> >> +    };
> >> +    acb->bytes = len;
> >> +    acb->has_returned = false;
> >> +
> >> +    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> >> +    bdrv_coroutine_enter(blk_bs(blk), co);
> >> +
> >> +    acb->has_returned = true;
> >> +    if (acb->rwco.ret != NOT_DONE) {
> >> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> >> +                                         blk_aio_complete_bh, acb);
> >> +    }
> >> +
> >> +    return &acb->common;
> >> +}
> >> +
> >> +/*
> >> + * Send a zone_report command.
> >> + * offset is a byte offset from the start of the device. No alignment
> >> + * required for offset.
> >> + * nr_zones represents IN maximum and OUT actual.
> >> + */
> >> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> >> +                                    unsigned int *nr_zones,
> >> +                                    BlockZoneDescriptor *zones)
> >> +{
> >> +    int ret;
> >> +    IO_CODE();
> >> +
> >> +    blk_inc_in_flight(blk); /* increase before waiting */
> >> +    blk_wait_while_drained(blk);
> >> +    if (!blk_is_available(blk)) {
> >> +        blk_dec_in_flight(blk);
> >> +        return -ENOMEDIUM;
> >> +    }
> >> +    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
> >> +    blk_dec_in_flight(blk);
> >> +    return ret;
> >> +}
> >> +
> >> +/*
> >> + * Send a zone_management command.
> >> + * op is the zone operation;
> >> + * offset is the byte offset from the start of the zoned device;
> >> + * len is the maximum number of bytes the command should operate on. It
> >> + * should be aligned with the device zone size.
> >> + */
> >> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> >> +        int64_t offset, int64_t len)
> >> +{
> >> +    int ret;
> >> +    IO_CODE();
> >> +
> >> +
> >> +    blk_inc_in_flight(blk);
> >> +    blk_wait_while_drained(blk);
> >> +
> >> +    ret = blk_check_byte_request(blk, offset, len);
> >> +    if (ret < 0) {
> >> +        blk_dec_in_flight(blk);
> >> +        return ret;
> >> +    }
> >> +
> >> +    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
> >> +    blk_dec_in_flight(blk);
> >> +    return ret;
> >> +}
> >> +
> >>  void blk_drain(BlockBackend *blk)
> >>  {
> >>      BlockDriverState *bs = blk_bs(blk);
> >> diff --git a/block/file-posix.c b/block/file-posix.c
> >> index 0a8b4b426e..0a6c781201 100644
> >> --- a/block/file-posix.c
> >> +++ b/block/file-posix.c
> >> @@ -67,6 +67,9 @@
> >>  #include <sys/param.h>
> >>  #include <sys/syscall.h>
> >>  #include <sys/vfs.h>
> >> +#if defined(CONFIG_BLKZONED)
> >> +#include <linux/blkzoned.h>
> >> +#endif
> >>  #include <linux/cdrom.h>
> >>  #include <linux/fd.h>
> >>  #include <linux/fs.h>
> >> @@ -216,6 +219,15 @@ typedef struct RawPosixAIOData {
> >>              PreallocMode prealloc;
> >>              Error **errp;
> >>          } truncate;
> >> +        struct {
> >> +            unsigned int *nr_zones;
> >> +            BlockZoneDescriptor *zones;
> >> +        } zone_report;
> >> +        struct {
> >> +            unsigned long zone_op;
> >> +            const char *zone_op_name;
> >> +            bool all;
> > 
> > Please remove this field if it is unused.
> > 
> >> +        } zone_mgmt;
> >>      };
> >>  } RawPosixAIOData;
> >>  
> >> @@ -1339,7 +1351,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> >> Error **errp)
> >>  #endif
> >>  
> >>      if (bs->sg || S_ISBLK(st.st_mode)) {
> >> -        int ret = hdev_get_max_hw_transfer(s->fd, &st);
> >> +        ret = hdev_get_max_hw_transfer(s->fd, &st);
> >>  
> >>          if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> >>              bs->bl.max_hw_transfer = ret;
> >> @@ -1356,6 +1368,41 @@ static void raw_refresh_limits(BlockDriverState 
> >> *bs, Error **errp)
> >>          zoned = BLK_Z_NONE;
> >>      }
> >>      bs->bl.zoned = zoned;
> >> +    if (zoned != BLK_Z_NONE) {
> >> +        ret = get_sysfs_long_val(&st, "chunk_sectors");
> >> +        if (ret <= 0) {
> >> +            error_report("Invalid zone size %" PRId32 " sectors ", ret);
> >> +            bs->bl.zoned = BLK_Z_NONE;
> >> +            return;
> >> +        }
> >> +        bs->bl.zone_size = ret * 512;
> >> +
> >> +        ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
> >> +        if (ret > 0) {
> >> +            bs->bl.max_append_sectors = ret / 512;
> >> +        }
> >> +
> >> +        ret = get_sysfs_long_val(&st, "max_open_zones");
> >> +        if (ret >= 0) {
> >> +            bs->bl.max_open_zones = ret;
> >> +        }
> >> +
> >> +        ret = get_sysfs_long_val(&st, "max_active_zones");
> >> +        if (ret >= 0) {
> >> +            bs->bl.max_active_zones = ret;
> >> +        }
> >> +        
> >> +        ret = get_sysfs_long_val(&st, "nr_zones");
> >> +        if (ret >= 0) {
> >> +            bs->bl.nr_zones = ret;
> >> +        }
> >> +
> >> +        ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity);
> >> +        if (ret != 0) {
> >> +            error_report("Invalid device capacity %" PRId64 " bytes ", 
> >> bs->bl.capacity);
> >> +            return;
> >> +        }
> > 
> > The QEMU block layer already knows the capacity of the device. Can
> > bdrv_getlength() be used instead of introducing a new
> > BlockLimits.capacity field?
> > 
> >> +    }
> >>  }
> >>  
> >>  static int check_for_dasd(int fd)
> >> @@ -1850,6 +1897,147 @@ static off_t copy_file_range(int in_fd, off_t 
> >> *in_off, int out_fd,
> >>  }
> >>  #endif
> >>  
> >> +/*
> >> + * parse_zone - Fill a zone descriptor
> >> + */
> >> +#if defined(CONFIG_BLKZONED)
> >> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> >> +                              const struct blk_zone *blkz,
> >> +                              const struct blk_zone_report *rep) {
> >> +    zone->start = blkz->start << BDRV_SECTOR_BITS;
> >> +    zone->length = blkz->len << BDRV_SECTOR_BITS;
> >> +    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
> >> +    
> >> +    if (rep->flags & BLK_ZONE_REP_CAPACITY) {
> >> +        zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
> > 
> > #ifdef HAVE_BLK_ZONE_REP_CAPACITY is needed since the rep->flags and
> > blkz->capacity fields are missing and would cause a compilation error
> > when HAVE_BLK_ZONE_REP_CAPACITY is not defined:
> > 
> >   zone->cap = blkz->len << BDRV_SECTOR_BITS;
> >   #ifdef HAVE_BLK_ZONE_REP_CAPACITY
> >   /* Replace with the dedicated field on newer kernels */
> >   if (rep->flags & BLK_ZONE_REP_CAPACITY) {
> >       zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
> >   }
> >   #endif
> 
> It would be a lot cleaner to do something like this:
> 
> in the block common header file, add:
> 
> #ifdef HAVE_BLK_ZONE_REP_CAPACITY
> 
> #define BLK_ZONE_REP_CAPACITY   (1 << 0)
> 
> struct blk_zone_v2 {
>         __u64   start;          /* Zone start sector */
>         __u64   len;            /* Zone length in number of sectors */
>         __u64   wp;             /* Zone write pointer position */
>         __u8    type;           /* Zone type */
>         __u8    cond;           /* Zone condition */
>         __u8    non_seq;        /* Non-sequential write resources active */
>         __u8    reset;          /* Reset write pointer recommended */
>         __u8    resv[4];
>         __u64   capacity;       /* Zone capacity in number of sectors */
>         __u8    reserved[24];
> };
> #define blk_zone blk_zone_v2
> 
> struct blk_zone_report_v2 {
>         __u64   sector;
>         __u32   nr_zones;
>         __u32   flags;
>       struct blk_zone zones[0];
> };
> #define blk_zone_report blk_zone_report_v2
> 
> #endif
> 
> Then the above code becomes:
> 
> if (rep->flags & BLK_ZONE_REP_CAPACITY) {
>     zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
> } else {
>     zone->cap = blkz->len << BDRV_SECTOR_BITS;
> }
> 
> No #ifdef in the C code, only in the header and that compiles and works for 
> all
> host kernel versions.

The ->flags field doesn't exist in old versions of the header. How will
"if (rep->flags ..." compile on old systems?

Stefan

Attachment: signature.asc
Description: PGP signature

Reply via email to