Damien Le Moal <damien.lem...@opensource.wdc.com> 于2022年9月11日周日 16:36写道: > > On 2022/09/11 17:00, Sam Li wrote: > [...] > >>> @@ -1604,6 +1629,12 @@ static ssize_t > >>> handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > >>> (const char *)buf + offset, > >>> aiocb->aio_nbytes - offset, > >>> aiocb->aio_offset + offset); > >>> + } else if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { > >>> + uint64_t wp = aiocb->aio_offset; > >> > >> This variable is not necessary. > >> > >>> + len = pwrite(aiocb->aio_fildes, > >>> + (const char *)buf + offset, > >>> + aiocb->aio_nbytes - offset, > >>> + wp + offset); > >>> } else { > >>> len = pread(aiocb->aio_fildes, > >>> buf + offset, > >>> @@ -1638,7 +1669,6 @@ static int handle_aiocb_rw(void *opaque) > >>> RawPosixAIOData *aiocb = opaque; > >>> ssize_t nbytes; > >>> char *buf; > >>> - > >> > >> whiteline change. > >> > >>> if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { > >>> /* > >>> * If there is just a single buffer, and it is properly aligned > >>> @@ -1692,7 +1722,7 @@ static int handle_aiocb_rw(void *opaque) > >>> } > >>> > >>> nbytes = handle_aiocb_rw_linear(aiocb, buf); > >>> - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > >>> + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > >>> char *p = buf; > >>> size_t count = aiocb->aio_nbytes, copy; > >>> int i; > >>> @@ -1713,6 +1743,25 @@ static int handle_aiocb_rw(void *opaque) > >>> > >>> out: > >>> if (nbytes == aiocb->aio_nbytes) { > >>> +#if defined(CONFIG_BLKZONED) > >>> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > >>> + BlockZoneDescriptor *zone = aiocb->io.zone; > >>> + int64_t nr_sectors = aiocb->aio_nbytes / 512; > >>> + if (zone) { > >>> + qemu_mutex_init(&zone->lock); > >>> + if (zone->type == BLK_ZT_SWR) { > >>> + qemu_mutex_lock(&zone->lock); > >>> + zone->wp += nr_sectors; > >>> + qemu_mutex_unlock(&zone->lock); > >>> + } > >>> + qemu_mutex_destroy(&zone->lock); > >> > >> This is weird. 
you init the mutex, lock/unlock it and destroy it. So it has > >> absolutely no meaning at all. > > > > I was thinking whether to init every lock > > for all the zones or init the lock > > for the zone that needed it. The confusion I have here is the cost of > > initializing/destroying the lock. > > A mutex needs to be initialized before it is used and should not be > re-initialized, ever, until it is not needed anymore. That is, in this case, > since the mutex protects a zone wp tracking entry, it should be initialized > when > the array of zone wp is allocated & initialized with a zone report, and the > mutex destroyed when that same array is freed. > > The cost of initializing & destroying a mutex is low. And since that is not > done > in the hot IO path, you do not need to worry about it.
I see, thanks! > > [...] > >>> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > >>> + int64_t *offset, > >>> + QEMUIOVector *qiov, > >>> + BdrvRequestFlags flags) { > >>> +#if defined(CONFIG_BLKZONED) > >>> + BDRVRawState *s = bs->opaque; > >>> + int64_t zone_sector = bs->bl.zone_sectors; > >>> + int64_t zone_sector_mask = zone_sector - 1; > >>> + int64_t iov_len = 0; > >>> + int64_t len = 0; > >>> + RawPosixAIOData acb; > >>> + > >>> + if (*offset & zone_sector_mask) { > >>> + error_report("offset %" PRId64 " is not aligned to zone size " > >>> + "%" PRId64 "", *offset, zone_sector); > >>> + return -EINVAL; > >>> + } > >>> + > >>> + int64_t lbsz = bs->bl.logical_block_size;> + int64_t lbsz_mask = > >>> lbsz - 1; > >>> + for (int i = 0; i < qiov->niov; i++) { > >>> + iov_len = qiov->iov[i].iov_len; > >>> + if (iov_len & lbsz_mask) { > >>> + error_report("len of IOVector[%d] %" PRId64 " is not aligned > >>> to block " > >>> + "size %" PRId64 "", i, iov_len, lbsz); > >>> + return -EINVAL; > >>> + } > >> > >> This alignment check should be against the device write granularity, not > >> the > >> logical block size. The write granularity will always be equal to the > >> device > >> physical block size, which may or may not be equal to the device logical > >> block > >> size. E.g. a 512e SMR disk has a 512B logical block size but a 4096B > >> physical > >> block size. And the ZBC & ZAC specifications mandate that all writes be > >> aligned > >> to the physical block size. > > > > I see. I'll change it to physical block size. > > I would use a field called "write_granularity" since the virtio specs will > introduce that anyway. This zone granularity is indeed going to be equal to > the > physical block size of the host device for now. > > [...]
> >>> /* removable device specific */ > >>> bool (*bdrv_is_inserted)(BlockDriverState *bs); > >>> @@ -854,6 +857,12 @@ typedef struct BlockLimits { > >>> > >>> /* maximum number of active zones */ > >>> int64_t max_active_zones; > >>> + > >>> + /* array of zones in the zoned block device. Only tracks write > >>> pointer's > >>> + * location of each zone as a helper for zone_append API */ > >>> + BlockZoneDescriptor *zones; > >> > >> This is a lot of memory for only tracking the wp... Why not reduce this to > >> an > >> array of int64 values, for the wp only ? As you may need the zone type too > >> (conventional vs sequential), you can use the most significant bit (a zone > >> wp > >> value will never use all 64 bits !). > >> > >> Or devise another zone structure with zone type, zone wp and mutex only, so > >> smaller than the regular zone report structure. > > > > I was just trying to reuse do_zone_report. It is better to track > > the wp only. Then a new struct and smaller zone_report should be > > introduced for it. > > Yes, this will use less memory, which is always good. > > -- > Damien Le Moal > Western Digital Research >