Am 18.01.2012 15:59, schrieb Stefan Hajnoczi: > The ability to zero regions of an image file is a useful primitive for > higher-level features such as image streaming or zero write detection. > > Image formats may support an optimized metadata representation instead > of writing zeroes into the image file. This allows zero writes to be > potentially faster than regular write operations and also preserve > sparseness of the image file. > > The .bdrv_co_write_zeroes() interface should be implemented by block > drivers that wish to provide efficient zeroing. > > Note that this operation is different from the discard operation, which > may leave the contents of the region indeterminate. That means > discarded blocks are not guaranteed to contain zeroes and may contain > junk data instead. > > Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> > --- > block.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++------ > block.h | 7 +++++++ > block_int.h | 8 ++++++++ > trace-events | 1 + > 4 files changed, 63 insertions(+), 6 deletions(-) > > diff --git a/block.c b/block.c > index 3621d11..c9fa5c1 100644 > --- a/block.c > +++ b/block.c > @@ -50,6 +50,7 @@ > > typedef enum { > BDRV_REQ_COPY_ON_READ = 0x1, > + BDRV_REQ_ZERO_WRITE = 0x2, > } BdrvRequestFlags; > > static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); > @@ -69,7 +70,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState > *bs, > int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, > BdrvRequestFlags flags); > static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, > - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); > + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, > + BdrvRequestFlags flags); > static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, > int64_t sector_num, > QEMUIOVector *qiov, > @@ -1300,7 +1302,7 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) > rwco->nb_sectors, rwco->qiov, 0); > } else { > rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, > - rwco->nb_sectors, rwco->qiov); > + rwco->nb_sectors, rwco->qiov, 0); > } > } > > @@ -1639,11 +1641,37 @@ int coroutine_fn > bdrv_co_copy_on_readv(BlockDriverState *bs, > BDRV_REQ_COPY_ON_READ); > } > > +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, > + int64_t sector_num, int nb_sectors) > +{ > + BlockDriver *drv = bs->drv; > + QEMUIOVector qiov; > + struct iovec iov; > + int ret; > + > + /* First try the efficient write zeroes operation */ > + if (drv->bdrv_co_write_zeroes) { > + return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); > + } > + > + /* Fall back to bounce buffer if write zeroes is unsupported */ > + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; > + iov.iov_base = qemu_blockalign(bs, iov.iov_len); > + memset(iov.iov_base, 0, iov.iov_len); > + qemu_iovec_init_external(&qiov, &iov, 1); > + > + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); > + > + qemu_vfree(iov.iov_base); > + return ret; > +} > + > /* > * Handle a write request in coroutine context > */ > static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, > - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) > + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, > + BdrvRequestFlags flags) > { > BlockDriver *drv = bs->drv; > BdrvTrackedRequest req; > @@ -1670,7 +1698,11 @@ static int coroutine_fn > bdrv_co_do_writev(BlockDriverState *bs, > > tracked_request_begin(&req, bs, sector_num, nb_sectors, true); > > - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); > + if (flags & BDRV_REQ_ZERO_WRITE) { > + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); > + } else { > + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); > + } > > if (bs->dirty_bitmap) { > set_dirty_bitmap(bs, sector_num, nb_sectors, 1); > @@ -1690,7 +1722,16 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, > int64_t sector_num, > { > trace_bdrv_co_writev(bs, sector_num, nb_sectors); > > - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov); > + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); > +} > + > +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, > + int64_t sector_num, int nb_sectors) > +{ > + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); > + > + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, > + BDRV_REQ_ZERO_WRITE); > } > > /** > @@ -3192,7 +3233,7 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque) > acb->req.nb_sectors, acb->req.qiov, 0); > } else { > acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, > - acb->req.nb_sectors, acb->req.qiov); > + acb->req.nb_sectors, acb->req.qiov, 0); > } > > acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); > diff --git a/block.h b/block.h > index cae289b..9f3aad3 100644 > --- a/block.h > +++ b/block.h > @@ -146,6 +146,13 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState > *bs, > int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); > int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, > int nb_sectors, QEMUIOVector *qiov); > +/* > + * Efficiently zero a region of the disk image. Note that this is a regular > + * I/O request like read or write and should have a reasonable size. This > + * function is not suitable for zeroing the entire image in a single request. > + */
The reason for this is that in the fallback case you allocate memory for the whole request, right? So what about just limiting the allocation in bdrv_co_do_write_zeroes() to a reasonable size and putting a loop there for large requests? Kevin