From: Evgeny Yakovlev <eyakov...@virtuozzo.com> Some guests (win2008 server for example) do a lot of unnecessary flushing when the underlying media has not changed. This adds additional overhead on the host when calling fsync/fdatasync.
This change introduces a write generation scheme in BlockDriverState. Current write generation is checked against last flushed generation to avoid unnecessary flushes. The problem with excessive flushing was found by a performance test which does parallel directory tree creation (from 2 processes). Results improved from 0.424 loops/sec to 0.432 loops/sec. Each loop creates 10^3 directories with 10 files in each. Signed-off-by: Evgeny Yakovlev <eyakov...@virtuozzo.com> Signed-off-by: Denis V. Lunev <d...@openvz.org> Reviewed-by: Paolo Bonzini <pbonz...@redhat.com> CC: Kevin Wolf <kw...@redhat.com> CC: Max Reitz <mre...@redhat.com> CC: Stefan Hajnoczi <stefa...@redhat.com> CC: Fam Zheng <f...@redhat.com> CC: John Snow <js...@redhat.com> --- block.c | 3 +++ block/io.c | 21 +++++++++++++++++++++ include/block/block_int.h | 5 +++++ 3 files changed, 29 insertions(+) diff --git a/block.c b/block.c index c2fb8bd..b88ad31 100644 --- a/block.c +++ b/block.c @@ -234,6 +234,8 @@ BlockDriverState *bdrv_new(void) bs->refcnt = 1; bs->aio_context = qemu_get_aio_context(); + qemu_co_queue_init(&bs->flush_queue); + QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list); return bs; @@ -2472,6 +2474,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset) ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); bdrv_dirty_bitmap_truncate(bs); bdrv_parent_cb_resize(bs); + ++bs->write_gen; } return ret; } diff --git a/block/io.c b/block/io.c index 7086908..f181ff7 100644 --- a/block/io.c +++ b/block/io.c @@ -1303,6 +1303,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); + ++bs->write_gen; bdrv_set_dirty(bs, start_sector, end_sector - start_sector); if (bs->wr_highest_offset < offset + bytes) { @@ -2235,6 +2236,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + int current_gen = bs->write_gen; + + /* Wait until any previous flushes are completed */ + 
while (bs->flush_started_gen != bs->flushed_gen) { + qemu_co_queue_wait(&bs->flush_queue); + } + + bs->flush_started_gen = current_gen; + /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { ret = bs->drv->bdrv_co_flush(bs); @@ -2255,6 +2265,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) goto flush_parent; } + /* Check if we really need to flush anything */ + if (bs->flushed_gen == current_gen) { + goto flush_parent; + } + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); if (bs->drv->bdrv_co_flush_to_disk) { ret = bs->drv->bdrv_co_flush_to_disk(bs); @@ -2285,6 +2300,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) */ ret = 0; } + if (ret < 0) { goto out; } @@ -2295,6 +2311,10 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) flush_parent: ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; out: + /* Notify any pending flushes that we have completed */ + bs->flushed_gen = current_gen; + qemu_co_queue_restart_all(&bs->flush_queue); + tracked_request_end(&req); return ret; } @@ -2420,6 +2440,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, } ret = 0; out: + ++bs->write_gen; bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); diff --git a/include/block/block_int.h b/include/block/block_int.h index 042c118..104d9ab 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -439,6 +439,11 @@ struct BlockDriverState { int copy_on_read; /* if nonzero, copy read backing sectors into image. note this is a reference count */ + CoQueue flush_queue; /* Serializing flush queue */ + unsigned int write_gen; /* Current data generation */ + unsigned int flush_started_gen; /* Generation for which flush has started */ + unsigned int flushed_gen; /* Flushed write generation */ + BlockDriver *drv; /* NULL means no media */ void *opaque; -- 2.1.4