Hi Paolo,

On Tue, Aug 12, 2014 at 3:37 AM, Paolo Bonzini <pbonz...@redhat.com> wrote:
> On 10/08/2014 05:46, Ming Lei wrote:
>> Hi Kevin, Paolo, Stefan and all,
>>
>> On Wed, 6 Aug 2014 10:48:55 +0200
>> Kevin Wolf <kw...@redhat.com> wrote:
>>
>>> On 06.08.2014 at 07:33, Ming Lei wrote:
>>>
>>> Anyhow, the coroutine version of your benchmark is buggy: it leaks all
>>> coroutines instead of exiting them, so it can't make any use of the
>>> coroutine pool. On my laptop, I get this (where "fixed coro" is a
>>> version that simply removes the yield at the end):
>>>
>>>                 | bypass        | fixed coro    | buggy coro
>>> ----------------+---------------+---------------+--------------
>>> time            | 1.09s         | 1.10s         | 1.62s
>>> L1-dcache-loads | 921,836,360   | 932,781,747   | 1,298,067,438
>>> insns per cycle | 2.39          | 2.39          | 1.90
>>>
>>> This begs the question whether you see a similar effect on a real qemu
>>> and the coroutine pool is still not big enough. With correct use of
>>> coroutines, the difference seems to be barely measurable even without
>>> any I/O involved.
>>
>> Now I have fixed the coroutine leak bug. The previous crypt benchmark
>> put quite a heavy load on each iteration, which kept operations per
>> second very low (~40K/sec), so I wrote a new, simpler benchmark that
>> generates hundreds of thousands of operations per second, a number that
>> should match fast storage devices, and it does show a measurable effect
>> from coroutines.
>>
>> In the extreme case where only a getppid() syscall is run in each
>> iteration, with coroutines only 3M operations/sec can be reached, while
>> without coroutines the number reaches 16M/sec: more than a 4x
>> difference!
>
> I should be on vacation, but I'm following a couple of threads on the
> mailing list and I'm a bit tired of hearing the same argument again and
> again...
>
> The characteristics of asynchronous I/O are so different from any
> synchronous workload that it is hard to be sure that microbenchmarks
> make sense.
>
> The patch below is basically the minimal change needed to bypass
> coroutines. Of course the block.c part is not acceptable as is (the
> change to refresh_total_sectors is broken, the others are just ugly),
> but it is a start. Please run it with your fio workloads, or write an
> aio-based version of a qemu-img/qemu-io *I/O* benchmark.
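For the archive: the coroutine leak Kevin pointed out was simply that my
benchmark's coroutine entry ended in a yield, so the coroutines never
terminated and the pool never got a chance to recycle them. Below is a
minimal sketch of the getppid() case against the current coroutine API;
bench() and its iteration count are made up for illustration, only the
qemu_coroutine_*() calls are the real API:

#include <unistd.h>
#include "block/coroutine.h"

static void coroutine_fn bench_entry(void *opaque)
{
    getppid();
    /* The buggy version called qemu_coroutine_yield() here, so the
     * coroutine never terminated and could never return to the pool.
     * Returning normally, as the "fixed coro" column does, lets the
     * coroutine exit and be recycled. */
}

static void bench(long iterations)
{
    while (iterations--) {
        Coroutine *co = qemu_coroutine_create(bench_entry);
        qemu_coroutine_enter(co, NULL);
    }
}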
I have to say this approach is much more clever, and better than mine. I
just ran a quick fio randread test in a VM, and IOPS improves by more
than 10% compared with my bypass-coroutine patch. Hope it can be merged
soon, :-)

Great thanks, Paolo.

Thanks,

> Paolo
>
> diff --git a/block.c b/block.c
> index 3e252a2..0b6e9cf 100644
> --- a/block.c
> +++ b/block.c
> @@ -704,7 +704,7 @@ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
>          return 0;
>
>      /* query actual device if possible, otherwise just trust the hint */
> -    if (drv->bdrv_getlength) {
> +    if (!hint && drv->bdrv_getlength) {
>          int64_t length = drv->bdrv_getlength(bs);
>          if (length < 0) {
>              return length;
> @@ -2651,9 +2651,6 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
>      if (!bdrv_is_inserted(bs))
>          return -ENOMEDIUM;
>
> -    if (bs->growable)
> -        return 0;
> -
>      len = bdrv_getlength(bs);
>
>      if (offset < 0)
> @@ -3107,7 +3104,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
>      if (!drv) {
>          return -ENOMEDIUM;
>      }
> -    if (bdrv_check_byte_request(bs, offset, bytes)) {
> +    if (!bs->growable && bdrv_check_byte_request(bs, offset, bytes)) {
>          return -EIO;
>      }
>
> @@ -3347,7 +3344,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
>      if (bs->read_only) {
>          return -EACCES;
>      }
> -    if (bdrv_check_byte_request(bs, offset, bytes)) {
> +    if (!bs->growable && bdrv_check_byte_request(bs, offset, bytes)) {
>          return -EIO;
>      }
>
> @@ -4356,6 +4353,20 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>  {
>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>
> +    if (bs->drv && bs->drv->bdrv_aio_readv &&
> +        bs->drv->bdrv_aio_readv != bdrv_aio_readv_em &&
> +        nb_sectors >= 0 && nb_sectors <= (UINT_MAX >> BDRV_SECTOR_BITS) &&
> +        !bdrv_check_byte_request(bs, sector_num << BDRV_SECTOR_BITS,
> +                                 nb_sectors << BDRV_SECTOR_BITS) &&
> +        !bs->copy_on_read && !bs->io_limits_enabled &&
> +        bs->request_alignment <= BDRV_SECTOR_SIZE) {
> +        BlockDriverAIOCB *acb =
> +            bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
> +                                    cb, opaque);
> +        assert(acb);
> +        return acb;
> +    }
> +
>      return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
>                                   cb, opaque, false);
>  }
> @@ -4366,6 +4377,24 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>  {
>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>
> +    if (bs->drv && bs->drv->bdrv_aio_writev &&
> +        bs->drv->bdrv_aio_writev != bdrv_aio_writev_em &&
> +        nb_sectors >= 0 && nb_sectors <= (UINT_MAX >> BDRV_SECTOR_BITS) &&
> +        !bdrv_check_byte_request(bs, sector_num << BDRV_SECTOR_BITS,
> +                                 nb_sectors << BDRV_SECTOR_BITS) &&
> +        !bs->read_only && !bs->io_limits_enabled &&
> +        bs->request_alignment <= BDRV_SECTOR_SIZE &&
> +        bs->enable_write_cache &&
> +        QLIST_EMPTY(&bs->before_write_notifiers.notifiers) &&
> +        bs->wr_highest_sector >= sector_num + nb_sectors - 1 &&
> +        QLIST_EMPTY(&bs->dirty_bitmaps)) {
> +        BlockDriverAIOCB *acb =
> +            bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
> +                                     cb, opaque);
> +        assert(acb);
> +        return acb;
> +    }
> +
>      return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
>                                   cb, opaque, true);
>  }
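(If I read the fast path above correctly, the long condition list just
detects every case the coroutine path exists to handle: a driver without
native AIO, out-of-range requests, copy-on-read, I/O throttling,
sub-sector request alignment, and for writes the writethrough flush,
before-write notifiers, the wr_highest_sector update and dirty bitmap
marking. Only a request that none of these can touch goes straight to
the driver's AIO callback.)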
> diff --git a/block/raw_bsd.c b/block/raw_bsd.c
> index 492f58d..b86f26b 100644
> --- a/block/raw_bsd.c
> +++ b/block/raw_bsd.c
> @@ -48,6 +48,22 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state,
>      return 0;
>  }
>
> +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, int64_t sector_num,
> +                                       QEMUIOVector *qiov, int nb_sectors,
> +                                       BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
> +    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
> +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, int64_t sector_num,
> +                                        QEMUIOVector *qiov, int nb_sectors,
> +                                        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
> +    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
>  static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
>                                       int nb_sectors, QEMUIOVector *qiov)
>  {
> @@ -181,6 +197,8 @@ static BlockDriver bdrv_raw = {
>      .bdrv_open = &raw_open,
>      .bdrv_close = &raw_close,
>      .bdrv_create = &raw_create,
> +    .bdrv_aio_readv = &raw_aio_readv,
> +    .bdrv_aio_writev = &raw_aio_writev,
>      .bdrv_co_readv = &raw_co_readv,
>      .bdrv_co_writev = &raw_co_writev,
>      .bdrv_co_write_zeroes = &raw_co_write_zeroes,
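By the way, the fio job for the randread test I mentioned above was
along these lines. This is only a sketch; the guest device path, block
size, queue depth and runtime are illustrative, not the exact values I
used:

; guest-side fio job (illustrative values)
[randread-test]
filename=/dev/vdb
rw=randread
ioengine=libaio
direct=1
bs=4k
iodepth=32
runtime=30
time_based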