Hi Paolo,

On Tue, Aug 12, 2014 at 3:37 AM, Paolo Bonzini <pbonz...@redhat.com> wrote:
> On 10/08/2014 05:46, Ming Lei wrote:
>> Hi Kevin, Paolo, Stefan and all,
>>
>> On Wed, 6 Aug 2014 10:48:55 +0200
>> Kevin Wolf <kw...@redhat.com> wrote:
>>
>>> On 06.08.2014 at 07:33, Ming Lei wrote:
>>>
>>> Anyhow, the coroutine version of your benchmark is buggy: it leaks all
>>> coroutines instead of exiting them, so it can't make any use of the
>>> coroutine pool. On my laptop, I get this (where "fixed coro" is a
>>> version that simply removes the yield at the end):
>>>
>>>                 | bypass        | fixed coro    | buggy coro
>>> ----------------+---------------+---------------+--------------
>>> time            | 1.09s         | 1.10s         | 1.62s
>>> L1-dcache-loads | 921,836,360   | 932,781,747   | 1,298,067,438
>>> insns per cycle | 2.39          | 2.39          | 1.90
>>>
>>> This begs the question whether you see a similar effect on a real qemu
>>> and the coroutine pool is still not big enough. With correct use of
>>> coroutines, the difference seems to be barely measurable even without
>>> any I/O involved.
>>
>> Now I have fixed the coroutine leak bug. The previous crypt benchmark
>> put quite a heavy load on each iteration, which kept operations per
>> second very low (~40K/sec), so I wrote a new, simpler benchmark that
>> generates hundreds of thousands of operations per second, a number that
>> should match fast storage devices, and it does show a measurable effect
>> from coroutines.
>>
>> In the extreme case where only a getppid() syscall is run in each
>> iteration, with coroutines only 3M operations/sec can be reached, while
>> without coroutines the number reaches 16M/sec: more than a 4x
>> difference!
>
> I should be on vacation, but I'm following a couple of threads on the
> mailing list and I'm a bit tired of hearing the same argument again and
> again...
>
> The characteristics of asynchronous I/O are so different from any
> synchronous workload that it is hard to be sure that microbenchmarks
> make sense.
>
> The patch below is basically the minimal change needed to bypass
> coroutines. Of course the block.c part is not acceptable as is (the
> change to refresh_total_sectors is broken, the others are just ugly),
> but it is a start. Please run it with your fio workloads, or write an
> aio-based version of a qemu-img/qemu-io *I/O* benchmark.
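For the archive: the coroutine leak Kevin pointed out was simply that my
benchmark's coroutine entry ended in a yield, so the coroutines never
terminated and the pool never got a chance to recycle them. Below is a
minimal sketch of the getppid() case against the current coroutine API;
bench() and its iteration count are made up for illustration, only the
qemu_coroutine_*() calls are the real API:

#include <unistd.h>
#include "block/coroutine.h"

static void coroutine_fn bench_entry(void *opaque)
{
    getppid();
    /* The buggy version called qemu_coroutine_yield() here, so the
     * coroutine never terminated and could never return to the pool.
     * Returning normally, as the "fixed coro" column does, lets the
     * coroutine exit and be recycled. */
}

static void bench(long iterations)
{
    while (iterations--) {
        Coroutine *co = qemu_coroutine_create(bench_entry);
        qemu_coroutine_enter(co, NULL);
    }
}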
I have to say this approach is much more clever, and better than mine. I
just ran a quick fio randread test in a VM, and IOPS improves by more
than 10% compared with my bypass-coroutine patch. Hope it can be merged
soon, :-)

Great thanks, Paolo.

Thanks,

> Paolo
>
> diff --git a/block.c b/block.c
> index 3e252a2..0b6e9cf 100644
> --- a/block.c
> +++ b/block.c
> @@ -704,7 +704,7 @@ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
>          return 0;
>
>      /* query actual device if possible, otherwise just trust the hint */
> -    if (drv->bdrv_getlength) {
> +    if (!hint && drv->bdrv_getlength) {
>          int64_t length = drv->bdrv_getlength(bs);
>          if (length < 0) {
>              return length;
> @@ -2651,9 +2651,6 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
>      if (!bdrv_is_inserted(bs))
>          return -ENOMEDIUM;
>
> -    if (bs->growable)
> -        return 0;
> -
>      len = bdrv_getlength(bs);
>
>      if (offset < 0)
> @@ -3107,7 +3104,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
>      if (!drv) {
>          return -ENOMEDIUM;
>      }
> -    if (bdrv_check_byte_request(bs, offset, bytes)) {
> +    if (!bs->growable && bdrv_check_byte_request(bs, offset, bytes)) {
>          return -EIO;
>      }
>
> @@ -3347,7 +3344,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
>      if (bs->read_only) {
>          return -EACCES;
>      }
> -    if (bdrv_check_byte_request(bs, offset, bytes)) {
> +    if (!bs->growable && bdrv_check_byte_request(bs, offset, bytes)) {
>          return -EIO;
>      }
>
> @@ -4356,6 +4353,20 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>  {
>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>
> +    if (bs->drv && bs->drv->bdrv_aio_readv &&
> +        bs->drv->bdrv_aio_readv != bdrv_aio_readv_em &&
> +        nb_sectors >= 0 && nb_sectors <= (UINT_MAX >> BDRV_SECTOR_BITS) &&
> +        !bdrv_check_byte_request(bs, sector_num << BDRV_SECTOR_BITS,
> +                                 nb_sectors << BDRV_SECTOR_BITS) &&
> +        !bs->copy_on_read && !bs->io_limits_enabled &&
> +        bs->request_alignment <= BDRV_SECTOR_SIZE) {
> +        BlockDriverAIOCB *acb =
> +            bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
> +                                    cb, opaque);
> +        assert(acb);
> +        return acb;
> +    }
> +
>      return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
>                                   cb, opaque, false);
>  }
> @@ -4366,6 +4377,24 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>  {
>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>
> +    if (bs->drv && bs->drv->bdrv_aio_writev &&
> +        bs->drv->bdrv_aio_writev != bdrv_aio_writev_em &&
> +        nb_sectors >= 0 && nb_sectors <= (UINT_MAX >> BDRV_SECTOR_BITS) &&
> +        !bdrv_check_byte_request(bs, sector_num << BDRV_SECTOR_BITS,
> +                                 nb_sectors << BDRV_SECTOR_BITS) &&
> +        !bs->read_only && !bs->io_limits_enabled &&
> +        bs->request_alignment <= BDRV_SECTOR_SIZE &&
> +        bs->enable_write_cache &&
> +        QLIST_EMPTY(&bs->before_write_notifiers.notifiers) &&
> +        bs->wr_highest_sector >= sector_num + nb_sectors - 1 &&
> +        QLIST_EMPTY(&bs->dirty_bitmaps)) {
> +        BlockDriverAIOCB *acb =
> +            bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
> +                                     cb, opaque);
> +        assert(acb);
> +        return acb;
> +    }
> +
>      return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
>                                   cb, opaque, true);
>  }
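(If I read the fast path above correctly, the long condition list just
detects every case the coroutine path exists to handle: a driver without
native AIO, out-of-range requests, copy-on-read, I/O throttling,
sub-sector request alignment, and for writes the writethrough flush,
before-write notifiers, the wr_highest_sector update and dirty bitmap
marking. Only a request that none of these can touch goes straight to
the driver's AIO callback.)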
> diff --git a/block/raw_bsd.c b/block/raw_bsd.c
> index 492f58d..b86f26b 100644
> --- a/block/raw_bsd.c
> +++ b/block/raw_bsd.c
> @@ -48,6 +48,22 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state,
>      return 0;
>  }
>
> +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, int64_t sector_num,
> +                                       QEMUIOVector *qiov, int nb_sectors,
> +                                       BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
> +    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
> +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, int64_t sector_num,
> +                                        QEMUIOVector *qiov, int nb_sectors,
> +                                        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
> +    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
>  static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
>                                       int nb_sectors, QEMUIOVector *qiov)
>  {
> @@ -181,6 +197,8 @@ static BlockDriver bdrv_raw = {
>      .bdrv_open = &raw_open,
>      .bdrv_close = &raw_close,
>      .bdrv_create = &raw_create,
> +    .bdrv_aio_readv = &raw_aio_readv,
> +    .bdrv_aio_writev = &raw_aio_writev,
>      .bdrv_co_readv = &raw_co_readv,
>      .bdrv_co_writev = &raw_co_writev,
>      .bdrv_co_write_zeroes = &raw_co_write_zeroes,
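By the way, the fio job for the randread test I mentioned above was
along these lines. This is only a sketch; the guest device path, block
size, queue depth and runtime are illustrative, not the exact values I
used:

; guest-side fio job (illustrative values)
[randread-test]
filename=/dev/vdb
rw=randread
ioengine=libaio
direct=1
bs=4k
iodepth=32
runtime=30
time_based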