24.01.2019 17:17, Kevin Wolf wrote: > Depending on the exact image layout and the storage backend (tmpfs is > konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can > save us a lot of time e.g. during a mirror block job or qemu-img convert > with a fragmented source image (.bdrv_co_block_status on the protocol > layer can be called for every single cluster in the extreme case). > > We may only cache data regions because of possible concurrent writers. > This means that we can later treat a recently punched hole as data, but > this is safe. We can't cache holes because then we might treat recently > written data as holes, which can cause corruption. > > Signed-off-by: Kevin Wolf <kw...@redhat.com> > --- > block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 49 insertions(+), 2 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 8aee7a3fb8..7272c7c99d 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -168,6 +168,12 @@ typedef struct BDRVRawState { > bool needs_alignment; > bool check_cache_dropped; > > + struct seek_data_cache { > + bool valid; > + uint64_t start; > + uint64_t end; > + } seek_data_cache; > + > PRManager *pr_mgr; > } BDRVRawState; > > @@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void > *opaque) > { > RawPosixAIOData *aiocb = opaque; > BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque; > + struct seek_data_cache *sdc; > int ret; > > + /* Invalidate seek_data_cache if it overlaps */ > + sdc = &s->seek_data_cache; > + if (sdc->valid && !(sdc->end < aiocb->aio_offset || > + sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
to be presize: <= and >= > + { > + sdc->valid = false; > + } > + > /* First try to write zeros and unmap at the same time */ > Why not to drop cache on handle_aiocb_write_zeroes()? Otherwise, we'll return DATA for these regions which may unallocated read-as-zero, if I'm not mistaken. > #ifdef CONFIG_FALLOCATE_PUNCH_HOLE > @@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque) > RawPosixAIOData *aiocb = opaque; > int ret = -EOPNOTSUPP; > BDRVRawState *s = aiocb->bs->opaque; > + struct seek_data_cache *sdc; > > if (!s->has_discard) { > return -ENOTSUP; > } > > + /* Invalidate seek_data_cache if it overlaps */ > + sdc = &s->seek_data_cache; > + if (sdc->valid && !(sdc->end < aiocb->aio_offset || > + sdc->start > aiocb->aio_offset + aiocb->aio_nbytes)) and <= and >= and if add same to handle_aiocb_write_zeroes(), then it worth to create helper function to invalidate cache. > + { > + sdc->valid = false; > + } > + > if (aiocb->aio_type & QEMU_AIO_BLKDEV) { > #ifdef BLKDISCARD > do { > @@ -2424,6 +2448,8 @@ static int coroutine_fn > raw_co_block_status(BlockDriverState *bs, > int64_t *map, > BlockDriverState **file) > { > + BDRVRawState *s = bs->opaque; > + struct seek_data_cache *sdc; > off_t data = 0, hole = 0; > int ret; > > @@ -2439,6 +2465,14 @@ static int coroutine_fn > raw_co_block_status(BlockDriverState *bs, > return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; > } > > + sdc = &s->seek_data_cache; > + if (sdc->valid && sdc->start <= offset && sdc->end > offset) { > + *pnum = MIN(bytes, sdc->end - offset); > + *map = offset; > + *file = bs; > + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; > + } > + > ret = find_allocation(bs, offset, &data, &hole); > if (ret == -ENXIO) { > /* Trailing hole */ > @@ -2451,14 +2485,27 @@ static int coroutine_fn > raw_co_block_status(BlockDriverState *bs, > } else if (data == offset) { > /* On a data extent, compute bytes to the end of the extent, > * possibly including a partial sector at EOF. */ > - *pnum = MIN(bytes, hole - offset); > + *pnum = hole - offset; hmm, why? At least you didn't mention it in commit-message.. > ret = BDRV_BLOCK_DATA; > } else { > /* On a hole, compute bytes to the beginning of the next extent. */ > assert(hole == offset); > - *pnum = MIN(bytes, data - offset); > + *pnum = data - offset; > ret = BDRV_BLOCK_ZERO; > } > + > + /* Caching allocated ranges is okay even if another process writes to the > + * same file because we allow declaring things allocated even if there > is a > + * hole. However, we cannot cache holes without risking corruption. */ > + if (ret == BDRV_BLOCK_DATA) { > + *sdc = (struct seek_data_cache) { > + .valid = true, > + .start = offset, > + .end = offset + *pnum, > + }; > + } > + > + *pnum = MIN(*pnum, bytes); > *map = offset; > *file = bs; > return ret | BDRV_BLOCK_OFFSET_VALID; > -- Best regards, Vladimir