On Thu, Jul 26, 2018 at 2:33 PM Kevin Wolf <kw...@redhat.com> wrote: > The BLKDISCARD ioctl doesn't guarantee that the discarded blocks read as > all-zero afterwards, so don't try to abuse it for zero writing. We try > to only use this if BLKDISCARDZEROES tells us that it is safe, but this > is unreliable on older kernels and a constant 0 in newer kernels. In > other words, this code path is never actually used with newer kernels, > so we don't even try to unmap while writing zeros. > > This patch removes the abuse of discard for writing zeroes from > file-posix and instead adds a new function that uses interfaces that are > actually meant to deallocate and zero out at the same time. Only if > those fail, it falls back to zeroing out without unmap. We never fall > back to a discard operation any more that may or may not result in > zeros. > > Signed-off-by: Kevin Wolf <kw...@redhat.com> > --- > block/file-posix.c | 62 > +++++++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 47 insertions(+), 15 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 60af4b3d51..9c66cd87d1 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -648,7 +648,7 @@ static int raw_open_common(BlockDriverState *bs, QDict > *options, > } > #endif > > - bs->supported_zero_flags = s->discard_zeroes ? BDRV_REQ_MAY_UNMAP : 0; > + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; > ret = 0; > fail: > if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { > @@ -1487,6 +1487,38 @@ static ssize_t > handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) > return -ENOTSUP; > } > > +static ssize_t handle_aiocb_write_zeroes_unmap(RawPosixAIOData *aiocb) > +{ > + BDRVRawState *s = aiocb->bs->opaque; > + int ret; > + > + /* First try to write zeros and unmap at the same time */ > + > +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE > + ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > + aiocb->aio_offset, aiocb->aio_nbytes); > + if (ret != -ENOTSUP) { >
This fails with ENODEV on RHEL 7.5 when fd is a block device. The manual says: ENODEV fd does not refer to a regular file or a directory. (If fd is a pipe or FIFO, a different error results.) Same issue exists in this patch: http://lists.gnu.org/archive/html/qemu-devel/2018-03/msg07194.html + return ret; > + } > +#endif > + > +#ifdef CONFIG_XFS > + if (s->is_xfs) { > + /* xfs_discard() guarantees that the discarded area reads as > all-zero > + * afterwards, so we can use it here. */ > + return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); > + } > +#endif > + > + /* Make the compiler happy if neither of the above is compiled in */ > + (void) s; > + > + /* If we couldn't manage to unmap while guaranteed that the area > reads as > + * all-zero afterwards, just write zeroes without unmapping */ > + ret = handle_aiocb_write_zeroes(aiocb); > + return ret; > +} > + > #ifndef HAVE_COPY_FILE_RANGE > static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > off_t *out_off, size_t len, unsigned int > flags) > @@ -1729,6 +1761,9 @@ static int aio_worker(void *arg) > case QEMU_AIO_WRITE_ZEROES: > ret = handle_aiocb_write_zeroes(aiocb); > break; > + case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD: > + ret = handle_aiocb_write_zeroes_unmap(aiocb); > + break; > case QEMU_AIO_COPY_RANGE: > ret = handle_aiocb_copy_range(aiocb); > break; > @@ -2553,15 +2588,13 @@ static int coroutine_fn raw_co_pwrite_zeroes( > int bytes, BdrvRequestFlags flags) > { > BDRVRawState *s = bs->opaque; > + int operation = QEMU_AIO_WRITE_ZEROES; > > - if (!(flags & BDRV_REQ_MAY_UNMAP)) { > - return paio_submit_co(bs, s->fd, offset, NULL, bytes, > - QEMU_AIO_WRITE_ZEROES); > - } else if (s->discard_zeroes) { > - return paio_submit_co(bs, s->fd, offset, NULL, bytes, > - QEMU_AIO_DISCARD); > + if (flags & BDRV_REQ_MAY_UNMAP) { > + operation |= QEMU_AIO_DISCARD; > } > - return -ENOTSUP; > + > + return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation); > } > > static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) > @@ -3054,20 +3087,19 @@ static coroutine_fn int > hdev_co_pwrite_zeroes(BlockDriverState *bs, > int64_t offset, int bytes, BdrvRequestFlags flags) > { > BDRVRawState *s = bs->opaque; > + int operation = QEMU_AIO_WRITE_ZEROES | QEMU_AIO_BLKDEV; > int rc; > > rc = fd_open(bs); > if (rc < 0) { > return rc; > } > - if (!(flags & BDRV_REQ_MAY_UNMAP)) { > - return paio_submit_co(bs, s->fd, offset, NULL, bytes, > - QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); > - } else if (s->discard_zeroes) { > - return paio_submit_co(bs, s->fd, offset, NULL, bytes, > - QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); > + > + if (flags & BDRV_REQ_MAY_UNMAP) { > + operation |= QEMU_AIO_DISCARD; > } > - return -ENOTSUP; > + > + return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation); > } > > static int coroutine_fn hdev_co_create_opts(const char *filename, > QemuOpts *opts, > -- > 2.13.6 > > >