On Wed, 11/12 20:27, Markus Armbruster wrote: > Commit 5500316 (May 2012) implemented raw_co_is_allocated() as > follows: > > 1. If defined(CONFIG_FIEMAP), use the FS_IOC_FIEMAP ioctl > > 2. Else if defined(SEEK_HOLE) && defined(SEEK_DATA), use lseek() > > 3. Else pretend there are no holes > > Later on, raw_co_is_allocated() was generalized to > raw_co_get_block_status(). > > Commit 4f11aa8 (May 2014) changed it to try the three methods in order > until success, because "there may be implementations which support > [SEEK_HOLE/SEEK_DATA] but not [FIEMAP] (e.g., NFSv4.2) as well as vice > versa." > > Unfortunately, we used FIEMAP incorrectly: we lacked FIEMAP_FLAG_SYNC. > Commit 38c4d0a (Sep 2014) added it. Because that's a significant > speed hit, the next commit 38c4d0a put SEEK_HOLE/SEEK_DATA first.
s/38c4d0a/7c159037/ > > As you see, the obvious use of FIEMAP is wrong, and the correct use is > slow. I guess this puts it somewhere between -7 "The obvious use is > wrong" and -10 "It's impossible to get right" on Rusty Russel's Hard > to Misuse scale[*]. Nice reading :) > > "Fortunately", the FIEMAP code is used only when > > * SEEK_HOLE/SEEK_DATA arent't defined, but CONFIG_FIEMAP is > > Uncommon. SEEK_HOLE had no XFS implementation between 2011 (when it > was introduced for ext4 and btrfs) and 2012. > > * SEEK_HOLE/SEEK_DATA and CONFIG_FIEMAP are defined, but lseek() fails > > Unlikely. > > Thus, the FIEMAP code executes rarely. Makes it a nice hidey-hole for > bugs. Worse, bugs hiding there can theoretically bite even on a host > that has SEEK_HOLE/SEEK_DATA. > > I don't want to worry about this crap, not even theoretically. Get > rid of it, then clean up the mess, including spotty error checking. > > [*] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html > > Signed-off-by: Markus Armbruster <arm...@redhat.com> > --- > block/raw-posix.c | 128 > ++++++++++++++++++++---------------------------------- > 1 file changed, 47 insertions(+), 81 deletions(-) > > diff --git a/block/raw-posix.c b/block/raw-posix.c > index 706d3c0..d16764c 100644 > --- a/block/raw-posix.c > +++ b/block/raw-posix.c > @@ -60,9 +60,6 @@ > #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ > #endif > #endif > -#ifdef CONFIG_FIEMAP > -#include <linux/fiemap.h> > -#endif > #ifdef CONFIG_FALLOCATE_PUNCH_HOLE > #include <linux/falloc.h> > #endif > @@ -1481,77 +1478,56 @@ out: > return result; > } > > -static int try_fiemap(BlockDriverState *bs, off_t start, off_t *data, > - off_t *hole, int nb_sectors) > +/* > + * Find allocation range in @bs around offset @start. > + * If @start is in a hole, store @start in @hole and the end of the > + * hole in @data. > + * If @start is in a data, store @start to @data, and the end of the > + * data to @hole. > + * If we can't find out, pretend there are no holes. > + */ > +static void find_allocation(BlockDriverState *bs, off_t start, > + off_t *data, off_t *hole) > { > -#ifdef CONFIG_FIEMAP > +#if defined(SEEK_DATA) && defined(SEEK_HOLE) > BDRVRawState *s = bs->opaque; > - int ret = 0; > - struct { > - struct fiemap fm; > - struct fiemap_extent fe; > - } f; > + off_t offs; > > - if (s->skip_fiemap) { > - return -ENOTSUP; > + offs = lseek(s->fd, start, SEEK_HOLE); > + if (offs < 0) { > + goto dunno; > } > + assert(offs >= start); > > - f.fm.fm_start = start; > - f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; > - f.fm.fm_flags = FIEMAP_FLAG_SYNC; > - f.fm.fm_extent_count = 1; > - f.fm.fm_reserved = 0; > - if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { > - s->skip_fiemap = true; > - return -errno; > - } > - > - if (f.fm.fm_mapped_extents == 0) { > - /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length. > - * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! > - */ > - off_t length = lseek(s->fd, 0, SEEK_END); > - *hole = f.fm.fm_start; > - *data = MIN(f.fm.fm_start + f.fm.fm_length, length); > - } else { > - *data = f.fe.fe_logical; > - *hole = f.fe.fe_logical + f.fe.fe_length; > - if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { > - ret |= BDRV_BLOCK_ZERO; > - } > - } > - > - return ret; > -#else > - return -ENOTSUP; > -#endif > -} > - > -static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, > - off_t *hole) > -{ > -#if defined SEEK_HOLE && defined SEEK_DATA > - BDRVRawState *s = bs->opaque; > - > - *hole = lseek(s->fd, start, SEEK_HOLE); > - if (*hole == -1) { > - return -errno; > - } > - > - if (*hole > start) { > + if (offs > start) { > + /* in data, next hole at offs */ > *data = start; > - } else { > - /* On a hole. We need another syscall to find its end. */ > - *data = lseek(s->fd, start, SEEK_DATA); > - if (*data == -1) { > - *data = lseek(s->fd, 0, SEEK_END); > - } > + *hole = offs; > + return; > } > > - return 0; > -#else > - return -ENOTSUP; > + /* in hole, end not yet known */ > + offs = lseek(s->fd, start, SEEK_DATA); > + if (offs < 0) { > + /* no idea where the hole ends, give up (unlikely to happen) */ > + goto dunno; > + } > + assert(offs >= start); > + *hole = start; > + *data = offs; > + return; > + > +dunno: > #endif > + /* assume all data */ > + offs = lseek(s->fd, 0, SEEK_END); > + if (offs < 0) { > + /* now that's *really* unexpected */ > + offs = (off_t)1 << (sizeof(off_t) * 8 - 1); > + offs += offs - 1; > + } > + *data = start; > + *hole = offs; > } > > /* > @@ -1591,28 +1567,18 @@ static int64_t coroutine_fn > raw_co_get_block_status(BlockDriverState *bs, > nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); > } > > - ret = try_seek_hole(bs, start, &data, &hole); > - if (ret < 0) { > - ret = try_fiemap(bs, start, &data, &hole, nb_sectors); > - if (ret < 0) { > - /* Assume everything is allocated. */ > - data = 0; > - hole = start + nb_sectors * BDRV_SECTOR_SIZE; > - ret = 0; > - } > - } > - > - assert(ret >= 0); > - > - if (data <= start) { > + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; > + find_allocation(bs, start, &data, &hole); > + if (data == start) { > /* On a data extent, compute sectors to the end of the extent. */ > *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); > - return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; > } else { > /* On a hole, compute sectors to the beginning of the next extent. > */ > + assert(hole == start); > *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); > - return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start; > + ret |= BDRV_BLOCK_ZERO; > } > + return ret; > } > > static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, > -- > 1.9.3 > > Other than the wrong commit id in message, Reviewed-by: Fam Zheng <f...@redhat.com>