On Mon, Jan 13, 2025 at 04:48:31PM +1100, Alistair Popple wrote:
> On Sun, Jan 12, 2025 at 06:49:40PM -0800, Darrick J. Wong wrote:
> > On Mon, Jan 13, 2025 at 11:57:18AM +1100, Alistair Popple wrote:
> > > On Fri, Jan 10, 2025 at 08:50:19AM -0800, Darrick J. Wong wrote:
> > > > On Fri, Jan 10, 2025 at 05:00:35PM +1100, Alistair Popple wrote:
> > > > > File systems call dax_break_mapping() prior to reallocating file
> > > > > system blocks to ensure the page is not undergoing any DMA or other
> > > > > accesses. Generally this is needed when a file is truncated to ensure
> > > > > that if a block is reallocated nothing is writing to it. However
> > > > > filesystems currently don't call this when an FS DAX inode is evicted.
> > > > > 
> > > > > This can cause problems when the file system is unmounted as a page
> > > > > can continue to be under going DMA or other remote access after
> > > > > unmount. This means if the file system is remounted any truncate or
> > > > > other operation which requires the underlying file system block to be
> > > > > freed will not wait for the remote access to complete. Therefore a
> > > > > busy block may be reallocated to a new file leading to corruption.
> > > > > 
> > > > > Signed-off-by: Alistair Popple <apop...@nvidia.com>
> > > > > 
> > > > > ---
> > > > > 
> > > > > Changes for v5:
> > > > > 
> > > > >  - Don't wait for pages to be idle in non-DAX mappings
> > > > > ---
> > > > >  fs/dax.c            | 29 +++++++++++++++++++++++++++++
> > > > >  fs/ext4/inode.c     | 32 ++++++++++++++------------------
> > > > >  fs/xfs/xfs_inode.c  |  9 +++++++++
> > > > >  fs/xfs/xfs_inode.h  |  1 +
> > > > >  fs/xfs/xfs_super.c  | 18 ++++++++++++++++++
> > > > >  include/linux/dax.h |  2 ++
> > > > >  6 files changed, 73 insertions(+), 18 deletions(-)
> > > > > 
> > > > > diff --git a/fs/dax.c b/fs/dax.c
> > > > > index 7008a73..4e49cc4 100644
> > > > > --- a/fs/dax.c
> > > > > +++ b/fs/dax.c
> > > > > @@ -883,6 +883,14 @@ static int wait_page_idle(struct page *page,
> > > > >                               TASK_INTERRUPTIBLE, 0, 0, cb(inode));
> > > > >  }
> > > > >  
> > > > > +static void wait_page_idle_uninterruptible(struct page *page,
> > > > > +                                     void (cb)(struct inode *),
> > > > > +                                     struct inode *inode)
> > > > > +{
> > > > > +     ___wait_var_event(page, page_ref_count(page) == 1,
> > > > > +                     TASK_UNINTERRUPTIBLE, 0, 0, cb(inode));
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Unmaps the inode and waits for any DMA to complete prior to 
> > > > > deleting the
> > > > >   * DAX mapping entries for the range.
> > > > > @@ -911,6 +919,27 @@ int dax_break_mapping(struct inode *inode, 
> > > > > loff_t start, loff_t end,
> > > > >  }
> > > > >  EXPORT_SYMBOL_GPL(dax_break_mapping);
> > > > >  
> > > > > +void dax_break_mapping_uninterruptible(struct inode *inode,
> > > > > +                             void (cb)(struct inode *))
> > > > > +{
> > > > > +     struct page *page;
> > > > > +
> > > > > +     if (!dax_mapping(inode->i_mapping))
> > > > > +             return;
> > > > > +
> > > > > +     do {
> > > > > +             page = dax_layout_busy_page_range(inode->i_mapping, 0,
> > > > > +                                             LLONG_MAX);
> > > > > +             if (!page)
> > > > > +                     break;
> > > > > +
> > > > > +             wait_page_idle_uninterruptible(page, cb, inode);
> > > > > +     } while (true);
> > > > > +
> > > > > +     dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
> > > > > +}
> > > > > +EXPORT_SYMBOL_GPL(dax_break_mapping_uninterruptible);
> > > > > +
> > > > >  /*
> > > > >   * Invalidate DAX entry if it is clean.
> > > > >   */
> > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > > index ee8e83f..fa35161 100644
> > > > > --- a/fs/ext4/inode.c
> > > > > +++ b/fs/ext4/inode.c
> > > > > @@ -163,6 +163,18 @@ int ext4_inode_is_fast_symlink(struct inode 
> > > > > *inode)
> > > > >              (inode->i_size < EXT4_N_BLOCKS * 4);
> > > > >  }
> > > > >  
> > > > > +static void ext4_wait_dax_page(struct inode *inode)
> > > > > +{
> > > > > +     filemap_invalidate_unlock(inode->i_mapping);
> > > > > +     schedule();
> > > > > +     filemap_invalidate_lock(inode->i_mapping);
> > > > > +}
> > > > > +
> > > > > +int ext4_break_layouts(struct inode *inode)
> > > > > +{
> > > > > +     return dax_break_mapping_inode(inode, ext4_wait_dax_page);
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Called at the last iput() if i_nlink is zero.
> > > > >   */
> > > > > @@ -181,6 +193,8 @@ void ext4_evict_inode(struct inode *inode)
> > > > >  
> > > > >       trace_ext4_evict_inode(inode);
> > > > >  
> > > > > +     dax_break_mapping_uninterruptible(inode, ext4_wait_dax_page);
> > > > > +
> > > > >       if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
> > > > >               ext4_evict_ea_inode(inode);
> > > > >       if (inode->i_nlink) {
> > > > > @@ -3902,24 +3916,6 @@ int ext4_update_disksize_before_punch(struct 
> > > > > inode *inode, loff_t offset,
> > > > >       return ret;
> > > > >  }
> > > > >  
> > > > > -static void ext4_wait_dax_page(struct inode *inode)
> > > > > -{
> > > > > -     filemap_invalidate_unlock(inode->i_mapping);
> > > > > -     schedule();
> > > > > -     filemap_invalidate_lock(inode->i_mapping);
> > > > > -}
> > > > > -
> > > > > -int ext4_break_layouts(struct inode *inode)
> > > > > -{
> > > > > -     struct page *page;
> > > > > -     int error;
> > > > > -
> > > > > -     if 
> > > > > (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
> > > > > -             return -EINVAL;
> > > > > -
> > > > > -     return dax_break_mapping_inode(inode, ext4_wait_dax_page);
> > > > > -}
> > > > > -
> > > > >  /*
> > > > >   * ext4_punch_hole: punches a hole in a file by releasing the blocks
> > > > >   * associated with the given offset and length
> > > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > > index 4410b42..c7ec5ab 100644
> > > > > --- a/fs/xfs/xfs_inode.c
> > > > > +++ b/fs/xfs/xfs_inode.c
> > > > > @@ -2997,6 +2997,15 @@ xfs_break_dax_layouts(
> > > > >       return dax_break_mapping_inode(inode, xfs_wait_dax_page);
> > > > >  }
> > > > >  
> > > > > +void
> > > > > +xfs_break_dax_layouts_uninterruptible(
> > > > > +     struct inode            *inode)
> > > > > +{
> > > > > +     xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
> > > > > +
> > > > > +     dax_break_mapping_uninterruptible(inode, xfs_wait_dax_page);
> > > > > +}
> > > > > +
> > > > >  int
> > > > >  xfs_break_layouts(
> > > > >       struct inode            *inode,
> > > > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > > > index c4f03f6..613797a 100644
> > > > > --- a/fs/xfs/xfs_inode.h
> > > > > +++ b/fs/xfs/xfs_inode.h
> > > > > @@ -594,6 +594,7 @@ xfs_itruncate_extents(
> > > > >  }
> > > > >  
> > > > >  int  xfs_break_dax_layouts(struct inode *inode);
> > > > > +void xfs_break_dax_layouts_uninterruptible(struct inode *inode);
> > > > >  int  xfs_break_layouts(struct inode *inode, uint *iolock,
> > > > >               enum layout_break_reason reason);
> > > > >  
> > > > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > > > index 8524b9d..73ec060 100644
> > > > > --- a/fs/xfs/xfs_super.c
> > > > > +++ b/fs/xfs/xfs_super.c
> > > > > @@ -751,6 +751,23 @@ xfs_fs_drop_inode(
> > > > >       return generic_drop_inode(inode);
> > > > >  }
> > > > >  
> > > > > +STATIC void
> > > > > +xfs_fs_evict_inode(
> > > > > +     struct inode            *inode)
> > > > > +{
> > > > > +     struct xfs_inode        *ip = XFS_I(inode);
> > > > > +     uint                    iolock = XFS_IOLOCK_EXCL | 
> > > > > XFS_MMAPLOCK_EXCL;
> > > > > +
> > > > > +     if (IS_DAX(inode)) {
> > > > > +             xfs_ilock(ip, iolock);
> > > > > +             xfs_break_dax_layouts_uninterruptible(inode);
> > > > > +             xfs_iunlock(ip, iolock);
> > > > 
> > > > If we're evicting the inode, why is it necessary to take i_rwsem and the
> > > > mmap invalidation lock?  Shouldn't the evicting thread be the only one
> > > > with access to this inode?
> > > 
> > > Hmm, good point. I think you're right. I can easily stop taking
> > > XFS_IOLOCK_EXCL. Not taking XFS_MMAPLOCK_EXCL is slightly more difficult 
> > > because
> > > xfs_wait_dax_page() expects it to be taken. Do you think it is worth 
> > > creating a
> > > separate callback (xfs_wait_dax_page_unlocked()?) specifically for this 
> > > path or
> > > would you be happy with a comment explaining why we take the 
> > > XFS_MMAPLOCK_EXCL
> > > lock here?
> > 
> > There shouldn't be any other threads removing "pages" from i_mapping
> > during eviction, right?  If so, I think you can just call schedule()
> > directly from dax_break_mapping_uninterruptble.
> 
> Oh right, and I guess you are saying the same would apply to ext4 so no need 
> to
> cycle the filemap lock there either, which I've just noticed is buggy anyway. 
> So
> I can just remove the callback entirely for dax_break_mapping_uninterruptible.

Right.  You might want to rename dax_break_layouts_uninterruptible to
make it clearer that it's for evictions and doesn't go through the
mmap invalidation lock.

> > (dax mappings aren't allowed supposed to persist beyond unmount /
> > eviction, just like regular pagecache, right??)
> 
> Right they're not *supposed* to, but until at least this patch is applied they
> can ;-)

Yikes!

--D

>  - Alistair
> 
> > --D
> > 
> > >  - Alistair
> > > 
> > > > --D
> > > > 
> > > > > +     }
> > > > > +
> > > > > +     truncate_inode_pages_final(&inode->i_data);
> > > > > +     clear_inode(inode);
> > > > > +}
> > > > > +
> > > > >  static void
> > > > >  xfs_mount_free(
> > > > >       struct xfs_mount        *mp)
> > > > > @@ -1189,6 +1206,7 @@ static const struct super_operations 
> > > > > xfs_super_operations = {
> > > > >       .destroy_inode          = xfs_fs_destroy_inode,
> > > > >       .dirty_inode            = xfs_fs_dirty_inode,
> > > > >       .drop_inode             = xfs_fs_drop_inode,
> > > > > +     .evict_inode            = xfs_fs_evict_inode,
> > > > >       .put_super              = xfs_fs_put_super,
> > > > >       .sync_fs                = xfs_fs_sync_fs,
> > > > >       .freeze_fs              = xfs_fs_freeze,
> > > > > diff --git a/include/linux/dax.h b/include/linux/dax.h
> > > > > index ef9e02c..7c3773f 100644
> > > > > --- a/include/linux/dax.h
> > > > > +++ b/include/linux/dax.h
> > > > > @@ -274,6 +274,8 @@ static inline int __must_check 
> > > > > dax_break_mapping_inode(struct inode *inode,
> > > > >  {
> > > > >       return dax_break_mapping(inode, 0, LLONG_MAX, cb);
> > > > >  }
> > > > > +void dax_break_mapping_uninterruptible(struct inode *inode,
> > > > > +                             void (cb)(struct inode *));
> > > > >  int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > > >                                 struct inode *dest, loff_t destoff,
> > > > >                                 loff_t len, bool *is_same,
> > > > > -- 
> > > > > git-series 0.9.1
> > > > > 
> > > 
> 

Reply via email to