There are a number of places in dax.c that look up the struct block_device
associated with an inode.  Previously this was done by just using
inode->i_sb->s_bdev.  This is correct in some cases, such as when using
ext2 and ext4.

However, for raw block devices and for XFS with a real-time device, the
value in inode->i_sb->s_bdev is not correct.  With the code as it is
currently written, an fsync or msync to a DAX-enabled raw block device will
cause a NULL pointer dereference kernel BUG.  For this to work correctly,
we need to ask the block device or filesystem what struct block_device is
appropriate for our inode.

To that end, add a get_bdev(struct inode *) entry point to struct
super_operations.  If this function pointer is non-NULL, DAX calls it to
look up the correct block_device.  If inode->i_sb->s_op->get_bdev is NULL,
DAX falls back to inode->i_sb->s_bdev.

I added the function to super_operations instead of another alternative
like inode_operations because the function pointer varies by filesystem or
block device, not per inode.  I believe that this will also save memory
because there is only one struct super_operations per mounted filesystem
but there could be many struct inode_operations and there is no need to
keep many copies of the same function pointer in memory.

Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 fs/block_dev.c     |  6 ++++++
 fs/dax.c           | 20 ++++++++++++++------
 fs/xfs/xfs_aops.c  |  2 +-
 fs/xfs/xfs_aops.h  |  1 +
 fs/xfs/xfs_super.c |  1 +
 include/linux/fs.h |  1 +
 6 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fa0507a..845b049 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -156,6 +156,11 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
        return 0;
 }
 
+static struct block_device *blkdev_get_bdev(struct inode *inode)
+{
+       return I_BDEV(inode);
+}
+
 static struct inode *bdev_file_inode(struct file *file)
 {
        return file->f_mapping->host;
@@ -569,6 +574,7 @@ static const struct super_operations bdev_sops = {
        .alloc_inode = bdev_alloc_inode,
        .destroy_inode = bdev_destroy_inode,
        .drop_inode = generic_delete_inode,
+       .get_bdev = blkdev_get_bdev,
        .evict_inode = bdev_evict_inode,
 };
 
diff --git a/fs/dax.c b/fs/dax.c
index 227974a..c701ea4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,14 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
+static struct block_device *dax_get_bdev(struct inode *inode)
+{
+       if (inode->i_sb->s_op->get_bdev)
+               return inode->i_sb->s_op->get_bdev(inode);
+       else
+               return inode->i_sb->s_bdev;
+}
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
        struct request_queue *q = bdev->bd_queue;
@@ -85,7 +93,7 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
  */
 int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
 {
-       struct block_device *bdev = inode->i_sb->s_bdev;
+       struct block_device *bdev = dax_get_bdev(inode);
        struct blk_dax_ctl dax = {
                .sector = block << (inode->i_blkbits - 9),
                .size = _size,
@@ -266,7 +274,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
        loff_t end = pos + iov_iter_count(iter);
 
        memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
+       bh.b_bdev = dax_get_bdev(inode);
 
        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
                struct address_space *mapping = inode->i_mapping;
@@ -488,7 +496,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
                loff_t end)
 {
        struct inode *inode = mapping->host;
-       struct block_device *bdev = inode->i_sb->s_bdev;
+       struct block_device *bdev = dax_get_bdev(inode);
        pgoff_t start_index, end_index, pmd_index;
        pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
@@ -628,7 +636,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
        memset(&bh, 0, sizeof(bh));
        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-       bh.b_bdev = inode->i_sb->s_bdev;
+       bh.b_bdev = dax_get_bdev(inode);
        bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -847,7 +855,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        }
 
        memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
+       bh.b_bdev = dax_get_bdev(inode);
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
        bh.b_size = PMD_SIZE;
@@ -1100,7 +1108,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
        BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
        memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
+       bh.b_bdev = dax_get_bdev(inode);
        bh.b_size = PAGE_CACHE_SIZE;
        err = get_block(inode, index, &bh, 0);
        if (err < 0)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 379c089..fc20518 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -55,7 +55,7 @@ xfs_count_page_state(
        } while ((bh = bh->b_this_page) != head);
 }
 
-STATIC struct block_device *
+struct block_device *
 xfs_find_bdev_for_inode(
        struct inode            *inode)
 {
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f6ffc9a..a4343c6 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,5 +62,6 @@ int   xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
                                 struct buffer_head *map_bh, int create);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59c9b7b..26e7051 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1623,6 +1623,7 @@ static const struct super_operations xfs_super_operations = {
        .destroy_inode          = xfs_fs_destroy_inode,
        .evict_inode            = xfs_fs_evict_inode,
        .drop_inode             = xfs_fs_drop_inode,
+       .get_bdev               = xfs_find_bdev_for_inode,
        .put_super              = xfs_fs_put_super,
        .sync_fs                = xfs_fs_sync_fs,
        .freeze_fs              = xfs_fs_freeze,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b10002d..5b636eb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1730,6 +1730,7 @@ struct super_operations {
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
+       struct block_device *(*get_bdev) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        int (*freeze_super) (struct super_block *);
-- 
2.5.0

Reply via email to