On 3/23/26 5:38 PM, John Groves wrote:
> From: John Groves <[email protected]>
> 
> The new fsdev driver provides pages/folios initialized compatibly with
> fsdax - normal rather than devdax-style refcounting, and starting out
> with order-0 folios.
> 
> When fsdev binds to a daxdev, it is usually (always?) switching from the
> devdax mode (device.c), which pre-initializes compound folios according
> to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the
> folios into a fsdax-compatible state.
> 
> A side effect of this is that raw mmap doesn't (can't?) work on an fsdev
> dax instance. Accordingly, the fsdev driver does not provide raw mmap -
> devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw
> mmap capability.
> 
> This commit contains just the framework, which remaps pages/folios
> compatibly with fsdax.
> 
> Enabling dax changes:
> 
> - bus.h: add DAXDRV_FSDEV_TYPE driver type
> - bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs
> - dax.h: prototype inode_dax(), which fsdev needs
> 
> Suggested-by: Dan Williams <[email protected]>
> Suggested-by: Gregory Price <[email protected]>
> Signed-off-by: John Groves <[email protected]>

Reviewed-by: Dave Jiang <[email protected]>


> ---
>  MAINTAINERS          |   8 ++
>  drivers/dax/Kconfig  |  11 ++
>  drivers/dax/Makefile |   2 +
>  drivers/dax/bus.c    |   4 +
>  drivers/dax/bus.h    |   1 +
>  drivers/dax/fsdev.c  | 245 +++++++++++++++++++++++++++++++++++++++++++
>  fs/dax.c             |   1 +
>  7 files changed, 272 insertions(+)
>  create mode 100644 drivers/dax/fsdev.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7d10988cbc62..eedf4cce56ed 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -7298,6 +7298,14 @@ L:     [email protected]
>  S:   Supported
>  F:   drivers/dax/
>  
> +DEVICE DIRECT ACCESS (DAX) [fsdev_dax]
> +M:   John Groves <[email protected]>
> +M:   John Groves <[email protected]>
> +L:   [email protected]
> +L:   [email protected]
> +S:   Supported
> +F:   drivers/dax/fsdev.c
> +
>  DEVICE FREQUENCY (DEVFREQ)
>  M:   MyungJoo Ham <[email protected]>
>  M:   Kyungmin Park <[email protected]>
> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> index d656e4c0eb84..7051b70980d5 100644
> --- a/drivers/dax/Kconfig
> +++ b/drivers/dax/Kconfig
> @@ -61,6 +61,17 @@ config DEV_DAX_HMEM_DEVICES
>       depends on DEV_DAX_HMEM && DAX
>       def_bool y
>  
> +config DEV_DAX_FSDEV
> +     tristate "FSDEV DAX: fs-dax compatible devdax driver"
> +     depends on DEV_DAX && FS_DAX
> +     help
> +       Support fs-dax access to DAX devices via a character device
> +       interface. Unlike device_dax (which pre-initializes compound folios
> +       based on device alignment), this driver leaves folios at order-0 so
> +       that fs-dax filesystems can manage folio order dynamically.
> +
> +       Say M if unsure.
> +
>  config DEV_DAX_KMEM
>       tristate "KMEM DAX: map dax-devices as System-RAM"
>       default DEV_DAX
> diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
> index 5ed5c39857c8..ba35bda7abef 100644
> --- a/drivers/dax/Makefile
> +++ b/drivers/dax/Makefile
> @@ -4,11 +4,13 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o
>  obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
>  obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
>  obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o
> +obj-$(CONFIG_DEV_DAX_FSDEV) += fsdev_dax.o
>  
>  dax-y := super.o
>  dax-y += bus.o
>  device_dax-y := device.o
>  dax_pmem-y := pmem.o
>  dax_cxl-y := cxl.o
> +fsdev_dax-y := fsdev.o
>  
>  obj-y += hmem/
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index e4bd5c9f006c..562e2b06f61a 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -81,6 +81,10 @@ static int dax_match_type(const struct dax_device_driver *dax_drv, struct device
>           !IS_ENABLED(CONFIG_DEV_DAX_KMEM))
>               return 1;
>  
> +     /* fsdev driver can also bind to device-type dax devices */
> +     if (dax_drv->type == DAXDRV_FSDEV_TYPE && type == DAXDRV_DEVICE_TYPE)
> +             return 1;
> +
>       return 0;
>  }
>  
> diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
> index cbbf64443098..880bdf7e72d7 100644
> --- a/drivers/dax/bus.h
> +++ b/drivers/dax/bus.h
> @@ -31,6 +31,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
>  enum dax_driver_type {
>       DAXDRV_KMEM_TYPE,
>       DAXDRV_DEVICE_TYPE,
> +     DAXDRV_FSDEV_TYPE,
>  };
>  
>  struct dax_device_driver {
> diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c
> new file mode 100644
> index 000000000000..8b5c6976ad17
> --- /dev/null
> +++ b/drivers/dax/fsdev.c
> @@ -0,0 +1,245 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2026 Micron Technology, Inc. */
> +#include <linux/memremap.h>
> +#include <linux/pagemap.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/cdev.h>
> +#include <linux/slab.h>
> +#include <linux/dax.h>
> +#include <linux/uio.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include "dax-private.h"
> +#include "bus.h"
> +
> +/*
> + * FS-DAX compatible devdax driver
> + *
> + * Unlike drivers/dax/device.c which pre-initializes compound folios based
> + * on device alignment (via vmemmap_shift), this driver leaves folios
> + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs
> + * to work without needing special handling for pre-initialized folios.
> + *
> + * Key differences from device.c:
> + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC)
> + * - vmemmap_shift is NOT set (folios remain order-0)
> + * - fs-dax can dynamically create compound folios as needed
> + * - No mmap support - all access is through fs-dax/iomap
> + */
> +
> +static void fsdev_cdev_del(void *cdev)
> +{
> +     cdev_del(cdev);
> +}
> +
> +static void fsdev_kill(void *dev_dax)
> +{
> +     kill_dev_dax(dev_dax);
> +}
> +
> +/*
> + * Page map operations for FS-DAX mode
> + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c
> + *
> + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX.
> + * The core mm code in free_zone_device_folio() handles the wake_up_var()
> + * directly for this memory type.
> + */
> +static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap,
> +             unsigned long pfn, unsigned long nr_pages, int mf_flags)
> +{
> +     struct dev_dax *dev_dax = pgmap->owner;
> +     u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start;
> +     u64 len = nr_pages << PAGE_SHIFT;
> +
> +     return dax_holder_notify_failure(dev_dax->dax_dev, offset,
> +                                      len, mf_flags);
> +}
> +
> +static const struct dev_pagemap_ops fsdev_pagemap_ops = {
> +     .memory_failure         = fsdev_pagemap_memory_failure,
> +};
> +
> +/*
> + * Clear any stale folio state from pages in the given range.
> + * This is necessary because device_dax pre-initializes compound folios
> + * based on vmemmap_shift, and that state may persist after driver unbind.
> + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
> + * expects to find clean order-0 folios that it can build into compound
> + * folios on demand.
> + *
> + * At probe time, no filesystem should be mounted yet, so all mappings
> + * are stale and must be cleared along with compound state.
> + */
> +static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
> +{
> +     for (int i = 0; i < dev_dax->nr_range; i++) {
> +             struct range *range = &dev_dax->ranges[i].range;
> +             unsigned long pfn = PHYS_PFN(range->start);
> +             unsigned long end_pfn = PHYS_PFN(range->end) + 1;
> +
> +             while (pfn < end_pfn) {
> +                     struct folio *folio = pfn_folio(pfn);
> +                     int order = dax_folio_reset_order(folio);
> +
> +                     pfn += 1UL << order;
> +             }
> +     }
> +}
> +
> +static void fsdev_clear_folio_state_action(void *data)
> +{
> +     fsdev_clear_folio_state(data);
> +}
> +
> +static int fsdev_open(struct inode *inode, struct file *filp)
> +{
> +     struct dax_device *dax_dev = inode_dax(inode);
> +     struct dev_dax *dev_dax = dax_get_private(dax_dev);
> +
> +     filp->private_data = dev_dax;
> +
> +     return 0;
> +}
> +
> +static int fsdev_release(struct inode *inode, struct file *filp)
> +{
> +     return 0;
> +}
> +
> +static const struct file_operations fsdev_fops = {
> +     .llseek = noop_llseek,
> +     .owner = THIS_MODULE,
> +     .open = fsdev_open,
> +     .release = fsdev_release,
> +};
> +
> +static int fsdev_dax_probe(struct dev_dax *dev_dax)
> +{
> +     struct dax_device *dax_dev = dev_dax->dax_dev;
> +     struct device *dev = &dev_dax->dev;
> +     struct dev_pagemap *pgmap;
> +     struct inode *inode;
> +     struct cdev *cdev;
> +     void *addr;
> +     int rc, i;
> +
> +     if (static_dev_dax(dev_dax)) {
> +             if (dev_dax->nr_range > 1) {
> +                     dev_warn(dev, "static pgmap / multi-range device 
> conflict\n");
> +                     return -EINVAL;
> +             }
> +
> +             pgmap = dev_dax->pgmap;
> +     } else {
> +             size_t pgmap_size;
> +
> +             if (dev_dax->pgmap) {
> +                     dev_warn(dev, "dynamic-dax with pre-populated page 
> map\n");
> +                     return -EINVAL;
> +             }
> +
> +             pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1);
> +             pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL);
> +             if (!pgmap)
> +                     return -ENOMEM;
> +
> +             pgmap->nr_range = dev_dax->nr_range;
> +             dev_dax->pgmap = pgmap;
> +
> +             for (i = 0; i < dev_dax->nr_range; i++) {
> +                     struct range *range = &dev_dax->ranges[i].range;
> +
> +                     pgmap->ranges[i] = *range;
> +             }
> +     }
> +
> +     for (i = 0; i < dev_dax->nr_range; i++) {
> +             struct range *range = &dev_dax->ranges[i].range;
> +
> +             if (!devm_request_mem_region(dev, range->start,
> +                                     range_len(range), dev_name(dev))) {
> +                     dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve 
> range\n",
> +                              i, range->start, range->end);
> +                     return -EBUSY;
> +             }
> +     }
> +
> +     /*
> +      * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving
> +      * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this
> +      * lets fs-dax dynamically build compound folios as needed, similar
> +      * to pmem behavior.
> +      */
> +     pgmap->type = MEMORY_DEVICE_FS_DAX;
> +     pgmap->ops = &fsdev_pagemap_ops;
> +     pgmap->owner = dev_dax;
> +
> +     addr = devm_memremap_pages(dev, pgmap);
> +     if (IS_ERR(addr))
> +             return PTR_ERR(addr);
> +
> +     /*
> +      * Clear any stale compound folio state left over from a previous
> +      * driver (e.g., device_dax with vmemmap_shift). Also register this
> +      * as a devm action so folio state is cleared on unbind, ensuring
> +      * clean pages for subsequent drivers (e.g., kmem for system-ram).
> +      */
> +     fsdev_clear_folio_state(dev_dax);
> +     rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action,
> +                                   dev_dax);
> +     if (rc)
> +             return rc;
> +
> +     /* Detect whether the data is at a non-zero offset into the memory */
> +     if (pgmap->range.start != dev_dax->ranges[0].range.start) {
> +             u64 phys = dev_dax->ranges[0].range.start;
> +             u64 pgmap_phys = dev_dax->pgmap[0].range.start;
> +             u64 data_offset = 0;
> +
> +             if (!WARN_ON(pgmap_phys > phys))
> +                     data_offset = phys - pgmap_phys;
> +
> +             pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx 
> offset=%llx\n",
> +                    __func__, phys, pgmap_phys, data_offset);
> +     }
> +
> +     inode = dax_inode(dax_dev);
> +     cdev = inode->i_cdev;
> +     cdev_init(cdev, &fsdev_fops);
> +     cdev->owner = dev->driver->owner;
> +     cdev_set_parent(cdev, &dev->kobj);
> +     rc = cdev_add(cdev, dev->devt, 1);
> +     if (rc)
> +             return rc;
> +
> +     rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
> +     if (rc)
> +             return rc;
> +
> +     run_dax(dax_dev);
> +     return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
> +}
> +
> +static struct dax_device_driver fsdev_dax_driver = {
> +     .probe = fsdev_dax_probe,
> +     .type = DAXDRV_FSDEV_TYPE,
> +};
> +
> +static int __init dax_init(void)
> +{
> +     return dax_driver_register(&fsdev_dax_driver);
> +}
> +
> +static void __exit dax_exit(void)
> +{
> +     dax_driver_unregister(&fsdev_dax_driver);
> +}
> +
> +MODULE_AUTHOR("John Groves");
> +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
> +MODULE_LICENSE("GPL");
> +module_init(dax_init);
> +module_exit(dax_exit);
> +MODULE_ALIAS_DAX_DEVICE(0);
> diff --git a/fs/dax.c b/fs/dax.c
> index eba86802a7a7..b91a2535149a 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -430,6 +430,7 @@ int dax_folio_reset_order(struct folio *folio)
>  
>       return order;
>  }
> +EXPORT_SYMBOL_GPL(dax_folio_reset_order);
>  
>  static inline unsigned long dax_folio_put(struct folio *folio)
>  {


Reply via email to