On Thu, 19 Mar 2026 01:14:59 +0000
Smita Koralahalli <[email protected]> wrote:
The current probe time ownership check for Soft Reserved memory based
solely on CXL window intersection is insufficient. dax_hmem probing is not
always guaranteed to run after CXL enumeration and region assembly, which
can lead to incorrect ownership decisions before the CXL stack has
finished publishing windows and assembling committed regions.
Introduce deferred ownership handling for Soft Reserved ranges that
intersect CXL windows. When such a range is encountered during the
initial dax_hmem probe, schedule deferred work to wait for the CXL stack
to complete enumeration and region assembly before deciding ownership.
Once the deferred work runs, evaluate each Soft Reserved range
individually: if a CXL region fully contains the range, skip it and let
dax_cxl bind. Otherwise, register it with dax_hmem. This per-range
ownership model avoids the need for CXL region teardown and
alloc_dax_region() resource exclusion prevents double claiming.
Introduce a boolean flag dax_hmem_initial_probe to live inside device.c
so it survives module reload. Ensure dax_cxl defers driver registration
until dax_hmem has completed ownership resolution. dax_cxl calls
dax_hmem_flush_work() before cxl_driver_register(), which both waits for
the deferred work to complete and creates a module symbol dependency that
forces dax_hmem.ko to load before dax_cxl.
Co-developed-by: Dan Williams <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
Signed-off-by: Smita Koralahalli <[email protected]>
Hi Smita,
I think this is very likely to be what is causing the bug Alison
saw in cxl_test.
It looks to be possible to flush work before the work structure has
been configured. Even though it's not on a work queue and there is
nothing to do, there are early sanity checks that fail giving the warning
Alison reported.
A couple of ways to fix that inline. I'd be tempted to both initialize
the function statically and gate against flushing if the whole thing isn't
set up yet.
Jonathan
---
drivers/dax/bus.h | 7 +++++
drivers/dax/cxl.c | 1 +
drivers/dax/hmem/device.c | 3 ++
drivers/dax/hmem/hmem.c | 66 +++++++++++++++++++++++++++++++++++++--
4 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index cbbf64443098..ebbfe2d6da14 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -49,6 +49,13 @@ void dax_driver_unregister(struct dax_device_driver
*dax_drv);
void kill_dev_dax(struct dev_dax *dev_dax);
bool static_dev_dax(struct dev_dax *dev_dax);
+#if IS_ENABLED(CONFIG_DEV_DAX_HMEM)
+extern bool dax_hmem_initial_probe;
+void dax_hmem_flush_work(void);
+#else
+static inline void dax_hmem_flush_work(void) { }
+#endif
+
#define MODULE_ALIAS_DAX_DEVICE(type) \
MODULE_ALIAS("dax:t" __stringify(type) "*")
#define DAX_DEVICE_MODALIAS_FMT "dax:t%d"
diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index a2136adfa186..3ab39b77843d 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -44,6 +44,7 @@ static struct cxl_driver cxl_dax_region_driver = {
static void cxl_dax_region_driver_register(struct work_struct *work)
{
+ dax_hmem_flush_work();
cxl_driver_register(&cxl_dax_region_driver);
}
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index 56e3cbd181b5..991a4bf7d969 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -8,6 +8,9 @@
static bool nohmem;
module_param_named(disable, nohmem, bool, 0444);
+bool dax_hmem_initial_probe;
+EXPORT_SYMBOL_GPL(dax_hmem_initial_probe);
+
static bool platform_initialized;
static DEFINE_MUTEX(hmem_resource_lock);
static struct resource hmem_active = {
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 1e3424358490..8c574123bd3b 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -3,6 +3,7 @@
#include <linux/memregion.h>
#include <linux/module.h>
#include <linux/dax.h>
+#include <cxl/cxl.h>
#include "../bus.h"
static bool region_idle;
@@ -58,6 +59,19 @@ static void release_hmem(void *pdev)
platform_device_unregister(pdev);
}
+struct dax_defer_work {
+ struct platform_device *pdev;
+ struct work_struct work;
+};
+
+static struct dax_defer_work dax_hmem_work;
static struct dax_defer_work dax_hmem_work = {
.work = __WORK_INITIALIZER(&dax_hmem_work.work,
process_defer_work),
};
or something similar.
+
+void dax_hmem_flush_work(void)
+{
+ flush_work(&dax_hmem_work.work);
+}
+EXPORT_SYMBOL_GPL(dax_hmem_flush_work);
+
static int hmem_register_device(struct device *host, int target_nid,
const struct resource *res)
{
@@ -69,8 +83,11 @@ static int hmem_register_device(struct device *host, int
target_nid,
if (IS_ENABLED(CONFIG_DEV_DAX_CXL) &&
region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
IORES_DESC_CXL) != REGION_DISJOINT) {
- dev_dbg(host, "deferring range to CXL: %pr\n", res);
- return 0;
+ if (!dax_hmem_initial_probe) {
+ dev_dbg(host, "deferring range to CXL: %pr\n", res);
+ queue_work(system_long_wq, &dax_hmem_work.work);
+ return 0;
+ }
}
rc = region_intersects_soft_reserve(res->start, resource_size(res));
@@ -123,8 +140,48 @@ static int hmem_register_device(struct device *host, int
target_nid,
return rc;
}
+static int hmem_register_cxl_device(struct device *host, int target_nid,
+ const struct resource *res)
+{
+ if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM,
+ IORES_DESC_CXL) == REGION_DISJOINT)
+ return 0;
+
+ if (cxl_region_contains_resource((struct resource *)res)) {
+ dev_dbg(host, "CXL claims resource, dropping: %pr\n", res);
+ return 0;
+ }
+
+ dev_dbg(host, "CXL did not claim resource, registering: %pr\n", res);
+ return hmem_register_device(host, target_nid, res);
+}
+
+static void process_defer_work(struct work_struct *w)
+{
+ struct dax_defer_work *work = container_of(w, typeof(*work), work);
+ struct platform_device *pdev = work->pdev;
If you do the suggested __INITIALIZE_WORK() then I'd add
a paranoid
if (!work->pdev)
return;
We don't actually queue the work before pdev is set, but that might
be obvious once we spilt up assigning the function and the data
it uses.
+
+ wait_for_device_probe();
+
+ guard(device)(&pdev->dev);
+ if (!pdev->dev.driver)
+ return;
+
+ dax_hmem_initial_probe = true;
+ walk_hmem_resources(&pdev->dev, hmem_register_cxl_device);
+}
+
static int dax_hmem_platform_probe(struct platform_device *pdev)
{
+ if (work_pending(&dax_hmem_work.work))
+ return -EBUSY;
+
+ if (!dax_hmem_work.pdev) {
+ get_device(&pdev->dev);
+ dax_hmem_work.pdev = pdev;
Using the pdev rather than dev breaks the pattern of doing a get_device()
and assigning in one line. This is a bit ugly.
dax_hmem_work.pdev = to_pci_dev(get_device(&pdev->dev));
but perhaps makes the association tighter than current code.
+ INIT_WORK(&dax_hmem_work.work, process_defer_work);
See above. I think assigning the work function should be static
which should resolve the issue Alison was seeing as then it should
be fine to call flush_work() on the item that isn't on a work queue
yet but is initialized.
+ }
+
return walk_hmem_resources(&pdev->dev, hmem_register_device);
}
@@ -162,6 +219,11 @@ static __init int dax_hmem_init(void)
static __exit void dax_hmem_exit(void)
{
+ flush_work(&dax_hmem_work.work);
I think this needs to be under the if (dax_hmem_work.pdev)
Not sure there is any guarantee dax_hmem_platform_probe() has run
before we get here otherwise. Alternative is to assign
the work function statically.
+
+ if (dax_hmem_work.pdev)
+ put_device(&dax_hmem_work.pdev->dev);
+
platform_driver_unregister(&dax_hmem_driver);
platform_driver_unregister(&dax_hmem_platform_driver);
}