A DC_ADD_CAPACITY event can span multiple event records grouped together
by the CXL_DCD_EVENT_MORE flag. Extents are staged in the pending list until
the last event record ('More'=0) is received, at which point the pending
list is processed. If the device opens such a chain (More=1) but never
sends the closing record, the staged list sits indefinitely.

Add a delayed-work watchdog that, on expiry, refuses the chain with an
empty ADD_DC_RESPONSE and drops the staged list. The 20s timeout is a
conservative upper bound and may be tightened later. The timeout is purely
defensive — the spec does not require it, but it prevents issues from a
lost mailbox response or a crashed fabric manager.

The watchdog bounds how long a chain may stall, but a device could still
defeat it by streaming More=1 records faster than the timeout, growing the
staged list without bound. Also cap a runtime chain at
CXL_DC_MAX_PENDING_EXTENTS and refuse it once exceeded; existing-extent
recovery is bounded separately by the device's reported extent count.

Signed-off-by: Anisa Su <[email protected]>
---
Changes:
1. mbox.c: Fix comment in handle_add_event(), before closing the 'More'
   chain and disabling the watchdog. The comment incorrectly claimed
   handle_add_event() runs in system_wq.
2. mbox.c: Drop unnecessary initialization of add_ctx.armed=false in
   cxl_memdev_state_create(), as allocated memory is already zeroed.
3. mbox.c: assert add_ctx.lock is held in add_to_pending_list(); it
   serializes access to add_ctx.pending_extents.
4. mbox.c: cap a runtime More=1 chain at CXL_DC_MAX_PENDING_EXTENTS in
   handle_add_event() so a buggy device cannot grow the staged list
   without bound (the watchdog bounds time, not memory).
--- drivers/cxl/core/mbox.c | 98 ++++++++++++++++++++++++++++++++++++++++- drivers/cxl/cxlmem.h | 24 ++++++++-- 2 files changed, 117 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 7dd40fb8d613..4e887b5cdc3e 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1208,15 +1208,78 @@ static void clear_pending_extents(void *_mds) list_for_each_entry_safe(pos, tmp, &mds->add_ctx.pending_extents, list) delete_extent_node(pos); + mds->add_ctx.nr_pending = 0; mds->add_ctx.group = NULL; } +/* + * Defensive cap on extents staged in one runtime More=1 chain: a buggy + * device could otherwise grow the list without bound. Not spec-defined. + */ +#define CXL_DC_MAX_PENDING_EXTENTS 100 + +/* + * Bound on how long the host will wait for a device to finish a + * multi-record DC_ADD_CAPACITY chain (More=1 ... More=0) before + * refusing the chain. + * The timeout is not defined in the spec, but added for defensive purposes. + * Since there is no spec-defined timeout, 20s is chosen as a generous + * upper bound and matches the GPF timeout. + */ +#define CXL_DC_ADD_TIMEOUT (20 * HZ) + +static void cxl_dc_add_timeout(struct work_struct *work) +{ + struct pending_add_ctx *ctx = container_of(to_delayed_work(work), + struct pending_add_ctx, + timeout_work); + struct cxl_memdev_state *mds = container_of(ctx, + struct cxl_memdev_state, + add_ctx); + struct device *dev = mds->cxlds.dev; + + guard(mutex)(&ctx->lock); + + /* + * handle_add_event() cancels this work non-synchronously (a sync + * cancel would deadlock on @ctx->lock, which the chain-close path + * holds), so a callback that already started running can reach here + * after its chain has moved on. 
Abort only if a chain is still armed + * AND the timer has not been re-armed since this expiry fired: a fresh + * mod_delayed_work() (a later extent in this chain, or a new chain) + * makes delayed_work_pending() true, meaning this expiry belongs to a + * superseded deadline and must not abort the current chain. + */ + if (!ctx->armed || delayed_work_pending(&ctx->timeout_work)) + return; + + dev_warn(dev, "DC add chain timed out; refusing staged extents\n"); + + if (cxl_send_dc_response(mds, CXL_MBOX_OP_ADD_DC_RESPONSE, + &ctx->pending_extents, 0)) + dev_dbg(dev, "Failed to send empty ADD_DC_RESPONSE on timeout\n"); + + clear_pending_extents(mds); + ctx->armed = false; +} + +static void cxl_cancel_dcd_add_chain_work(void *_mds) +{ + struct cxl_memdev_state *mds = _mds; + + cancel_delayed_work_sync(&mds->add_ctx.timeout_work); +} + static int add_to_pending_list(struct list_head *pending_list, struct cxl_extent *to_add) { + struct pending_add_ctx *ctx = + container_of(pending_list, struct pending_add_ctx, pending_extents); struct cxl_extent_list_node *node = kzalloc(sizeof(*node), GFP_KERNEL); struct cxl_extent *extent; + lockdep_assert_held(&ctx->lock); + if (!node) return -ENOMEM; extent = kmemdup(to_add, sizeof(*extent), GFP_KERNEL); @@ -1227,6 +1290,7 @@ static int add_to_pending_list(struct list_head *pending_list, node->extent = extent; list_add_tail(&node->list, pending_list); + ctx->nr_pending++; return 0; } @@ -1239,10 +1303,20 @@ static int add_to_pending_list(struct list_head *pending_list, static int handle_add_event(struct cxl_memdev_state *mds, struct cxl_event_dcd *event) { + struct pending_add_ctx *ctx = &mds->add_ctx; struct device *dev = mds->cxlds.dev; int rc; - rc = add_to_pending_list(&mds->add_ctx.pending_extents, &event->extent); + guard(mutex)(&ctx->lock); + + if (ctx->nr_pending >= CXL_DC_MAX_PENDING_EXTENTS) { + dev_warn(dev, "DC add chain exceeds %u extents; dropping (firmware bug)\n", + CXL_DC_MAX_PENDING_EXTENTS); + 
clear_pending_extents(mds); + return -ENOSPC; + } + + rc = add_to_pending_list(&ctx->pending_extents, &event->extent); if (rc) { clear_pending_extents(mds); return rc; @@ -1250,9 +1324,19 @@ static int handle_add_event(struct cxl_memdev_state *mds, if (event->flags & CXL_DCD_EVENT_MORE) { dev_dbg(dev, "more bit set; delay the surfacing of extent\n"); + mod_delayed_work(system_wq, &ctx->timeout_work, + CXL_DC_ADD_TIMEOUT); + ctx->armed = true; return 0; } + /* + * Chain is closing. Disarm before flushing so a pending watchdog + * (queued but blocked on @ctx->lock) sees !armed and bails out. + */ + ctx->armed = false; + cancel_delayed_work(&ctx->timeout_work); + rc = cxl_send_dc_response(mds, CXL_MBOX_OP_ADD_DC_RESPONSE, &mds->add_ctx.pending_extents, 0); clear_pending_extents(mds); @@ -2036,11 +2120,23 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, mutex_init(&mds->event.log_lock); INIT_LIST_HEAD(&mds->add_ctx.pending_extents); + mutex_init(&mds->add_ctx.lock); + INIT_DELAYED_WORK(&mds->add_ctx.timeout_work, + cxl_dc_add_timeout); rc = devm_add_action_or_reset(dev, clear_pending_extents, mds); if (rc) return ERR_PTR(rc); + /* + * Registered after clear_pending_extents so devm's reverse-order + * unwind cancels (and waits for) the watchdog first, then the list + * cleanup runs with the watchdog guaranteed not to refire. 
+ */ + rc = devm_add_action_or_reset(dev, cxl_cancel_dcd_add_chain_work, mds); + if (rc) + return ERR_PTR(rc); + rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); if (rc == -EOPNOTSUPP) dev_warn(dev, "CXL MCE unsupported\n"); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 4ffa7bd1e5f1..81498d47f309 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -8,6 +8,8 @@ #include <linux/uuid.h> #include <linux/node.h> #include <linux/list.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> #include <cxl/event.h> #include <cxl/mailbox.h> #include "cxl.h" @@ -407,19 +409,33 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) /** * struct pending_add_ctx - Staging state for an in-progress - * DCD_ADD_CAPACITY event chain + * DCD_ADD_CAPACITY event chain * @pending_extents: extents received so far in the chain; flushed when - * the chain closes (More=0) + * the chain closes (More=0) * @group: tag group being assembled from the chain + * @timeout_work: watchdog that fires if a chain is opened with + * CXL_DCD_EVENT_MORE but the closing record never arrives + * @lock: serialises updates to the chain state against the watchdog + * @armed: set when a More=1 chain opens; cleared when the chain closes, + * either by a More=0 event record or by the watchdog firing. * * A DCD_ADD_CAPACITY notification can span multiple event records * stitched together by the CXL_DCD_EVENT_MORE flag. Records are staged - * here until the device clears More, at which point the staged batch is - * processed and responded to as a single Add_DC_Response. + * here until an event record with 'More'=0 is received, at which point the + * staged batch is processed and responded to as a single Add_DC_Response. + * + * If a chain is opened (More=1) but the device never sends the closing + * record, the staged list would otherwise sit indefinitely. 
@timeout_work + * is a defensive watchdog that refuses such a chain with an empty response + * and drops the staged list. */ struct pending_add_ctx { struct list_head pending_extents; struct cxl_dc_tag_group *group; + struct delayed_work timeout_work; + struct mutex lock; + unsigned int nr_pending; + bool armed; }; /** -- 2.43.0

