nvme_dev_disable issues commands on the adminq to clear the HMB and
delete the IO cqs/sqs, and maybe more in the future. When the adminq
gives no response, we have to depend on the timeout path. However,
nvme_timeout has to invoke nvme_dev_disable before returning, so that
the DMA mappings can be released safely. This introduces a dangerous
circular dependency. Moreover, the whole of nvme_dev_disable runs
under shutdown_lock, even while waiting for the command, which makes
things worse.
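
The cycle, in outline (a sketch only; nvme_set_host_mem is assumed to
be the HMB-clearing admin-command caller here, and the set of admin
commands may grow):

    nvme_dev_disable()                  /* holds shutdown_lock */
        nvme_set_host_mem(dev, 0)       /* admin command: clear HMB */
        nvme_disable_io_queues(dev)     /* admin commands: delete cq/sqs */
            /* adminq gives no response -> only the timeout path can
               complete the expired commands */
            nvme_timeout()
                nvme_dev_disable()      /* must run before returning, but
                                           shutdown_lock is already held */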

To avoid this, this patch refactors nvme_timeout. The basic
principle is (see the sketch after this list):
 - When reset_work needs to be scheduled, hand the expired requests
   over to nvme_dev_disable. They will be completed after the
   controller is disabled/shutdown.
 - When a request from nvme_dev_disable or nvme_reset_work expires,
   disable the controller directly so that the request can be
   completed to wake up the waiter. nvme_pci_disable_ctrl_directly is
   introduced for this; it does not send commands on the adminq, and
   the shutdown_lock is not needed here, because nvme_abort_requests_sync
   in nvme_dev_disable can synchronize with nvme_timeout.
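
In outline, the handover works as below (a sketch only;
nvme_abort_requests_sync and nvme_flush_aborted_requests are the
helpers introduced by the earlier patches of this series):

    /* expired IO request that needs a reset: hand it over */
    nvme_timeout()
        set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
        nvme_reset_ctrl(&dev->ctrl);
        return BLK_EH_RESET_TIMER;      /* request stays outstanding */

    /* reset/shutdown side: complete the handed-over requests */
    nvme_reset_work()
        nvme_dev_disable()
            nvme_abort_requests_sync(&dev->ctrl); /* fences nvme_timeout */
            dev->inflight_flushed = true;
            ...                                   /* disable the controller */
            nvme_flush_aborted_requests(&dev->ctrl);
            dev->inflight_flushed = false;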

Signed-off-by: Jianchao Wang <jianchao.w.w...@oracle.com>
---
 drivers/nvme/host/pci.c | 199 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 134 insertions(+), 65 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e186158..ce09057 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -70,6 +70,7 @@ struct nvme_queue;
 
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -98,6 +99,7 @@ struct nvme_dev {
        u32 cmbloc;
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
+       bool inflight_flushed;
 
        /* shadow doorbell buffer support: */
        u32 *dbbuf_dbs;
@@ -1180,73 +1182,13 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
                         csts, result);
 }
 
-static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
+static enum blk_eh_timer_return nvme_pci_abort_io_req(struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_queue *nvmeq = iod->nvmeq;
        struct nvme_dev *dev = nvmeq->dev;
-       struct request *abort_req;
        struct nvme_command cmd;
-       u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
-       /*
-        * Reset immediately if the controller is failed
-        */
-       if (nvme_should_reset(dev, csts)) {
-               nvme_warn_reset(dev, csts);
-               nvme_dev_disable(dev, false);
-               nvme_reset_ctrl(&dev->ctrl);
-               return BLK_EH_HANDLED;
-       }
-
-       /*
-        * Did we miss an interrupt?
-        */
-       if (__nvme_poll(nvmeq, req->tag)) {
-               dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, completion polled\n",
-                        req->tag, nvmeq->qid);
-               return BLK_EH_HANDLED;
-       }
-
-       /*
-        * Shutdown immediately if controller times out while starting. The
-        * reset work will see the pci device disabled when it gets the forced
-        * cancellation error. All outstanding requests are completed on
-        * shutdown, so we return BLK_EH_HANDLED.
-        */
-       switch (dev->ctrl.state) {
-       case NVME_CTRL_CONNECTING:
-       case NVME_CTRL_RESETTING:
-               dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, disable controller\n",
-                        req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false);
-               set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
-               return BLK_EH_HANDLED;
-       default:
-               break;
-       }
-
-       /*
-        * Shutdown the controller immediately and schedule a reset if the
-        * command was already aborted once before and still hasn't been
-        * returned to the driver, or if this is the admin queue.
-        */
-       if (!nvmeq->qid || iod->aborted) {
-               dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, reset controller\n",
-                        req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false);
-               nvme_reset_ctrl(&dev->ctrl);
-
-               /*
-                * Mark the request as handled, since the inline shutdown
-                * forces all outstanding requests to complete.
-                */
-               set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
-               return BLK_EH_HANDLED;
-       }
+       struct request *abort_req;
 
        if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
                atomic_inc(&dev->ctrl.abort_limit);
@@ -1282,6 +1224,105 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
        return BLK_EH_RESET_TIMER;
 }
 
+static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = iod->nvmeq;
+       struct nvme_dev *dev = nvmeq->dev;
+       u32 csts = readl(dev->bar + NVME_REG_CSTS);
+       enum {ABORT, RESET, DISABLE} action;
+       enum blk_eh_timer_return ret;
+       /*
+        * This request was aborted by nvme_abort_req. It will be
+        * completed by nvme_flush_aborted_requests after the
+        * controller is disabled/shutdown.
+        */
+       if (test_bit(NVME_REQ_ABORTED, &nvme_req(req)->flags))
+               return BLK_EH_NOT_HANDLED;
+
+       /*
+        * Reset immediately if the controller is failed.
+        * Defer the completion to nvme_flush_aborted_requests.
+        */
+       if (nvme_should_reset(dev, csts)) {
+               nvme_warn_reset(dev, csts);
+               nvme_reset_ctrl(&dev->ctrl);
+               return BLK_EH_RESET_TIMER;
+       }
+
+       /*
+        * Did we miss an interrupt?
+        */
+       if (__nvme_poll(nvmeq, req->tag)) {
+               dev_warn(dev->ctrl.device,
+                        "I/O %d QID %d timeout, completion polled\n",
+                        req->tag, nvmeq->qid);
+               return BLK_EH_HANDLED;
+       }
+
+       if (nvmeq->qid) {
+               if (dev->ctrl.state == NVME_CTRL_RESETTING ||
+                               iod->aborted)
+                       action = RESET;
+               else
+                       action = ABORT;
+       } else {
+               /*
+                * Disable immediately if the controller times out while
+                * disabling/shutting down/starting. nvme_dev_disable and
+                * nvme_reset_work will see the error.
+                * Note: inflight_flushed is set in nvme_dev_disable when it
+                * aborts all the inflight requests. This flag is introduced
+                * because there is no state to represent the shutdown procedure.
+                */
+               if (dev->ctrl.state == NVME_CTRL_CONNECTING ||
+                       dev->inflight_flushed)
+                       action = DISABLE;
+               else
+                       action = RESET;
+       }
+
+       switch (action) {
+       case ABORT:
+               ret = nvme_pci_abort_io_req(req);
+               break;
+       case RESET:
+               dev_warn(dev->ctrl.device,
+                        "I/O %d QID %d timeout, reset controller\n",
+                        req->tag, nvmeq->qid);
+               set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
+               nvme_reset_ctrl(&dev->ctrl);
+               /*
+                * The reset work will take over this request. nvme_abort_req
+                * employs blk_abort_request to force the request to be timed
+                * out. So we need to return BLK_EH_RESET_TIMER so that
+                * RQF_MQ_TIMEOUT_EXPIRED can be cleared.
+                */
+               ret = BLK_EH_RESET_TIMER;
+               break;
+       case DISABLE:
+               /*
+                * We disable the controller directly here so that we can
+                * complete the request safely and wake up the nvme_dev_disable/
+                * nvme_reset_work that is waiting on the adminq. We cannot
+                * return BLK_EH_RESET_TIMER and depend on the error recovery
+                * procedure, because it is itself waiting on the timeout path.
+                */
+               dev_warn(dev->ctrl.device,
+                        "I/O %d QID %d timeout, disable controller\n",
+                        req->tag, nvmeq->qid);
+               nvme_pci_disable_ctrl_directly(dev);
+               set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
+               ret = BLK_EH_HANDLED;
+               break;
+       default:
+               WARN_ON(1);
+               break;
+       }
+
+       return ret;
+}
+
 static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
        dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
@@ -2169,6 +2210,33 @@ static void nvme_pci_disable(struct nvme_dev *dev)
        }
 }
 
+/*
+ * This is only invoked by nvme_timeout. shutdown_lock is not needed
+ * here because nvme_abort_requests_sync in nvme_dev_disable will
+ * synchronize with the timeout path.
+ */
+static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev)
+{
+       int i;
+       struct pci_dev *pdev = to_pci_dev(dev->dev);
+       bool dead = true;
+
+       if (pci_is_enabled(pdev)) {
+               u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+               dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
+                       pdev->error_state != pci_channel_io_normal);
+
+               if (!dead)
+                       nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+       }
+
+       for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
+               nvme_suspend_queue(&dev->queues[i]);
+
+       nvme_pci_disable(dev);
+}
+
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
        int i;
@@ -2205,6 +2273,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 
        }
        nvme_stop_queues(&dev->ctrl);
+       nvme_abort_requests_sync(&dev->ctrl);
+       dev->inflight_flushed = true;
 
        if (!dead) {
                nvme_disable_io_queues(dev);
@@ -2215,9 +2285,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 
        nvme_pci_disable(dev);
 
-       blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
-       blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
-
+       nvme_flush_aborted_requests(&dev->ctrl);
+       dev->inflight_flushed = false;
        /*
         * The driver will not be starting up queues again if shutting down so
         * must flush all entered requests to their failed completion to avoid
-- 
2.7.4
