Currently, the complicated relationship between nvme_dev_disable
and nvme_timeout has become a devil that will introduce many
circular patterns which may trigger deadlock or IO hang. Let's
enumerate the tangles between them:
 - nvme_timeout has to invoke nvme_dev_disable to stop the
   controller doing DMA access before freeing the request.
 - nvme_dev_disable has to depend on nvme_timeout to complete
   adminq requests to set HMB or delete sq/cq when the controller
   has no response.
 - nvme_dev_disable will race with nvme_timeout when it cancels
   the previous outstanding requests.
The 2nd case is necessary. This patch breaks up the 1st and 3rd cases.

To achieve this:
Use blk_abort_request to force all the previous outstanding requests
expired in nvme_dev_disable. Set NVME_REQ_CANCELLED and return
BLK_EH_NOT_HANDLED in nvme_timeout. Then the request will not be
completed and freed. We needn't invoke nvme_dev_disable any more.

We use NVME_REQ_CANCELLED to identify them. After the controller is
totally disabled/shutdown, we invoke blk_mq_rq_update_aborted_gstate
to clear requests and invoke blk_mq_complete_request to complete them.
In addition, to identify the previous adminq/IOq request and adminq
requests from nvme_dev_disable, we introduce
NVME_PCI_OUTSTANDING_GRABBING and NVME_PCI_OUTSTANDING_GRABBED to
let nvme_timeout be able to distinguish them.

For the adminq requests from nvme_dev_disable/nvme_reset_work:
invoke nvme_disable_ctrl directly, then set NVME_REQ_CANCELLED and
return BLK_EH_HANDLED. nvme_dev_disable/nvme_reset_work will
see the error.

With this patch, we can avoid having nvme_dev_disable invoked
by nvme_timeout and implement synchronization between them.

Suggested-by: Keith Busch <keith.bu...@intel.com>
(If IO requests timeout in RECONNECTING state, fail them and kill the
 controller)
Signed-off-by: Jianchao Wang <jianchao.w.w...@oracle.com>
---
 drivers/nvme/host/pci.c | 192 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 157 insertions(+), 35 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f3e0eae..a0ff18e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -70,6 +70,9 @@ struct nvme_queue;
 
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+static void nvme_pci_free_and_disable(struct nvme_dev *dev);
+#define NVME_PCI_OUTSTANDING_GRABBING 1
+#define NVME_PCI_OUTSTANDING_GRABBED 2
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -80,6 +83,7 @@ struct nvme_dev {
        struct blk_mq_tag_set admin_tagset;
        u32 __iomem *dbs;
        struct device *dev;
+       int grab_flag;
        struct dma_pool *prp_page_pool;
        struct dma_pool *prp_small_pool;
        unsigned online_queues;
@@ -1130,6 +1134,24 @@ static void abort_endio(struct request *req, 
blk_status_t error)
        blk_mq_free_request(req);
 }
 
+static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev)
+{
+       struct pci_dev *pdev = to_pci_dev(dev->dev);
+       u32 csts;
+       bool dead;
+
+       if (!pci_is_enabled(pdev))
+               return;
+
+       csts = readl(dev->bar + NVME_REG_CSTS);
+       dead = !!((csts & NVME_CSTS_CFS) ||
+                       !(csts & NVME_CSTS_RDY) ||
+                       pdev->error_state  != pci_channel_io_normal);
+       if (!dead)
+               nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+       nvme_pci_free_and_disable(dev);
+}
+
 static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
 {
 
@@ -1191,12 +1213,13 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
 
        /*
         * Reset immediately if the controller is failed
+        * nvme_dev_disable will take over the expired requests.
         */
        if (nvme_should_reset(dev, csts)) {
+               nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                nvme_warn_reset(dev, csts);
-               nvme_dev_disable(dev, false);
                nvme_reset_ctrl(&dev->ctrl);
-               return BLK_EH_HANDLED;
+               return BLK_EH_NOT_HANDLED;
        }
 
        /*
@@ -1210,38 +1233,59 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
        }
 
        /*
-        * Shutdown immediately if controller times out while starting. The
-        * reset work will see the pci device disabled when it gets the forced
-        * cancellation error. All outstanding requests are completed on
-        * shutdown, so we return BLK_EH_HANDLED.
+        * The previous outstanding requests on adminq and ioq have been
+        * grabbed by nvme_dev_disable, so it must be admin request from
+        * the nvme_dev_disable.
         */
-       if (dev->ctrl.state == NVME_CTRL_RESETTING) {
-               dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, disable controller\n",
-                        req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false);
+       if ((READ_ONCE(dev->grab_flag) == NVME_PCI_OUTSTANDING_GRABBED)) {
+               nvme_pci_disable_ctrl_directly(dev);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                return BLK_EH_HANDLED;
        }
 
        /*
-        * Shutdown the controller immediately and schedule a reset if the
-        * command was already aborted once before and still hasn't been
-        * returned to the driver, or if this is the admin queue.
+        * nvme_dev_disable is grabbing all the outstanding requests.
+        * Just set NVME_REQ_CANCELLED and return BLK_EH_NOT_HANDLED.
+        * The nvme_dev_disable will take over all the things.
+        */
+       if (READ_ONCE(dev->grab_flag) == NVME_PCI_OUTSTANDING_GRABBING) {
+               nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+               return BLK_EH_NOT_HANDLED;
+       }
+
+       /*
+        * IO requests could time out during RECONNECTING state. This is
+        * due to the wait freeze in nvme_reset_work. Plus the adminq request
+        * timeout, don't try to requeue them and change state to DELETING to
+        * prevent reset work from moving forward.
+        */
+       if (dev->ctrl.state == NVME_CTRL_RECONNECTING) {
+               nvme_pci_disable_ctrl_directly(dev);
+               nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+               nvme_req(req)->status |= NVME_SC_DNR;
+               nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+               return BLK_EH_HANDLED;
+       }
+       /*
+        * If the controller is DELETING, there is no need to do any recovery.
+        */
+       if (dev->ctrl.state == NVME_CTRL_DELETING) {
+               nvme_pci_disable_ctrl_directly(dev);
+               nvme_req(req)->status = NVME_SC_ABORT_REQ;
+               nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+               return BLK_EH_HANDLED;
+       }
+       /*
+        * Just set NVME_REQ_CANCELLED and return BLK_EH_NOT_HANDLED,
+        * The nvme_dev_disable will take over all the things.
         */
        if (!nvmeq->qid || iod->aborted) {
                dev_warn(dev->ctrl.device,
                         "I/O %d QID %d timeout, reset controller\n",
                         req->tag, nvmeq->qid);
-               nvme_dev_disable(dev, false);
                nvme_reset_ctrl(&dev->ctrl);
-
-               /*
-                * Mark the request as handled, since the inline shutdown
-                * forces all outstanding requests to complete.
-                */
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-               return BLK_EH_HANDLED;
+               return BLK_EH_NOT_HANDLED;
        }
 
        if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2149,25 +2193,97 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
        pci_release_mem_regions(to_pci_dev(dev->dev));
 }
 
-static void nvme_pci_disable(struct nvme_dev *dev)
+static void nvme_pci_abort_rq(struct request *req, void *data, bool reserved)
 {
+       if (!blk_mq_request_started(req))
+               return;
+
+       dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
+                               "Abort I/O %d", req->tag);
+
+       nvme_req(req)->status = NVME_SC_ABORT_REQ;
+       blk_abort_request(req);
+}
+
+/*
+ * nvme_pci_grab_outstanding and shutdown_lock ensure there will be
+ * only one nvme_pci_free_and_disable running at any moment.
+ */
+static void nvme_pci_free_and_disable(struct nvme_dev *dev)
+{
+       int onlines, i;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+       if (!pci_is_enabled(pdev))
+               return;
+
+       onlines = dev->online_queues;
+       for (i = onlines - 1; i >= 0; i--)
+               nvme_suspend_queue(&dev->queues[i]);
+
        nvme_release_cmb(dev);
        pci_free_irq_vectors(pdev);
 
-       if (pci_is_enabled(pdev)) {
-               pci_disable_pcie_error_reporting(pdev);
-               pci_disable_device(pdev);
-       }
+       pci_disable_pcie_error_reporting(pdev);
+       pci_disable_device(pdev);
+}
+
+/*
+ * Now the controller has been disabled, no interrupt will race with us,
+ * so it is safe to complete them
+ *  - IO requests will be requeued.
+ *  - adminq requests will be failed, the tags will also be freed.
+ */
+static void nvme_pci_cancel_rq(struct request *req, void *data, bool reserved)
+{
+       if (!blk_mq_request_started(req))
+               return;
+
+       dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
+                               "Cancel I/O %d", req->tag);
+
+       /* controller has been disabled/shutdown, it is safe here to clear
+        * the aborted_gstate and complete request.
+        */
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               blk_mq_rq_update_aborted_gstate(req, 0);
+       blk_mq_complete_request(req);
+}
+
+
+static void nvme_pci_sync_timeout_work(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       /* ensure the timeout_work is queued */
+       kblockd_schedule_work(&ctrl->admin_q->timeout_work);
+       flush_work(&ctrl->admin_q->timeout_work);
+
+       down_read(&ctrl->namespaces_rwsem);
+       list_for_each_entry(ns, &ctrl->namespaces, list)
+               kblockd_schedule_work(&ns->queue->timeout_work);
+
+       list_for_each_entry(ns, &ctrl->namespaces, list)
+               flush_work(&ns->queue->timeout_work);
+       up_read(&ctrl->namespaces_rwsem);
+}
+
+static void nvme_pci_grab_outstanding(struct nvme_dev *dev)
+{
+       blk_mq_tagset_busy_iter(&dev->tagset, nvme_pci_abort_rq, &dev->ctrl);
+       blk_mq_tagset_busy_iter(&dev->admin_tagset,
+                       nvme_pci_abort_rq, &dev->ctrl);
+       nvme_pci_sync_timeout_work(&dev->ctrl);
+       /* All the outstanding requests have been grabbed. They are forced to be
+        * expired, neither irq completion path nor timeout path could take them
+        * away.
+        */
 }
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
-       int i;
        bool dead = true;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
-       int onlines;
 
        mutex_lock(&dev->shutdown_lock);
        if (pci_is_enabled(pdev)) {
@@ -2192,6 +2308,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool 
shutdown)
 
        nvme_stop_queues(&dev->ctrl);
 
+       WRITE_ONCE(dev->grab_flag, NVME_PCI_OUTSTANDING_GRABBING);
+       nvme_pci_grab_outstanding(dev);
+       WRITE_ONCE(dev->grab_flag, NVME_PCI_OUTSTANDING_GRABBED);
+       /*
+        * All the previous outstanding requests have been grabbed and
+        * nvme_timeout path is guaranteed to be drained.
+        */
+
        if (!dead) {
                /*
                 * If the controller is still alive tell it to stop using the
@@ -2205,18 +2329,16 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool 
shutdown)
                nvme_disable_admin_queue(dev, shutdown);
        }
 
-       onlines = dev->online_queues;
-       for (i = onlines - 1; i >= 0; i--)
-               nvme_suspend_queue(&dev->queues[i]);
-
        if (dev->ctrl.admin_q)
                blk_mq_quiesce_queue(dev->ctrl.admin_q);
 
-       nvme_pci_disable(dev);
+       nvme_pci_free_and_disable(dev);
 
-       blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
-       blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, 
&dev->ctrl);
+       blk_mq_tagset_busy_iter(&dev->tagset, nvme_pci_cancel_rq, &dev->ctrl);
+       blk_mq_tagset_busy_iter(&dev->admin_tagset,
+                       nvme_pci_cancel_rq, &dev->ctrl);
 
+       WRITE_ONCE(dev->grab_flag, 0);
        /*
         * For shutdown case, controller will not be setup again soon. If any
         * residual requests here, the controller must have go wrong. Drain and
-- 
2.7.4

Reply via email to