This patch adds support for the case where a reset and the loading or
unloading of the driver occur simultaneously.

Signed-off-by: Wei Hu (Xavier) <xavier.hu...@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  21 ++++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 151 ++++++++++++++++++++++++++--
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |   7 ++
 3 files changed, 169 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
b/drivers/infiniband/hw/hns/hns_roce_device.h
index 9518615..d0d03a6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -217,6 +217,26 @@ enum {
        HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4
 };
 
+enum hns_roce_reset_stage {
+       HNS_ROCE_STATE_NON_RST,
+       HNS_ROCE_STATE_RST_BEF_DOWN,
+       HNS_ROCE_STATE_RST_DOWN,
+       HNS_ROCE_STATE_RST_UNINIT,
+       HNS_ROCE_STATE_RST_INIT,
+       HNS_ROCE_STATE_RST_INITED,
+};
+
+enum hns_roce_instance_state {
+       HNS_ROCE_STATE_NON_INIT,
+       HNS_ROCE_STATE_INIT,
+       HNS_ROCE_STATE_INITED,
+       HNS_ROCE_STATE_UNINIT,
+};
+
+enum {
+       HNS_ROCE_RST_DIRECT_RETURN              = 0,
+};
+
 #define HNS_ROCE_CMD_SUCCESS                   1
 
 #define HNS_ROCE_PORT_DOWN                     0
@@ -932,6 +952,7 @@ struct hns_roce_dev {
        spinlock_t              bt_cmd_lock;
        bool                    active;
        bool                    is_reset;
+       unsigned long           reset_cnt;
        struct hns_roce_ib_iboe iboe;
 
        struct list_head        pgdir_list;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c 
b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 84b0245..896dd59 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5933,6 +5933,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
 static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
                                  struct hnae3_handle *handle)
 {
+       struct hns_roce_v2_priv *priv = hr_dev->priv;
        const struct pci_device_id *id;
        int i;
 
@@ -5963,10 +5964,13 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev 
*hr_dev,
        hr_dev->cmd_mod = 1;
        hr_dev->loop_idc = 0;
 
+       hr_dev->reset_cnt = handle->ae_algo->ops->ae_dev_reset_cnt(handle);
+       priv->handle = handle;
+
        return 0;
 }
 
-static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
+static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 {
        struct hns_roce_dev *hr_dev;
        int ret;
@@ -5983,7 +5987,6 @@ static int hns_roce_hw_v2_init_instance(struct 
hnae3_handle *handle)
 
        hr_dev->pci_dev = handle->pdev;
        hr_dev->dev = &handle->pdev->dev;
-       handle->priv = hr_dev;
 
        ret = hns_roce_hw_v2_get_cfg(hr_dev, handle);
        if (ret) {
@@ -5997,6 +6000,8 @@ static int hns_roce_hw_v2_init_instance(struct 
hnae3_handle *handle)
                goto error_failed_get_cfg;
        }
 
+       handle->priv = hr_dev;
+
        return 0;
 
 error_failed_get_cfg:
@@ -6008,7 +6013,7 @@ static int hns_roce_hw_v2_init_instance(struct 
hnae3_handle *handle)
        return ret;
 }
 
-static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
                                           bool reset)
 {
        struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
@@ -6016,24 +6021,132 @@ static void hns_roce_hw_v2_uninit_instance(struct 
hnae3_handle *handle,
        if (!hr_dev)
                return;
 
+       handle->priv = NULL;
        hns_roce_exit(hr_dev);
        kfree(hr_dev->priv);
        ib_dealloc_device(&hr_dev->ib_dev);
 }
 
+static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
+{
+       const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+       struct hns_roce_dev *hr_dev;
+       unsigned long end;
+       int ret;
+
+       handle->rinfo.instance_state = HNS_ROCE_STATE_INIT;
+
+       if (ops->ae_dev_resetting(handle) || ops->get_hw_reset_stat(handle)) {
+               handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+               goto head_chk_err;
+       }
+
+       ret = __hns_roce_hw_v2_init_instance(handle);
+       if (ret) {
+               handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+               dev_err(&handle->pdev->dev,
+                       "RoCE instance init failed! ret = %d\n", ret);
+               if (ops->ae_dev_resetting(handle) ||
+                   ops->get_hw_reset_stat(handle))
+                       goto head_chk_err;
+               else
+                       return ret;
+       }
+
+       handle->rinfo.instance_state = HNS_ROCE_STATE_INITED;
+
+       hr_dev = (struct hns_roce_dev *)handle->priv;
+       if (ops->ae_dev_resetting(handle) || ops->get_hw_reset_stat(handle) ||
+           hr_dev->reset_cnt != ops->ae_dev_reset_cnt(handle)) {
+               handle->rinfo.instance_state = HNS_ROCE_STATE_INIT;
+               goto tail_chk_err;
+       }
+
+       return 0;
+
+tail_chk_err:
+       /* Wait until software reset process finished, in order to ensure that
+        * reset process and this function will not call
+        * __hns_roce_hw_v2_uninit_instance at the same time.
+        * If a timeout occurs, it indicates that the network subsystem has
+        * encountered a serious error and cannot be recovered from the reset
+        * processing.
+        */
+       end = msecs_to_jiffies(HNS_ROCE_V2_RST_PRC_MAX_TIME) + jiffies;
+       while (ops->ae_dev_resetting(handle) && time_before(jiffies, end))
+               msleep(20);
+
+       if (!ops->ae_dev_resetting(handle))
+               dev_warn(&handle->pdev->dev, "Device completed reset.\n");
+       else
+               dev_warn(&handle->pdev->dev,
+                        "Device is still resetting! timeout!\n");
+
+       __hns_roce_hw_v2_uninit_instance(handle, false);
+       handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+
+head_chk_err:
+       dev_err(&handle->pdev->dev, "Device is busy in resetting state.\n"
+                                   "please retry later.\n");
+
+       return -EBUSY;
+}
+
+static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+                                          bool reset)
+{
+       const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+       unsigned long end;
+
+       if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+               return;
+
+       handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
+
+       /* Check the status of the current software reset process, if in
+        * software reset process, wait until software reset process finished,
+        * in order to ensure that reset process and this function will not call
+        * __hns_roce_hw_v2_uninit_instance at the same time.
+        * If a timeout occurs, it indicates that the network subsystem has
+        * encountered a serious error and cannot be recovered from the reset
+        * processing.
+        */
+       if (ops->ae_dev_resetting(handle)) {
+               dev_warn(&handle->pdev->dev,
+                        "Device is busy in resetting state. waiting.\n");
+               end = msecs_to_jiffies(HNS_ROCE_V2_RST_PRC_MAX_TIME) + jiffies;
+               while (ops->ae_dev_resetting(handle) &&
+                      time_before(jiffies, end))
+                       msleep(20);
+
+               if (!ops->ae_dev_resetting(handle))
+                       dev_warn(&handle->pdev->dev,
+                                "Device completed reset.\n");
+               else
+                       dev_warn(&handle->pdev->dev,
+                                "Device is still resetting! timeout!\n");
+       }
+
+       __hns_roce_hw_v2_uninit_instance(handle, reset);
+
+       handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+}
 static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
 {
        struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
        struct ib_event event;
 
-       if (!hr_dev) {
-               dev_err(&handle->pdev->dev,
-                       "Input parameter handle->priv is NULL!\n");
-               return -EINVAL;
+       if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
+               set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+               return 0;
        }
 
+       handle->rinfo.reset_state = HNS_ROCE_STATE_RST_DOWN;
+       clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+       if (!hr_dev)
+               return 0;
+
        hr_dev->active = false;
-       hr_dev->is_reset = true;
 
        event.event = IB_EVENT_DEVICE_FATAL;
        event.device = &hr_dev->ib_dev;
@@ -6047,7 +6160,16 @@ static int hns_roce_hw_v2_reset_notify_init(struct 
hnae3_handle *handle)
 {
        int ret;
 
-       ret = hns_roce_hw_v2_init_instance(handle);
+       if (test_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state)) {
+               clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+               handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+               return 0;
+       }
+
+       handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INIT;
+
+       dev_info(&handle->pdev->dev, "In reset process RoCE client reinit.\n");
+       ret = __hns_roce_hw_v2_init_instance(handle);
        if (ret) {
                /* when reset notify type is HNAE3_INIT_CLIENT In reset notify
                 * callback function, RoCE Engine reinitialize. If RoCE reinit
@@ -6056,6 +6178,10 @@ static int hns_roce_hw_v2_reset_notify_init(struct 
hnae3_handle *handle)
                handle->priv = NULL;
                dev_err(&handle->pdev->dev,
                        "In reset process RoCE reinit failed %d.\n", ret);
+       } else {
+               handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+               dev_info(&handle->pdev->dev,
+                        "Reset done, RoCE client reinit finished.\n");
        }
 
        return ret;
@@ -6063,8 +6189,13 @@ static int hns_roce_hw_v2_reset_notify_init(struct 
hnae3_handle *handle)
 
 static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
 {
+       if (test_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state))
+               return 0;
+
+       handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
        msleep(100);
-       hns_roce_hw_v2_uninit_instance(handle, false);
+       __hns_roce_hw_v2_uninit_instance(handle, false);
+
        return 0;
 }
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h 
b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 1ad6bf1..2857669 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -95,6 +95,12 @@
 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE       2
 #define HNS_ROCE_V2_RSV_QPS                    8
 
+/* Longest time, in milliseconds, to wait for the software reset process in
+ * the NIC subsystem. If a timeout occurs, it indicates that the network
+ * subsystem has hit a serious error and cannot recover from the reset.
+ */
+#define HNS_ROCE_V2_RST_PRC_MAX_TIME           300000
+
 #define HNS_ROCE_CONTEXT_HOP_NUM               1
 #define HNS_ROCE_SCC_CTX_HOP_NUM               1
 #define HNS_ROCE_MTT_HOP_NUM                   1
@@ -1594,6 +1600,7 @@ struct hns_roce_link_table_entry {
 #define HNS_ROCE_LINK_TABLE_NXT_PTR_M GENMASK(31, 20)
 
 struct hns_roce_v2_priv {
+       struct hnae3_handle *handle;
        struct hns_roce_v2_cmq cmq;
        struct hns_roce_link_table tsq;
        struct hns_roce_link_table tpq;
-- 
1.9.1

Reply via email to