The patch fixes an issue where repeated EEH reports with a single error
on the bus of an Intel X710 4-port 10G Base-T adapter, in the MSI domain,
cause the devices to be permanently disabled.  It fully resets and
restarts the devices when handling the PCI EEH error.

Two new functions, i40e_io_suspend() and i40e_io_resume(), have been
introduced.  These functions were refactored from the existing
i40e_suspend() and i40e_resume() respectively.  This refactoring was
done due to concerns about the logic of the __I40E_SUSPENDED state,
which left the device unable to recover.  The functions are now used in
the EEH handling for device suspend/resume callbacks.

- In the PCI error detected callback, replaced i40e_prep_for_reset()
  with i40e_io_suspend(). This change is to fully suspend all I/O
  operations.
- In the PCI error slot reset callback, replaced pci_enable_device_mem()
  with pci_enable_device(). This change enables both the I/O and memory
  resources of the device.

- In the PCI error resume callback, replaced i40e_handle_reset_warning()
  with i40e_io_resume(). This change allows the system to resume I/O
  operations.


Signed-off-by: Thinh Tran <thin...@linux.ibm.com>
Tested-by: Robert Thomas <rob.tho...@ibm.com>

---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 29 ++++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 48b9ddb2b1b3..58418aa9231e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -54,6 +54,9 @@ static int i40e_get_capabilities(struct i40e_pf *pf,
                                 enum i40e_admin_queue_opc list_type);
 static bool i40e_is_total_port_shutdown_enabled(struct i40e_pf *pf);
 
+static int i40e_io_suspend(struct i40e_pf *pf);
+static int i40e_io_resume(struct i40e_pf *pf);
+
 /* i40e_pci_tbl - PCI Device ID Table
  *
  * Last entry must be all 0s
@@ -11138,6 +11141,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, 
bool reinit,
        ret = i40e_reset(pf);
        if (!ret)
                i40e_rebuild(pf, reinit, lock_acquired);
+       else
+               dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__);
 }
 
 /**
@@ -16327,7 +16332,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct 
pci_dev *pdev,
 
        /* shutdown all operations */
        if (!test_bit(__I40E_SUSPENDED, pf->state))
-               i40e_prep_for_reset(pf);
+               i40e_io_suspend(pf);
 
        /* Request a slot reset */
        return PCI_ERS_RESULT_NEED_RESET;
@@ -16349,7 +16354,8 @@ static pci_ers_result_t 
i40e_pci_error_slot_reset(struct pci_dev *pdev)
        u32 reg;
 
        dev_dbg(&pdev->dev, "%s\n", __func__);
-       if (pci_enable_device_mem(pdev)) {
+       /* enable I/O and memory of the device  */
+       if (pci_enable_device(pdev)) {
                dev_info(&pdev->dev,
                         "Cannot re-enable PCI device after reset.\n");
                result = PCI_ERS_RESULT_DISCONNECT;
@@ -16411,8 +16417,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev)
        dev_dbg(&pdev->dev, "%s\n", __func__);
        if (test_bit(__I40E_SUSPENDED, pf->state))
                return;
-
-       i40e_handle_reset_warning(pf, false);
+       i40e_io_resume(pf);
 }
 
 /**
@@ -16521,11 +16526,16 @@ static void i40e_shutdown(struct pci_dev *pdev)
 static int __maybe_unused i40e_suspend(struct device *dev)
 {
        struct i40e_pf *pf = dev_get_drvdata(dev);
-       struct i40e_hw *hw = &pf->hw;
 
        /* If we're already suspended, then there is nothing to do */
        if (test_and_set_bit(__I40E_SUSPENDED, pf->state))
                return 0;
+       return i40e_io_suspend(pf);
+}
+
+static int i40e_io_suspend(struct i40e_pf *pf)
+{
+       struct i40e_hw *hw = &pf->hw;
 
        set_bit(__I40E_DOWN, pf->state);
 
@@ -16572,11 +16582,16 @@ static int __maybe_unused i40e_suspend(struct device 
*dev)
 static int __maybe_unused i40e_resume(struct device *dev)
 {
        struct i40e_pf *pf = dev_get_drvdata(dev);
-       int err;
 
        /* If we're not suspended, then there is nothing to do */
        if (!test_bit(__I40E_SUSPENDED, pf->state))
                return 0;
+       return i40e_io_resume(pf);
+}
+
+static int i40e_io_resume(struct i40e_pf *pf)
+{
+       int err;
 
        /* We need to hold the RTNL lock prior to restoring interrupt schemes,
         * since we're going to be restoring queues
@@ -16588,7 +16603,7 @@ static int __maybe_unused i40e_resume(struct device 
*dev)
         */
        err = i40e_restore_interrupt_scheme(pf);
        if (err) {
-               dev_err(dev, "Cannot restore interrupt scheme: %d\n",
+               dev_err(&pf->pdev->dev, "Cannot restore interrupt scheme: %d\n",
                        err);
        }
 
-- 
2.39.3

Reply via email to