The patch fixes an issue when repeated EEH reports with a single error on the bus of Intel X710 4-port 10G Base-T adapter, in the MSI domain causing the devices to be permanently disabled. It fully resets and restart the devices when handling the PCI EEH error.
Two new functions, i40e_io_suspend() and i40e_io_resume(), have been introduced. These functions were refactor from the existing i40e_suspend() and i40e_resume() respectively. This refactoring was done due to concerns about the logic of the I40E_SUSPENSED state, which caused the device not able to recover. The functios are now used in the EEH handling for device suspend/resume callbacks. - In the PCI error detected callback, replaced i40e_prep_for_reset() with i40e_io_suspend(). The chance is to fully suspend all I/O operations - In the PCI error slot reset callback, replaced pci_enable_device_mem() with pci_enable_device(). This change enables both I/O and memory of the device. - In the PCI error resume callback, replace i40e_handle_reset_warning() with i40e_io_resume(). This change allows the system to resume I/O operations Signed-off-by: Thinh Tran <thin...@linux.ibm.com> Tested-by: Robert Thomas <rob.tho...@ibm.com> --- drivers/net/ethernet/intel/i40e/i40e_main.c | 29 ++++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 48b9ddb2b1b3..58418aa9231e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -54,6 +54,9 @@ static int i40e_get_capabilities(struct i40e_pf *pf, enum i40e_admin_queue_opc list_type); static bool i40e_is_total_port_shutdown_enabled(struct i40e_pf *pf); +static int i40e_io_suspend(struct i40e_pf *pf); +static int i40e_io_resume(struct i40e_pf *pf); + /* i40e_pci_tbl - PCI Device ID Table * * Last entry must be all 0s @@ -11138,6 +11141,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, ret = i40e_reset(pf); if (!ret) i40e_rebuild(pf, reinit, lock_acquired); + else + dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__); } /** @@ -16327,7 +16332,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, /* shutdown all operations */ if (!test_bit(__I40E_SUSPENDED, pf->state)) - i40e_prep_for_reset(pf); + i40e_io_suspend(pf); /* Request a slot reset */ return PCI_ERS_RESULT_NEED_RESET; @@ -16349,7 +16354,8 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) u32 reg; dev_dbg(&pdev->dev, "%s\n", __func__); - if (pci_enable_device_mem(pdev)) { + /* enable I/O and memory of the device */ + if (pci_enable_device(pdev)) { dev_info(&pdev->dev, "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; @@ -16411,8 +16417,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) dev_dbg(&pdev->dev, "%s\n", __func__); if (test_bit(__I40E_SUSPENDED, pf->state)) return; - - i40e_handle_reset_warning(pf, false); + i40e_io_resume(pf); } /** @@ -16521,11 +16526,16 @@ static void i40e_shutdown(struct pci_dev *pdev) static int __maybe_unused i40e_suspend(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - struct i40e_hw *hw = &pf->hw; /* If we're already suspended, then there is nothing to do */ if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) return 0; + return i40e_io_suspend(pf); +} + +static int i40e_io_suspend(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; set_bit(__I40E_DOWN, pf->state); @@ -16572,11 +16582,16 @@ static int __maybe_unused i40e_suspend(struct device *dev) static int __maybe_unused i40e_resume(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); - int err; /* If we're not suspended, then there is nothing to do */ if (!test_bit(__I40E_SUSPENDED, pf->state)) return 0; + return i40e_io_resume(pf); +} + +static int i40e_io_resume(struct i40e_pf *pf) +{ + int err; /* We need to hold the RTNL lock prior to restoring interrupt schemes, * since we're going to be restoring queues @@ -16588,7 +16603,7 @@ static int __maybe_unused i40e_resume(struct device *dev) */ err = i40e_restore_interrupt_scheme(pf); if (err) { - dev_err(dev, "Cannot restore interrupt scheme: %d\n", + dev_err(&pf->pdev->dev, "Cannot restore interrupt scheme: %d\n", err); } -- 2.39.3