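Synchronize access to eeh_pe by holding a reference on each PE while it is in use: eeh_phb_pe_get(), pnv_eeh_get_pe() (on a return of 0) and eeh_ops->next_error() (when returning _FROZEN_PE, _FENCED_PHB or _DEAD_PHB) now return a referenced PE, and callers either release it with eeh_put_pe() or give it to eeh_handle_normal_event(). Signed-off-by: Sam Bobroff <sbobr...@linux.ibm.com> --- arch/powerpc/kernel/eeh_driver.c | 15 +++++--- arch/powerpc/platforms/powernv/eeh-powernv.c | 38 ++++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-)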
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index c9d73070793e..bc5d58bf3904 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1184,6 +1184,7 @@ void eeh_handle_special_event(void) do { + /* Acquire ref if rc == _FROZEN_PE, _FENCED_PHB or _DEAD_PHB */ rc = eeh_ops->next_error(&pe); switch (rc) { @@ -1195,10 +1196,11 @@ void eeh_handle_special_event(void) eeh_remove_event(NULL, true); list_for_each_entry(hose, &hose_list, list_node) { - phb_pe = eeh_phb_pe_get(hose); + phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */ if (!phb_pe) continue; eeh_pe_mark_isolated(phb_pe); + eeh_put_pe(phb_pe); /* Release ref */ } eeh_serialize_unlock(flags); @@ -1236,15 +1238,17 @@ void eeh_handle_special_event(void) if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - eeh_handle_normal_event(pe); + eeh_handle_normal_event(pe); /* Give ref */ } else { pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { - phb_pe = eeh_phb_pe_get(hose); + phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */ if (!phb_pe || !(phb_pe->state & EEH_PE_ISOLATED) || - (phb_pe->state & EEH_PE_RECOVERING)) + (phb_pe->state & EEH_PE_RECOVERING)) { + eeh_put_pe(phb_pe); /* Release ref */ continue; + } eeh_for_each_pe(pe, tmp_pe) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) @@ -1263,11 +1267,14 @@ void eeh_handle_special_event(void) __func__, pe->phb->global_number, pe->addr); + eeh_put_pe(phb_pe); /* Release ref */ break; } pci_hp_remove_devices(bus); + eeh_put_pe(phb_pe); /* Release ref */ } pci_unlock_rescan_remove(); + eeh_put_pe(pe); /* Release ref */ } /* diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index e477e0b70968..c56a796dd894 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1404,6 +1404,7 @@ static void 
pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose) } } +/* A return of 0 indicates that *pe is set, and referenced. */ static int pnv_eeh_get_pe(struct pci_controller *hose, u16 pe_no, struct eeh_pe **pe) { @@ -1431,6 +1432,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, /* Freeze the (compound) PE */ *pe = dev_pe; + eeh_get_pe(*pe); /* Acquire ref */ if (!(dev_pe->state & EEH_PE_ISOLATED)) phb->freeze_pe(phb, pe_no); @@ -1439,23 +1441,26 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, * have been frozen. However, we still need poke until * hitting the frozen PE on top level. */ - dev_pe = dev_pe->parent; + eeh_pe_move_to_parent(&dev_pe); while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) { int ret; ret = eeh_ops->get_state(dev_pe, NULL); if (ret <= 0 || eeh_state_active(ret)) { - dev_pe = dev_pe->parent; + eeh_pe_move_to_parent(&dev_pe); continue; } /* Frozen parent PE */ + eeh_put_pe(*pe); /* Release ref */ *pe = dev_pe; + eeh_get_pe(*pe); /* Acquire ref */ if (!(dev_pe->state & EEH_PE_ISOLATED)) phb->freeze_pe(phb, dev_pe->addr); /* Next one */ - dev_pe = dev_pe->parent; + eeh_pe_move_to_parent(&dev_pe); } + eeh_put_pe(dev_pe); return 0; } @@ -1469,6 +1474,8 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, * OPAL APIs for next error to handle. The informational error is * handled internally by platform. However, the dead IOC, dead PHB, * fenced PHB and frozen PE should be handled by EEH core eventually. + * On return, *pe will be ref'd iff returning _FROZEN_PE, _FENCED_PHB or + * _DEAD_PHB. */ static int pnv_eeh_next_error(struct eeh_pe **pe) { @@ -1479,6 +1486,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) __be16 err_type, severity; long rc; int state, ret = EEH_NEXT_ERR_NONE; + unsigned long flags; /* * While running here, it's safe to purge the event queue. The @@ -1493,9 +1501,11 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) * needn't take care of it any more. 
*/ phb = hose->private_data; - phb_pe = eeh_phb_pe_get(hose); - if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) + phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */ + if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) { + eeh_put_pe(phb_pe); /* Release ref */ continue; + } rc = opal_pci_next_error(phb->opal_id, &frozen_pe_no, &err_type, &severity); @@ -1503,6 +1513,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) pr_devel("%s: Invalid return value on " "PHB#%x (0x%lx) from opal_pci_next_error", __func__, hose->global_number, rc); + eeh_put_pe(phb_pe); /* Release ref */ continue; } @@ -1511,6 +1522,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) { pr_devel("%s: No error found on PHB#%x\n", __func__, hose->global_number); + eeh_put_pe(phb_pe); /* Release ref */ continue; } @@ -1539,19 +1551,23 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) case OPAL_EEH_PHB_ERROR: if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) { *pe = phb_pe; + eeh_get_pe(*pe); /* Acquire ref */ pr_err("EEH: dead PHB#%x detected, " "location: %s\n", hose->global_number, eeh_pe_loc_get(phb_pe)); ret = EEH_NEXT_ERR_DEAD_PHB; + /* Retain ref on pe */ } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_FENCED) { *pe = phb_pe; + eeh_get_pe(*pe); /* Acquire ref */ pr_err("EEH: Fenced PHB#%x detected, " "location: %s\n", hose->global_number, eeh_pe_loc_get(phb_pe)); ret = EEH_NEXT_ERR_FENCED_PHB; + /* Retain ref on pe */ } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { pr_info("EEH: PHB#%x informative error " "detected, location: %s\n", @@ -1568,8 +1584,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) * If we can't find the corresponding PE, we * just try to unfreeze. 
*/ + /* Maybe acquire ref */ if (pnv_eeh_get_pe(hose, be64_to_cpu(frozen_pe_no), pe)) { + /* 'pe' was not set by pnv_eeh_get_pe() */ pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n", hose->global_number, be64_to_cpu(frozen_pe_no)); pr_info("EEH: PHB location: %s\n", @@ -1589,6 +1607,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) ret = EEH_NEXT_ERR_NONE; } else if ((*pe)->state & EEH_PE_ISOLATED || eeh_pe_passed(*pe)) { + eeh_put_pe(*pe); /* Release ref */ ret = EEH_NEXT_ERR_NONE; } else { pr_err("EEH: Frozen PE#%x " @@ -1600,6 +1619,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe)); ret = EEH_NEXT_ERR_FROZEN_PE; + /* Retain ref on pe */ } break; @@ -1631,7 +1651,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) * we need have to handle frozen parent PE firstly. */ if (ret == EEH_NEXT_ERR_FROZEN_PE) { + eeh_lock_pes(&flags); parent_pe = (*pe)->parent; + eeh_get_pe(parent_pe); + eeh_unlock_pes(flags); while (parent_pe) { /* Hit the ceiling ? */ if (parent_pe->type & EEH_PE_PHB) @@ -1643,13 +1666,15 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) *pe = parent_pe; /* Next parent level */ - parent_pe = parent_pe->parent; + eeh_pe_move_to_parent(&parent_pe); } + eeh_put_pe(parent_pe); /* Release ref (for early-out) */ /* We possibly migrate to another PE */ eeh_pe_mark_isolated(*pe); } + eeh_put_pe(phb_pe); /* Release ref */ /* * If we have no errors on the specific PHB or only * informative error there, we continue poking it. @@ -1664,6 +1689,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) if (ret == EEH_NEXT_ERR_NONE && eeh_enabled()) enable_irq(eeh_event_irq); + /* *pe may be ref'd, see above */ return ret; } -- 2.22.0.216.g00a2a96fc9