On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
> EEH core tries to recover from fenced PHB or frozen PE. Unfortunately,
> the log isn't consistent with the site because enabling IO path for
> the frozen PE before collecting log would ruin the site. The patch
> solves the problem to cache the PHB diag-data in advance with the
> help of additional flag PNV_PHB_FLAG_DIAG to pnv_phb::flags.

Ok, so correct me if I'm wrong, but you are

  - Collecting the diag data in get_state, as a sort of side effect
(what happens if get_state is called multiple times ?)

  - Dumping it later on

Any reason why we can't instead dump it immediately ? Also do we have a
clean trigger for when we detect an error condition ? It can either be
the result of an interrupt or a driver called get_state following an
ffffffff. Are both path eventually going into the same function to
handle a "new" error condition ? That's where I would both collect and
dump the EEH state..

Cheers,
Ben.

> Signed-off-by: Gavin Shan <sha...@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/eeh-ioda.c |   65 
> ++++++++++++++++++-----------
>  arch/powerpc/platforms/powernv/pci.c      |   21 ++++++----
>  arch/powerpc/platforms/powernv/pci.h      |    1 +
>  3 files changed, 55 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
> b/arch/powerpc/platforms/powernv/eeh-ioda.c
> index 04b4710..3ed8d22 100644
> --- a/arch/powerpc/platforms/powernv/eeh-ioda.c
> +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
> @@ -114,6 +114,27 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, 
> ioda_eeh_inbB_dbgfs_get,
>                       ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
>  #endif /* CONFIG_DEBUG_FS */
>  
> +static void ioda_eeh_phb_diag(struct pci_controller *hose)
> +{
> +     struct pnv_phb *phb = hose->private_data;
> +     unsigned long flags;
> +     long rc;
> +
> +     spin_lock_irqsave(&phb->lock, flags);
> +
> +     rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> +                                      PNV_PCI_DIAG_BUF_SIZE);
> +     if (rc == OPAL_SUCCESS) {
> +             phb->flags |= PNV_PHB_FLAG_DIAG;
> +     } else {
> +             pr_warn("%s: Can't get diag-data for PHB#%x (%ld)\n",
> +                     __func__, hose->global_number, rc);
> +             phb->flags &= ~PNV_PHB_FLAG_DIAG;
> +     }
> +
> +     spin_unlock_irqrestore(&phb->lock, flags);
> +}
> +
>  /**
>   * ioda_eeh_post_init - Chip dependent post initialization
>   * @hose: PCI controller
> @@ -272,6 +293,8 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
>                       result |= EEH_STATE_DMA_ACTIVE;
>                       result |= EEH_STATE_MMIO_ENABLED;
>                       result |= EEH_STATE_DMA_ENABLED;
> +             } else {
> +                     ioda_eeh_phb_diag(hose);
>               }
>  
>               return result;
> @@ -541,24 +564,13 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
>  static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
>                           char *drv_log, unsigned long len)
>  {
> -     s64 ret;
> +     struct pnv_phb *phb = pe->phb->private_data;
>       unsigned long flags;
> -     struct pci_controller *hose = pe->phb;
> -     struct pnv_phb *phb = hose->private_data;
>  
>       spin_lock_irqsave(&phb->lock, flags);
>  
> -     ret = opal_pci_get_phb_diag_data2(phb->opal_id,
> -                     phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
> -     if (ret) {
> -             spin_unlock_irqrestore(&phb->lock, flags);
> -             pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
> -                        __func__, hose->global_number, pe->addr, ret);
> -             return -EIO;
> -     }
> -
> -     /* The PHB diag-data is always indicative */
> -     pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
> +     pnv_pci_dump_phb_diag_data(pe->phb, phb->diag.blob);
> +     phb->flags &= ~PNV_PHB_FLAG_DIAG;
>  
>       spin_unlock_irqrestore(&phb->lock, flags);
>  
> @@ -646,19 +658,11 @@ static void ioda_eeh_hub_diag(struct pci_controller 
> *hose)
>       }
>  }
>  
> -static void ioda_eeh_phb_diag(struct pci_controller *hose)
> +static void ioda_eeh_phb_diag_dump(struct pci_controller *hose)
>  {
>       struct pnv_phb *phb = hose->private_data;
> -     long rc;
> -
> -     rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> -                                      PNV_PCI_DIAG_BUF_SIZE);
> -     if (rc != OPAL_SUCCESS) {
> -             pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
> -                         __func__, hose->global_number, rc);
> -             return;
> -     }
>  
> +     ioda_eeh_phb_diag(hose);
>       pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
>  }
>  
> @@ -778,7 +782,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>                               pr_info("EEH: PHB#%x informative error "
>                                       "detected\n",
>                                       hose->global_number);
> -                             ioda_eeh_phb_diag(hose);
> +                             ioda_eeh_phb_diag_dump(hose);
>                               ret = EEH_NEXT_ERR_NONE;
>                       }
>  
> @@ -809,6 +813,17 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>               }
>  
>               /*
> +              * EEH core will try recover from fenced PHB or
> +              * frozen PE. In the time for frozen PE, EEH core
> +              * enable IO path for that before collecting logs,
> +              * but it ruins the site. So we have to cache the
> +              * log in advance here.
> +              */
> +             if (ret == EEH_NEXT_ERR_FROZEN_PE ||
> +                 ret == EEH_NEXT_ERR_FENCED_PHB)
> +                     ioda_eeh_phb_diag(hose);
> +
> +             /*
>                * If we have no errors on the specific PHB or only
>                * informative error there, we continue poking it.
>                * Otherwise, we need actions to be taken by upper
> diff --git a/arch/powerpc/platforms/powernv/pci.c 
> b/arch/powerpc/platforms/powernv/pci.c
> index 437c37d..67b2254 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -259,11 +259,15 @@ static void pnv_pci_dump_phb3_diag_data(struct 
> pci_controller *hose,
>  void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
>                               unsigned char *log_buff)
>  {
> +     struct pnv_phb *phb = hose->private_data;
>       struct OpalIoPhbErrorCommon *common;
>  
>       if (!hose || !log_buff)
>               return;
>  
> +     if (!(phb->flags & PNV_PHB_FLAG_DIAG))
> +             return;
> +
>       common = (struct OpalIoPhbErrorCommon *)log_buff;
>       switch (common->ioType) {
>       case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
> @@ -281,13 +285,19 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller 
> *hose,
>  static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
>  {
>       unsigned long flags, rc;
> -     int has_diag;
> +     bool has_diag = false;
>  
>       spin_lock_irqsave(&phb->lock, flags);
>  
> -     rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> -                                      PNV_PCI_DIAG_BUF_SIZE);
> -     has_diag = (rc == OPAL_SUCCESS);
> +     if (!(phb->flags & PNV_PHB_FLAG_DIAG)) {
> +             rc = opal_pci_get_phb_diag_data2(phb->opal_id,
> +                                              phb->diag.blob,
> +                                              PNV_PCI_DIAG_BUF_SIZE);
> +             if (rc == OPAL_SUCCESS) {
> +                     phb->flags |= PNV_PHB_FLAG_DIAG;
> +                     has_diag = true;
> +             }
> +     }
>  
>       rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
>                                      OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> @@ -303,9 +313,6 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb 
> *phb, u32 pe_no)
>                */
>               if (has_diag)
>                       pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
> -             else
> -                     pr_warning("PCI %d: No diag data available\n",
> -                                phb->hose->global_number);
>       }
>  
>       spin_unlock_irqrestore(&phb->lock, flags);
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index adeb3c4..153af9a 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -82,6 +82,7 @@ struct pnv_eeh_ops {
>  #endif /* CONFIG_EEH */
>  
>  #define PNV_PHB_FLAG_EEH     (1 << 0)
> +#define PNV_PHB_FLAG_DIAG    (1 << 1)
>  
>  struct pnv_phb {
>       struct pci_controller   *hose;


_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to