On Fri, Feb 08, 2019 at 02:08:02PM +1100, Oliver O'Halloran wrote: > This patch adds a debugfs interface to force scheduling a recovery event. > This can be used to recover a specific PE or schedule a "special" recovery > event that checks for errors at the PHB level. > To force a recovery of a normal PE, use: > > echo '<#phb>:<#pe>' > /sys/kernel/debug/powerpc/eeh_force_recover
How about placing these in the per-PHB debugfs directory? echo '<#pe>' > /sys/kernel/debug/powerpc/PCI0000/eeh_force_recover > To force a scan broken PHBs: > > echo 'null' > /sys/kernel/debug/powerpc/eeh_force_recover And keep this one where it is, and just trigger with any write (or a '1' or whatever)? Sam. > Signed-off-by: Oliver O'Halloran <ooh...@gmail.com> > --- > arch/powerpc/include/asm/eeh_event.h | 1 + > arch/powerpc/kernel/eeh.c | 60 ++++++++++++++++++++++++++++ > arch/powerpc/kernel/eeh_event.c | 25 +++++++----- > 3 files changed, 76 insertions(+), 10 deletions(-) > > diff --git a/arch/powerpc/include/asm/eeh_event.h > b/arch/powerpc/include/asm/eeh_event.h > index 9884e872686f..6d0412b846ac 100644 > --- a/arch/powerpc/include/asm/eeh_event.h > +++ b/arch/powerpc/include/asm/eeh_event.h > @@ -33,6 +33,7 @@ struct eeh_event { > > int eeh_event_init(void); > int eeh_send_failure_event(struct eeh_pe *pe); > +int __eeh_send_failure_event(struct eeh_pe *pe); > void eeh_remove_event(struct eeh_pe *pe, bool force); > void eeh_handle_normal_event(struct eeh_pe *pe); > void eeh_handle_special_event(void); > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index 92809b137e39..63b91a4918c9 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -1805,6 +1805,63 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val) > > DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, > eeh_enable_dbgfs_set, "0x%llx\n"); > + > +static ssize_t eeh_force_recover_write(struct file *filp, > + const char __user *user_buf, > + size_t count, loff_t *ppos) > +{ > + struct pci_controller *hose; > + uint32_t phbid, pe_no; > + struct eeh_pe *pe; > + char buf[20]; > + int ret; > + > + ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); > + if (!ret) > + return -EFAULT; > + > + /* > + * When PE is NULL the event is a "special" event. 
Rather than > + * recovering a specific PE it forces the EEH core to scan for failed > + * PHBs and recovers each. This needs to be done before any device > + * recoveries can occur. > + */ > + if (!strncmp(buf, "null", 4)) { > + pr_err("sending failure event\n"); > + __eeh_send_failure_event(NULL); > + return count; > + } > + > + ret = sscanf(buf, "%x:%x", &phbid, &pe_no); > + if (ret != 2) > + return -EINVAL; > + > + hose = pci_find_hose_for_domain(phbid); > + if (!hose) > + return -ENODEV; > + > + /* Retrieve PE */ > + pe = eeh_pe_get(hose, pe_no, 0); > + if (!pe) > + return -ENODEV; > + > + /* > + * We don't do any state checking here since the detection > + * process is async to the recovery process. The recovery > + * thread *should* not break even if we schedule a recovery > + * from an odd state (e.g. PE removed, or recovery of a > + * non-isolated PE) > + */ > + __eeh_send_failure_event(pe); > + > + return ret < 0 ? ret : count; > +} > + > +static const struct file_operations eeh_force_recover_fops = { > + .open = simple_open, > + .llseek = no_llseek, > + .write = eeh_force_recover_write, > +}; > #endif > > static int __init eeh_init_proc(void) > @@ -1820,6 +1877,9 @@ static int __init eeh_init_proc(void) > debugfs_create_bool("eeh_disable_recovery", 0600, > powerpc_debugfs_root, > &eeh_debugfs_no_recover); > + debugfs_create_file_unsafe("eeh_force_recover", 0600, > + powerpc_debugfs_root, NULL, > + &eeh_force_recover_fops); > eeh_cache_debugfs_init(); > #endif > } > diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c > index 19837798bb1d..539aca055d70 100644 > --- a/arch/powerpc/kernel/eeh_event.c > +++ b/arch/powerpc/kernel/eeh_event.c > @@ -121,20 +121,11 @@ int eeh_event_init(void) > * the actual event will be delivered in a normal context > * (from a workqueue). 
> */ > -int eeh_send_failure_event(struct eeh_pe *pe) > +int __eeh_send_failure_event(struct eeh_pe *pe) > { > unsigned long flags; > struct eeh_event *event; > > - /* > - * If we've manually supressed recovery events via debugfs > - * then just drop it on the floor. > - */ > - if (eeh_debugfs_no_recover) { > - pr_err("EEH: Event dropped due to no_recover setting\n"); > - return 0; > - } > - > event = kzalloc(sizeof(*event), GFP_ATOMIC); > if (!event) { > pr_err("EEH: out of memory, event not handled\n"); > @@ -153,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe) > return 0; > } > > +int eeh_send_failure_event(struct eeh_pe *pe) > +{ > + /* > + * If we've manually supressed recovery events via debugfs > + * then just drop it on the floor. > + */ > + if (eeh_debugfs_no_recover) { > + pr_err("EEH: Event dropped due to no_recover setting\n"); > + return 0; > + } > + > + return __eeh_send_failure_event(pe); > +} > + > /** > * eeh_remove_event - Remove EEH event from the queue > * @pe: Event binding to the PE > -- > 2.20.1 >
signature.asc
Description: PGP signature