[PATCH] powerpc/process: fix altivec SPR not being saved
In save_sprs() in process.c contains the following test: if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) t->vrsave = mfspr(SPRN_VRSAVE); CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test is equivilent to: if (cpu_has_feature(CPU_FTR_ALTIVEC) && cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) On CPUs without support for both (i.e G5) this results in vrsave not being saved between context switches. The vector register save/restore code doesn't use VRSAVE to determine which registers to save/restore, but the value of VRSAVE is used to determine if altivec is being used in several code paths. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8224852..5a4d4d1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -855,7 +855,7 @@ void restore_tm_state(struct pt_regs *regs) static inline void save_sprs(struct thread_struct *t) { #ifdef CONFIG_ALTIVEC - if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) + if (cpu_has_feature(CPU_FTR_ALTIVEC)) t->vrsave = mfspr(SPRN_VRSAVE); #endif #ifdef CONFIG_PPC_BOOK3S_64 -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/process: fix altivec SPR not being saved
In save_sprs() in process.c contains the following test: if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) t->vrsave = mfspr(SPRN_VRSAVE); CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test is equivilent to: if (cpu_has_feature(CPU_FTR_ALTIVEC) && cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) On CPUs without support for both (i.e G5) this results in vrsave not being saved between context switches. The vector register save/restore code doesn't use VRSAVE to determine which registers to save/restore, but the value of VRSAVE is used to determine if altivec is being used in several code paths. Signed-off-by: Oliver O'Halloran Signed-off-by: Anton Blanchard Fixes: 152d523e6307 ("powerpc: Create context switch helpers save_sprs() and restore_sprs()") Cc: sta...@vger.kernel.org --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dccc87e8fee5..bc6aa87a3b12 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -854,7 +854,7 @@ void restore_tm_state(struct pt_regs *regs) static inline void save_sprs(struct thread_struct *t) { #ifdef CONFIG_ALTIVEC - if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC))) + if (cpu_has_feature(CPU_FTR_ALTIVEC)) t->vrsave = mfspr(SPRN_VRSAVE); #endif #ifdef CONFIG_PPC_BOOK3S_64 -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] powerpc/timer - large decrementer support
POWER ISA v3 adds large decrementer (LD) mode of operation which increases the size of the decrementer register from 32 bits to an implementation defined with of up to 64 bits. This patch adds support for the LD on processors with the CPU_FTR_ARCH_300 cpu feature flag set. Even for CPUs with this feature LD mode is only enabled when the property ibm,dec-bits devicetree property is supplied for the boot CPU. The decrementer value is a signed quantity (with negative values indicating a pending exception) and this property is required to find the maximum positive decrementer value. If this property is not supplied then the traditional decrementer width of 32 bits is assumed and LD mode is disabled. This patch was based on inital work by Jack Miller. Signed-off-by: Oliver O'Halloran Cc: Jack Miller --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/include/asm/time.h | 6 +-- arch/powerpc/kernel/time.c | 89 + 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f5f4c66bbbc9..ff581ed1ab9d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -332,6 +332,7 @@ #define LPCR_AIL_0 0x /* MMU off exception offset 0x0 */ #define LPCR_AIL_3 0x0180 /* MMU on exception offset 0xc00...4xxx */ #define LPCR_ONL 0x0004 /* online - PURR/SPURR count */ +#define LPCR_LD 0x0002 /* large decremeter */ #define LPCR_PECE0x0001f000 /* powersave exit cause enable */ #define LPCR_PECEDP0x0001 /* directed priv dbells cause exit */ #define LPCR_PECEDH0x8000 /* directed hyp dbells cause exit */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1092fdd7e737..09211640a0e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower) * in auto-reload mode. The problem is PIT stops counting when it * hits zero. If it would wrap, we could use it just like a decrementer. */ -static inline unsigned int get_dec(void) +static inline u64 get_dec(void) { #if defined(CONFIG_40x) return (mfspr(SPRN_PIT)); @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void) * in when the decrementer generates its interrupt: on the 1 to 0 * transition for Book E/4xx, but on the 0 to -1 transition for others. */ -static inline void set_dec(int val) +static inline void set_dec(u64 val) { #if defined(CONFIG_40x) - mtspr(SPRN_PIT, val); + mtspr(SPRN_PIT, (u32) val); #else #ifndef CONFIG_BOOKE --val; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900a39ee..0afaef6b5b6a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = { .read = timebase_read, }; -#define DECREMENTER_MAX0x7fff +#define DECREMENTER_DEFAULT_MAX 0x7FFF +u64 decrementer_max = DECREMENTER_DEFAULT_MAX; static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev); @@ -503,7 +504,7 @@ static void __timer_interrupt(void) __this_cpu_inc(irq_stat.timer_irqs_event); } else { now = *next_tb - now; - if (now <= DECREMENTER_MAX) + if (now <= decrementer_max) set_dec((int)now); /* We may have raced with new irq work */ if (test_irq_work_pending()) @@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs) /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); /* Some implementations of hotplug will get timer interrupts while * offline, just ignore these and we also need to set @@ -562,6 +563,7 @@ void timer_interrupt(struct pt_regs * regs) irq_enter(); __timer_interrupt(); + irq_exit(); set_irq_regs(old_regs); } @@ -582,9 +584,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. */ - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); local_irq_disable(); - set_dec(DECREMENTER_MAX); + set_dec(decrementer_max); } static void generic_suspend_enable_irqs(void) @@ -865,7 +867,7 @@ static int decrementer_set_next_event(unsigned long evt, static int decrementer_shutdown(struct clock_event_device *dev) { - decrementer_set_next_event(DECREMENTER_MAX, dev); + decrementer_set_next_event(decrementer_max, dev); return 0; } @@ -891,6 +893,72 @@
[PATCH 2/2] KVM: PPC: hypervisor large decrementer support
Power ISAv3 extends the width of the decrementer register from 32 bits. The enlarged register width is implementation dependent, but reads from these registers are automatically sign extended to produce a 64 bit output when operating in large mode. The HDEC always operates in large mode while the DEC register can be operated in 32bit mode or large mode depending on the setting of the LPCR.LD bit. Currently the hypervisor assumes that reads from the DEC and HDEC register produce a 32 bit result which it sign extends to 64 bits using the extsw instruction. This behaviour can result in the guest DEC register value being corrupted by the hypervisor when the guest is operating in LD mode since the results of the extsw instruction only depends on the value of bit 31 in the register to be sign extended. This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading from the decrementer registers. These macros will return the current decrementer value as a 64 bit quantity regardless of the Host CPU or guest decrementer operating mode. Additionally this patch corrects several uses of decrementer values that assume a 32 bit register width. Signed-off-by: Oliver O'Halloran Cc: Paul Mackerras --- arch/powerpc/include/asm/exception-64s.h | 22 ++ arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 2 +- arch/powerpc/kernel/exceptions-64s.S | 9 +++- arch/powerpc/kvm/book3s_hv_interrupts.S | 3 +-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 ++-- arch/powerpc/kvm/emulate.c | 4 ++-- 8 files changed, 57 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 93ae809fe5ea..d922f76c682d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -545,4 +545,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif +/* these ensure that we always get a 64bit value from the + * decrementer register. */ + +#define IS_LD_ENABLED(reg) \ + mfspr reg,SPRN_LPCR; \ + andis. reg,reg,(LPCR_LD >> 16); + +#define GET_DEC(reg) \ + IS_LD_ENABLED(reg);\ + mfspr reg, SPRN_DEC; \ + bne 99f; \ + extsw reg, reg;\ +99: + +/* For CPUs that support it the Hypervisor LD is + * always enabled, so this needs to be feature gated */ +#define GET_HDEC(reg) \ + mfspr reg, SPRN_HDEC; \ +BEGIN_FTR_SECTION \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170453..6330d3fca083 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -516,7 +516,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + u64 dec; #ifdef CONFIG_BOOKE u32 decar; #endif diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544edabe7f3..4de0102930e9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); -extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..2dd92e841127 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -215,7 +215,7 @@ struct kvm_sregs { __u32 tsr; /* KVM_SREGS_E_UPDATE_TSR */ __u32 tcr; __u32 decar; - __u32 dec; /* KVM_SREGS_E_UPDATE_DEC */ + __u64 dec; /* KVM_SREGS_E_UPDATE_DEC */ /* * Userspace can read TB directly, but the diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716cebf4b8e..984ae894e758 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -641,7 +641,14 @@ masked_##_H##interrupt: \ stb r11,PACAIRQH
[PATCH] Fix fall-through from case 30 (rld*) to case 31
I think this bug can only be triggered if the instruction to simulate is malformed. The switch in the else case only handles the zero and one case, but it extracts bits 4:1 from the instruction word so it may be other values. It's pretty minor, but a bug is a bug. Signed-off-by: Oliver O'Halloran --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..e25f73c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/lib/sstep.c - Fix emulation fall-through
There is a switch fallthough in instr_analyze() which can cause an invalid instruction to be emulated as a different, valid, instruction. The rld* (opcode 30) case extracts a sub-opcode from bits 3:1 of the instruction word. However, the only valid values of this field a 001 and 000. These cases are correctly handled, but the others are not which causes execution to fall through into case 31. Breaking out of the switch causes the instruction to be marked as unknown and allows the caller to deal with the invalid instruction in a manner consistent with other invalid instructions. Signed-off-by: Oliver O'Halloran --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..e25f73c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2] powerpc/sstep.c - Fix emulation fall-through
There is a switch fallthough in instr_analyze() which can cause an invalid instruction to be emulated as a different, valid, instruction. The rld* (opcode 30) case extracts a sub-opcode from bits 3:1 of the instruction word. However, the only valid values of this field a 001 and 000. These cases are correctly handled, but the others are not which causes execution to fall through into case 31. Breaking out of the switch causes the instruction to be marked as unknown and allows the caller to deal with the invalid instruction in a manner consistent with other invalid instructions. Signed-off-by: Oliver O'Halloran --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..e25f73c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- 2.5.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc/powernv: Remove ifdef
The PowerNV platform depends on CONFIG_PPC64 so this #ifdef in the middle of platforms/powernv/pci.c really doesn't need to be there. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index d235c2c..a5b2c7b 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -1039,7 +1039,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable) } EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar); -#ifdef CONFIG_PPC64/* for thread.tidr */ int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid, u32 *tid) { @@ -1060,7 +1059,6 @@ int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid, return 0; } EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info); -#endif void pnv_pci_shutdown(void) { -- 2.9.4
Re: [PATCH 3/8] powerpc/eeh: Convert PNV_PHB_FLAG_EEH to global flag
On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote: > The PHB flag, PNV_PHB_FLAG_EEH, is set (on PowerNV) individually on > each PHB once the EEH subsystem is ready. It is the only use of the > flags member of the phb struct. > > However there is no need to store this separately on each PHB, so > convert it to a global flag. For symmetry, the flag is now also set > for pSeries; although it is currently unused it may be useful in the > future. I'd drop this patch and keep it as a seperate flag. Making this a global flag doesn't really buy us anything either. It's also worth remembering that we do have virtual PHBs, like the NVLink ones, that don't have real EEH support and we should be doing something more intelligent about that. If you are going to remove the PNV_PHB flag then i would look at moving that flag into the generic pci_controller structure that we use for tracking PHBs since that would give us a way to toggle EEH support on a per PHB basis on pseries and powernv. > Signed-off-by: Sam Bobroff > --- > arch/powerpc/include/asm/eeh.h | 11 +++ > arch/powerpc/platforms/powernv/eeh-powernv.c | 14 +++--- > arch/powerpc/platforms/powernv/pci.c | 7 +++ > arch/powerpc/platforms/powernv/pci.h | 2 -- > arch/powerpc/platforms/pseries/pci.c | 4 > 5 files changed, 21 insertions(+), 17 deletions(-) > > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h > index 3613a56281f2..fe4cf7208890 100644 > --- a/arch/powerpc/include/asm/eeh.h > +++ b/arch/powerpc/include/asm/eeh.h > @@ -43,6 +43,7 @@ struct pci_dn; > #define EEH_VALID_PE_ZERO0x10/* PE#0 is valid */ > #define EEH_ENABLE_IO_FOR_LOG0x20/* Enable IO for log > */ > #define EEH_EARLY_DUMP_LOG 0x40/* Dump log immediately */ > +#define EEH_PHB_ENABLED 0x80/* PHB recovery uses EEH > */ > > /* > * Delay for PE reset, all in ms > @@ -245,6 +246,11 @@ static inline bool eeh_enabled(void) > return eeh_has_flag(EEH_ENABLED) && !eeh_has_flag(EEH_FORCE_DISABLED); > } > > +static inline bool eeh_phb_enabled(void) > +{ > + return eeh_has_flag(EEH_PHB_ENABLED); > +} > + > static inline void eeh_serialize_lock(unsigned long *flags) > { > raw_spin_lock_irqsave(&confirm_error_lock, *flags); > @@ -332,6 +338,11 @@ static inline bool eeh_enabled(void) > return false; > } > > +static inline bool eeh_phb_enabled(void) > +{ > + return false; > +} > + > static inline void eeh_probe_devices(void) { } > > static inline void *eeh_dev_init(struct pci_dn *pdn, void *data) > diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c > b/arch/powerpc/platforms/powernv/eeh-powernv.c > index 6fc1a463b796..f0a95f663810 100644 > --- a/arch/powerpc/platforms/powernv/eeh-powernv.c > +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c > @@ -264,22 +264,14 @@ int pnv_eeh_post_init(void) > return ret; > } > > - if (!eeh_enabled()) > + if (eeh_enabled()) > + eeh_add_flag(EEH_PHB_ENABLED); > + else > disable_irq(eeh_event_irq); > > list_for_each_entry(hose, &hose_list, list_node) { > phb = hose->private_data; > > - /* > - * If EEH is enabled, we're going to rely on that. > - * Otherwise, we restore to conventional mechanism > - * to clear frozen PE during PCI config access. > - */ > - if (eeh_enabled()) > - phb->flags |= PNV_PHB_FLAG_EEH; > - else > - phb->flags &= ~PNV_PHB_FLAG_EEH; > - > /* Create debugfs entries */ > #ifdef CONFIG_DEBUG_FS > if (phb->has_dbgfs || !phb->dbgfs) > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 307181fd8a17..d2b50f3bf6b1 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -717,10 +717,9 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, > static bool pnv_pci_cfg_check(struct pci_dn *pdn) > { > struct eeh_dev *edev = NULL; > - struct pnv_phb *phb = pdn->phb->private_data; > > /* EEH not enabled ? */ > - if (!(phb->flags & PNV_PHB_FLAG_EEH)) > + if (!eeh_phb_enabled()) > return true; > > /* PE reset or device removed ? */ > @@ -761,7 +760,7 @@ static int pnv_pci_read_config(struct pci_bus *bus, > > ret = pnv_pci_cfg_read(pdn, where, size, val); > phb = pdn->phb->private_data; > - if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) { > + if (eeh_phb_enabled() && pdn->edev) { > if (*val == EEH_IO_ERROR_VALUE(size) && > eeh_dev_check_failure(pdn->edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > @@ -789,7 +788,7 @@ static int pnv_pci_write_config(struct pci_bus *bus, >
Re: [PATCH 5/8] powerpc/eeh: Add eeh_show_enabled()
On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote: > Move the EEH enabled message into it's own function so that future > work can call it from multiple places. > > Signed-off-by: Sam Bobroff > --- > arch/powerpc/include/asm/eeh.h | 3 +++ > arch/powerpc/kernel/eeh.c | 16 +++- > 2 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h > index fe4cf7208890..e217ccda55d0 100644 > --- a/arch/powerpc/include/asm/eeh.h > +++ b/arch/powerpc/include/asm/eeh.h > @@ -289,6 +289,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); > > struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); > void eeh_dev_phb_init_dynamic(struct pci_controller *phb); > +void eeh_show_enabled(void); > void eeh_probe_devices(void); > int __init eeh_ops_register(struct eeh_ops *ops); > int __exit eeh_ops_unregister(const char *name); > @@ -338,6 +339,8 @@ static inline bool eeh_enabled(void) > return false; > } > > +static inline void eeh_show_enabled(void) { } > + > static inline bool eeh_phb_enabled(void) > { > return false; > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index b14d89547895..3dcff29cb9b3 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -163,6 +163,16 @@ static int __init eeh_setup(char *str) > } > __setup("eeh=", eeh_setup); > > +void eeh_show_enabled(void) > +{ > + if (eeh_has_flag(EEH_FORCE_DISABLED)) > + pr_info("EEH: PCI Enhanced I/O Error Handling DISABLED (by > eeh=off)\n"); The allcaps looks kind of stupid. > + else if (eeh_enabled()) > + pr_info("EEH: PCI Enhanced I/O Error Handling ENABLED (capable > adapter found)\n"); > + else > + pr_info("EEH: PCI Enhanced I/O Error Handling DISABLED (no > capable adapter found)\n"); If we really have to keep these messages then make it say "no EEH capable buses found" or something. EEH support has absolutely nothing to do with the adapters and I'm not even sure we can get the "DISABLED" message on PowerNV since the root port will always be there. > +} > + > /* > * This routine captures assorted PCI configuration space data > * for the indicated PCI device, and puts them into a buffer > @@ -1166,11 +1176,7 @@ void eeh_probe_devices(void) > pdn = hose->pci_data; > traverse_pci_dn(pdn, eeh_ops->probe, NULL); > } > - if (eeh_enabled()) > - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); > - else > - pr_info("EEH: No capable adapters found\n"); > - > + eeh_show_enabled(); > } > > /**
Re: [PATCH 6/8] powerpc/eeh: Initialize EEH address cache earlier
On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote: > The EEH address cache is currently initialized and populated by a > single function: eeh_addr_cache_build(). While the initial population > of the cache can only be done once resources are allocated, > initialization (just setting up a spinlock) could be done much > earlier. > > So move the initialization step into a separate function and call it > from a core_initcall (rather than a subsys initcall). > > This will allow future work to make use of the cache during boot time > PCI scanning. What's the idea there? Checking for overlaps in the BAR assignments? > Signed-off-by: Sam Bobroff > --- > arch/powerpc/include/asm/eeh.h | 3 +++ > arch/powerpc/kernel/eeh.c | 2 ++ > arch/powerpc/kernel/eeh_cache.c | 13 +++-- > 3 files changed, 16 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h > index e217ccda55d0..791b9e6fcc45 100644 > --- a/arch/powerpc/include/asm/eeh.h > +++ b/arch/powerpc/include/asm/eeh.h > @@ -295,6 +295,7 @@ int __init eeh_ops_register(struct eeh_ops *ops); > int __exit eeh_ops_unregister(const char *name); > int eeh_check_failure(const volatile void __iomem *token); > int eeh_dev_check_failure(struct eeh_dev *edev); > +void eeh_addr_cache_init(void); > void eeh_addr_cache_build(void); > void eeh_add_device_early(struct pci_dn *); > void eeh_add_device_tree_early(struct pci_dn *); > @@ -362,6 +363,8 @@ static inline int eeh_check_failure(const volatile void > __iomem *token) > > #define eeh_dev_check_failure(x) (0) > > +static inline void eeh_addr_cache_init(void) { } > + > static inline void eeh_addr_cache_build(void) { } > > static inline void eeh_add_device_early(struct pci_dn *pdn) { } > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index 3dcff29cb9b3..7a406d58d2c0 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -1219,6 +1219,8 @@ static int eeh_init(void) > list_for_each_entry_safe(hose, tmp, &hose_list, list_node) > eeh_dev_phb_init_dynamic(hose); > > + eeh_addr_cache_init(); > + > /* Initialize EEH event */ > return eeh_event_init(); > } > diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c > index 9c68f0837385..f93dd5cf6a39 100644 > --- a/arch/powerpc/kernel/eeh_cache.c > +++ b/arch/powerpc/kernel/eeh_cache.c > @@ -267,6 +267,17 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev) > spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); > } > > +/** > + * eeh_addr_cache_init - Initialize a cache of I/O addresses > + * > + * Initialize a cache of pci i/o addresses. This cache will be used to > + * find the pci device that corresponds to a given address. > + */ > +void eeh_addr_cache_init(void) > +{ > + spin_lock_init(&pci_io_addr_cache_root.piar_lock); > +} You could move this out of the pci_io_addr_cache structure and use DEFINE_SPINLOCK() too. We might even be able to get rid of the hand- rolled interval tree in eeh_cache.c in favour of the generic implementation (see mm/interval_tree.c). I'm pretty sure the generic one is a drop-in replacement, but I haven't had a chance to have a detailed look to see if there's any differences in behaviour. > + > /** > * eeh_addr_cache_build - Build a cache of I/O addresses > * > @@ -282,8 +293,6 @@ void eeh_addr_cache_build(void) > struct eeh_dev *edev; > struct pci_dev *dev = NULL; > > - spin_lock_init(&pci_io_addr_cache_root.piar_lock); > - > for_each_pci_dev(dev) { > pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); > if (!pdn)
Re: [PATCH] Add OPAL_GET_SYMBOL / OPAL_LOOKUP_SYMBOL
On Fri, Feb 28, 2020 at 2:09 PM Nicholas Piggin wrote: > > These calls can be used by Linux to annotate BUG addresses with symbols, > look up symbol addresses in xmon, etc. > > This is preferable over having Linux parse the OPAL symbol map itself, > because OPAL's parsing code already exists for its own symbol printing, > and it can support other code regions than the skiboot symbols, e.g., > the wake-up code in the HOMER (where CPUs have been seen to get stuck). > > Signed-off-by: Nicholas Piggin > --- > core/opal.c | 2 + > core/utils.c| 92 +++-- > doc/opal-api/opal-get-symbol-181.rst| 42 +++ > doc/opal-api/opal-lookup-symbol-182.rst | 35 ++ > include/opal-api.h | 4 +- > 5 files changed, 168 insertions(+), 7 deletions(-) > create mode 100644 doc/opal-api/opal-get-symbol-181.rst > create mode 100644 doc/opal-api/opal-lookup-symbol-182.rst > > diff --git a/core/opal.c b/core/opal.c > index d6ff6027b..d9fc4fe05 100644 > --- a/core/opal.c > +++ b/core/opal.c > @@ -142,6 +142,8 @@ int64_t opal_entry_check(struct stack_frame *eframe) > case OPAL_CEC_REBOOT: > case OPAL_CEC_REBOOT2: > case OPAL_SIGNAL_SYSTEM_RESET: > + case OPAL_GET_SYMBOL: > + case OPAL_LOOKUP_SYMBOL: These names are still awful :| > break; > default: > printf("CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=%04lx > cpu @%p -> pir=%04x token=%llu\n", > diff --git a/core/utils.c b/core/utils.c > index 8fd63fcb7..5f0d5130b 100644 > --- a/core/utils.c > +++ b/core/utils.c > @@ -48,40 +48,120 @@ char __attrconst tohex(uint8_t nibble) > return __tohex[nibble]; > } > > -static unsigned long get_symbol(unsigned long addr, char **sym, char > **sym_end) > +static unsigned long get_symbol(unsigned long addr, char **sym, char > **sym_end, unsigned long *size) > { > unsigned long prev = 0, next; > char *psym = NULL, *p = __sym_map_start; > > *sym = *sym_end = NULL; > - while(p < __sym_map_end) { > + while (p < __sym_map_end) { > next = strtoul(p, &p, 16) | SKIBOOT_BASE; > if (next > addr && prev <= addr) { > - p = psym + 3;; > + p = psym + 3; > if (p >= __sym_map_end) > return 0; > *sym = p; > - while(p < __sym_map_end && *p != 10) > + while (p < __sym_map_end && *p != '\n') > p++; > *sym_end = p; > + *size = next - prev; > return prev; > } > prev = next; > psym = p; > - while(p < __sym_map_end && *p != 10) > + while (p < __sym_map_end && *p != '\n') > p++; > p++; > } > return 0; > } > > +static unsigned long lookup_symbol(const char *name, unsigned long *size) > +{ > + size_t len = strlen(name); > + unsigned long addr = 0; > + char *sym; > + char *p = __sym_map_start; > + > + while (p < __sym_map_end) { > + addr = strtoul(p, &p, 16) | SKIBOOT_BASE; > + p += 3; > + if (p >= __sym_map_end) > + return 0; > + > + if (*(p + len) == '\n' && !strncmp(name, p, len)) { > + char *sym_end; > + > + if (get_symbol(addr, &sym, &sym_end, size) == 0) { > + assert(!strcmp(name, "_end")); > + *size = 0; > + } > + > + /* > +* May be more than one symbol at this address but > +* symbol length calculation should still work in > +* that case. > +*/ > + > + return addr; > + } > + > + while(p < __sym_map_end && *p != '\n') > + p++; > + p++; > + } > + return 0; > +} > + > +static int64_t opal_get_symbol(uint64_t addr, __be64 *symaddr, __be64 > *symsize, char *namebuf, uint64_t buflen) > +{ > + unsigned long saddr; > + unsigned long ssize; > + char *sym, *sym_end; > + size_t l; > + > + saddr = get_symbol(addr, &sym, &sym_end, &ssize); > + if (!saddr) > + return OPAL_RESOURCE; > + > + if (buflen > sym_end - sym) > + l = sym_end - sym; > + else > + l = buflen - 1; > + memcpy(namebuf, sym, l); > + namebuf[l] = '\0'; > + > + *symaddr = cpu_to_be64(saddr); > + *symsize = cpu_to_be64(ssize); > + > +
[PATCH v2 1/6] powerpc/eeh: Add sysfs files in late probe
Move creating the EEH specific sysfs files into eeh_add_device_late() rather than being open-coded all over the place. Calling the function is generally done immediately after calling eeh_add_device_late() anyway. This is also a correctness fix since currently the sysfs files will be added even if the EEH probe happens to fail. Similarly, on pseries we currently add the sysfs files before calling eeh_add_device_late(). This is flat-out broken since the sysfs files require the pci_dev->dev.archdata.edev pointer to be set, and that is done in eeh_add_device_late(). Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran --- v2: Reworded commit message based on Sam Bobroff's comments. About the current behaviour being broken. --- arch/powerpc/include/asm/eeh.h | 3 --- arch/powerpc/kernel/eeh.c| 24 +--- arch/powerpc/kernel/of_platform.c| 3 --- arch/powerpc/kernel/pci-common.c | 3 --- arch/powerpc/platforms/powernv/eeh-powernv.c | 1 - arch/powerpc/platforms/pseries/eeh_pseries.c | 3 +-- 6 files changed, 2 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 6f9b2a1..5a34907 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -305,7 +305,6 @@ void eeh_add_device_early(struct pci_dn *); void eeh_add_device_tree_early(struct pci_dn *); void eeh_add_device_late(struct pci_dev *); void eeh_add_device_tree_late(struct pci_bus *); -void eeh_add_sysfs_files(struct pci_bus *); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); @@ -368,8 +367,6 @@ static inline void eeh_add_device_late(struct pci_dev *dev) { } static inline void eeh_add_device_tree_late(struct pci_bus *bus) { } -static inline void eeh_add_sysfs_files(struct pci_bus *bus) { } - static inline void eeh_remove_device(struct pci_dev *dev) { } #define EEH_POSSIBLE_ERROR(val, type) (0) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 17cb3e9..0878912 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1210,6 +1210,7 @@ void eeh_add_device_late(struct pci_dev *dev) dev->dev.archdata.edev = edev; eeh_addr_cache_insert_dev(dev); + eeh_sysfs_add_device(dev); } /** @@ -1238,29 +1239,6 @@ void eeh_add_device_tree_late(struct pci_bus *bus) EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); /** - * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus - * @bus: PCI bus - * - * This routine must be used to add EEH sysfs files for PCI - * devices which are attached to the indicated PCI bus. The PCI bus - * is added after system boot through hotplug or dlpar. - */ -void eeh_add_sysfs_files(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_sysfs_add_device(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_sysfs_files(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_sysfs_files); - -/** * eeh_remove_device - Undo EEH setup for the indicated pci device * @dev: pci device to be removed * diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 427fc22..cb68800 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -86,9 +86,6 @@ static int of_pci_phb_probe(struct platform_device *dev) /* Add probed PCI devices to the device model */ pci_bus_add_devices(phb->bus); - /* sysfs files should only be added after devices are added */ - eeh_add_sysfs_files(phb->bus); - return 0; } diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index c6c0341..3d2b1cf 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1404,9 +1404,6 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus) /* Add new devices to global lists. Register in proc, sysfs. */ pci_bus_add_devices(bus); - - /* sysfs files should only be added after devices are added */ - eeh_add_sysfs_files(bus); } EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 6f300ab..ef727ec 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -48,7 +48,6 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) dev_dbg(&pdev->dev, "EEH: Setting up device\n"); eeh_add_device_early(pdn); eeh_add_device_late(pdev); - eeh_sysfs_add_device(pdev); } sta
[PATCH v2 2/6] powerpc/eeh: Remove eeh_add_device_tree_late()
On pseries and PowerNV pcibios_bus_add_device() calls eeh_add_device_late() so there's no need to do a separate tree traversal to bind the eeh_dev and pci_dev together setting up the PHB at boot. As a result we can remove eeh_add_device_tree_late(). Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/eeh.h| 3 --- arch/powerpc/kernel/eeh.c | 25 - arch/powerpc/kernel/of_platform.c | 3 --- arch/powerpc/kernel/pci-common.c | 3 --- 4 files changed, 34 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 5a34907..5d10781 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -304,7 +304,6 @@ void eeh_addr_cache_init(void); void eeh_add_device_early(struct pci_dn *); void eeh_add_device_tree_early(struct pci_dn *); void eeh_add_device_late(struct pci_dev *); -void eeh_add_device_tree_late(struct pci_bus *); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); @@ -365,8 +364,6 @@ static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { } static inline void eeh_add_device_late(struct pci_dev *dev) { } -static inline void eeh_add_device_tree_late(struct pci_bus *bus) { } - static inline void eeh_remove_device(struct pci_dev *dev) { } #define EEH_POSSIBLE_ERROR(val, type) (0) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 0878912..9cb3370 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1214,31 +1214,6 @@ void eeh_add_device_late(struct pci_dev *dev) } /** - * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus - * @bus: PCI bus - * - * This routine must be used to perform EEH initialization for PCI - * devices which are attached to the indicated PCI bus. The PCI bus - * is added after system boot through hotplug or dlpar. - */ -void eeh_add_device_tree_late(struct pci_bus *bus) -{ - struct pci_dev *dev; - - if (eeh_has_flag(EEH_FORCE_DISABLED)) - return; - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_add_device_late(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_device_tree_late(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); - -/** * eeh_remove_device - Undo EEH setup for the indicated pci device * @dev: pci device to be removed * diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index cb68800..64edac81 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -80,9 +80,6 @@ static int of_pci_phb_probe(struct platform_device *dev) */ pcibios_claim_one_bus(phb->bus); - /* Finish EEH setup */ - eeh_add_device_tree_late(phb->bus); - /* Add probed PCI devices to the device model */ pci_bus_add_devices(phb->bus); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 3d2b1cf..8983afa 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1399,9 +1399,6 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus) pci_assign_unassigned_bus_resources(bus); } - /* Fixup EEH */ - eeh_add_device_tree_late(bus); - /* Add new devices to global lists. Register in proc, sysfs. */ pci_bus_add_devices(bus); } -- 2.9.5
[PATCH v2 3/6] powerpc/eeh: Do early EEH init only when required
The pci hotplug helper (pci_hp_add_devices()) calls eeh_add_device_tree_early() to scan the device-tree for new PCI devices and do the early EEH probe before the device is scanned. This early probe is a no-op in a lot of cases because: a) The early init is only required to satisfy a PAPR requirement that EEH be configured before we start doing config accesses. On PowerNV it is a no-op. b) It's a no-op for devices that have already had their eeh_dev initialised. There are four callers of pci_hp_add_devices(): 1. arch/powerpc/kernel/eeh_driver.c Here the hotplug helper is called when re-scanning pci_devs that were removed during an EEH recovery pass. The EEH stat for each removed device (the eeh_dev) is retained across a recovery pass so the early init is a no-op in this case. 2. drivers/pci/hotplug/pnv_php.c This is also a no-op since the PowerNV hotplug driver is, suprisingly, PowerNV specific. 3. drivers/pci/hotplug/rpaphp_core.c 4. drivers/pci/hotplug/rpaphp_pci.c In these two cases new devices have been hotplugged and FW has provided new DT nodes for each. These are the only two cases where the EEH we might have new PCI device nodes in the DT so these are the only two cases where the early EEH probe needs to be done. We can move the calls to eeh_add_device_tree_early() to the locations where it's needed and remove it from the generic path. This is preparation for making the early EEH probe pseries specific. Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/pci-hotplug.c | 2 -- drivers/pci/hotplug/rpaphp_core.c | 2 ++ drivers/pci/hotplug/rpaphp_pci.c | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index d6a67f8..bf83f76 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -112,8 +112,6 @@ void pci_hp_add_devices(struct pci_bus *bus) struct pci_controller *phb; struct device_node *dn = pci_bus_to_OF_node(bus); - eeh_add_device_tree_early(PCI_DN(dn)); - phb = pci_bus_to_host(bus); mode = PCI_PROBE_NORMAL; diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index e408e40..9c1e43e 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -494,6 +494,8 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) return retval; if (state == PRESENT) { + eeh_add_device_tree_early(PCI_DN(slot->dn)); + pci_lock_rescan_remove(); pci_hp_add_devices(slot->bus); pci_unlock_rescan_remove(); diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index beca61b..61ebbd8 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -95,8 +95,10 @@ int rpaphp_enable_slot(struct slot *slot) return -EINVAL; } - if (list_empty(&bus->devices)) + if (list_empty(&bus->devices)) { + eeh_add_device_tree_early(PCI_DN(slot->dn)); pci_hp_add_devices(bus); + } if (!list_empty(&bus->devices)) { slot->state = CONFIGURED; -- 2.9.5
[PATCH v2 4/6] powerpc/eeh: Remove PHB check in probe
This check for a missing PHB has existing in various forms since the initial PPC64 port was upstreamed in 2002. The idea seems to be that we need to guard against creating pci-specific data structures for the non-pci children of a PCI device tree node (e.g. USB devices). However, we only create pci_dn structures for DT nodes that correspond to PCI devices so there's not much point in doing this check in the eeh_probe path. Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/eeh.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9cb3370..a9e4ca7 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1120,7 +1120,6 @@ core_initcall_sync(eeh_init); */ void eeh_add_device_early(struct pci_dn *pdn) { - struct pci_controller *phb = pdn ? pdn->phb : NULL; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); if (!edev) @@ -1129,11 +1128,6 @@ void eeh_add_device_early(struct pci_dn *pdn) if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) return; - /* USB Bus children of PCI devices will not have BUID's */ - if (NULL == phb || - (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid)) - return; - eeh_ops->probe(pdn, NULL); } -- 2.9.5
[PATCH v2 5/6] powerpc/eeh: Make early EEH init pseries specific
The eeh_ops->probe() function is called from two different contexts: 1. On pseries, where we set EEH_PROBE_MODE_DEVTREE, it's called in eeh_add_device_early() which is supposed to run before we create a pci_dev. 2. On PowerNV, where we set EEH_PROBE_MODE_DEV, it's called in eeh_device_add_late() which is supposed to run *after* the pci_dev is created. The "early" probe is required because PAPR requires that we perform an RTAS call to enable EEH support on a device before we start interacting with it via config space or MMIO. This requirement doesn't exist on PowerNV and shoehorning two completely separate initialisation paths into a common interface just results in a convoluted code everywhere. Additionally the early probe requires the probe function to take an pci_dn rather than a pci_dev argument. We'd like to make pci_dn a pseries specific data structure since there's no real requirement for them on PowerNV. To help both goals move the early probe into the pseries containment zone so the platform depedence is more explicit. Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran --- v2: s/set set/we set/ in the commit message --- arch/powerpc/include/asm/eeh.h | 14 +++--- arch/powerpc/kernel/eeh.c| 46 arch/powerpc/kernel/of_platform.c| 6 +-- arch/powerpc/platforms/powernv/eeh-powernv.c | 6 --- arch/powerpc/platforms/pseries/eeh_pseries.c | 65 ++-- arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- drivers/pci/hotplug/rpadlpar_core.c | 2 +- drivers/pci/hotplug/rpaphp_core.c| 2 +- drivers/pci/hotplug/rpaphp_pci.c | 2 +- 9 files changed, 64 insertions(+), 81 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 5d10781..8580238 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -301,8 +301,6 @@ int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); void eeh_addr_cache_init(void); -void eeh_add_device_early(struct pci_dn *); -void eeh_add_device_tree_early(struct pci_dn *); void eeh_add_device_late(struct pci_dev *); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); @@ -358,10 +356,6 @@ static inline int eeh_check_failure(const volatile void __iomem *token) static inline void eeh_addr_cache_init(void) { } -static inline void eeh_add_device_early(struct pci_dn *pdn) { } - -static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { } - static inline void eeh_add_device_late(struct pci_dev *dev) { } static inline void eeh_remove_device(struct pci_dev *dev) { } @@ -370,6 +364,14 @@ static inline void eeh_remove_device(struct pci_dev *dev) { } #define EEH_IO_ERROR_VALUE(size) (-1UL) #endif /* CONFIG_EEH */ +#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_EEH) +void pseries_eeh_init_edev(struct pci_dn *pdn); +void pseries_eeh_init_edev_recursive(struct pci_dn *pdn); +#else +static inline void pseries_eeh_add_device_early(struct pci_dn *pdn) { } +static inline void pseries_eeh_add_device_tree_early(struct pci_dn *pdn) { } +#endif + #ifdef CONFIG_PPC64 /* * MMIO read/write operations with EEH support. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index a9e4ca7..55d3ef6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1107,52 +1107,6 @@ static int eeh_init(void) core_initcall_sync(eeh_init); /** - * eeh_add_device_early - Enable EEH for the indicated device node - * @pdn: PCI device node for which to set up EEH - * - * This routine must be used to perform EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - * This routine must be called before any i/o is performed to the - * adapter (inluding any config-space i/o). - * Whether this actually enables EEH or not for this device depends - * on the CEC architecture, type of the device, on earlier boot - * command-line arguments & etc. - */ -void eeh_add_device_early(struct pci_dn *pdn) -{ - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - - if (!edev) - return; - - if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) - return; - - eeh_ops->probe(pdn, NULL); -} - -/** - * eeh_add_device_tree_early - Enable EEH for the indicated device - * @pdn: PCI device node - * - * This routine must be used to perform EEH initialization for the - * indicated PCI device that was added after system boot (e.g. - * hotplug, dlpar). - */ -void eeh_add_device_tree_early(struct pci_dn *pdn) -{ - struct pci_dn *n; - - if (!pdn) - return; - - list_for_each_entry(n, &pdn->child_list, list) - eeh_add_device_tree_early(n); - eeh_add_d
[PATCH v2 6/6] powerpc/eeh: Rework eeh_ops->probe()
With the EEH early probe now being pseries specific there's no need for eeh_ops->probe() to take a pci_dn. Instead, we can make it take a pci_dev and use the probe function to map a pci_dev to an eeh_dev. This allows the platform to implement it's own method for finding (or creating) an eeh_dev for a given pci_dev which also removes a use of pci_dn in generic EEH code. This patch also renames eeh_device_add_late() to eeh_device_probe(). This better reflects what it does does and removes the last vestiges of the early/late EEH probe split. Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran --- v2: Fixed the comment block above eeh_probe_device() to use the new function name. --- arch/powerpc/include/asm/eeh.h | 6 ++-- arch/powerpc/kernel/eeh.c| 44 +++- arch/powerpc/platforms/powernv/eeh-powernv.c | 30 +-- arch/powerpc/platforms/pseries/eeh_pseries.c | 23 ++- 4 files changed, 62 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 8580238..964a542 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -215,7 +215,7 @@ enum { struct eeh_ops { char *name; int (*init)(void); - void* (*probe)(struct pci_dn *pdn, void *data); + struct eeh_dev *(*probe)(struct pci_dev *pdev); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); int (*get_state)(struct eeh_pe *pe, int *delay); @@ -301,7 +301,7 @@ int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); void eeh_addr_cache_init(void); -void eeh_add_device_late(struct pci_dev *); +void eeh_probe_device(struct pci_dev *pdev); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); @@ -356,7 +356,7 @@ static inline int eeh_check_failure(const volatile void __iomem *token) static inline void eeh_addr_cache_init(void) { } -static inline void eeh_add_device_late(struct pci_dev *dev) { } +static inline void eeh_probe_device(struct pci_dev *dev) { } static inline void eeh_remove_device(struct pci_dev *dev) { } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 55d3ef6..7cdcb41 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1107,35 +1107,43 @@ static int eeh_init(void) core_initcall_sync(eeh_init); /** - * eeh_add_device_late - Perform EEH initialization for the indicated pci device + * eeh_probe_device() - Perform EEH initialization for the indicated pci device * @dev: pci device for which to set up EEH * * This routine must be used to complete EEH initialization for PCI * devices that were added after system boot (e.g. hotplug, dlpar). */ -void eeh_add_device_late(struct pci_dev *dev) +void eeh_probe_device(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; - if (!dev) + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + /* +* pci_dev_to_eeh_dev() can only work if eeh_probe_dev() was +* already called for this device. +*/ + if (WARN_ON_ONCE(pci_dev_to_eeh_dev(dev))) { + pci_dbg(dev, "Already bound to an eeh_dev!\n"); return; + } - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - edev = pdn_to_eeh_dev(pdn); - eeh_edev_dbg(edev, "Adding device\n"); - if (edev->pdev == dev) { - eeh_edev_dbg(edev, "Device already referenced!\n"); + edev = eeh_ops->probe(dev); + if (!edev) { + pr_debug("EEH: Adding device failed\n"); return; } /* -* The EEH cache might not be removed correctly because of -* unbalanced kref to the device during unplug time, which -* relies on pcibios_release_device(). So we have to remove -* that here explicitly. +* FIXME: We rely on pcibios_release_device() to remove the +* existing EEH state. The release function is only called if +* the pci_dev's refcount drops to zero so if something is +* keeping a ref to a device (e.g. a filesystem) we need to +* remove the old EEH state. +* +* FIXME: HEY MA, LOOK AT ME, NO LOCKING! */ - if (edev->pdev) { + if (edev->pdev && edev->pdev != dev) { eeh_rmv_from_parent_pe(edev); eeh_addr_cache_rmv_dev(edev->pdev); eeh_sysfs_remove_device(edev->pdev); @@ -1146,17 +1154,11 @@ void eeh_add_device_late(struct pci_dev *dev) * into error handler afterwards. */ edev->
Re: [PATCHv3 2/2] pseries/scm: buffer pmem's bound addr in dt for kexec kernel
On Wed, Mar 4, 2020 at 7:50 PM Pingfan Liu wrote: > > At present, plpar_hcall(H_SCM_BIND_MEM, ...) takes a very long time, so > if dumping to fsdax, it will take a very long time. > > Take a closer look, during the papr_scm initialization, the only > configuration is through drc_pmem_bind()-> plpar_hcall(H_SCM_BIND_MEM, > ...), which helps to set up the bound address. > > On pseries, for kexec -l/-p kernel, there is no reset of hardware, and this > step can be stepped around to save times. So the pmem bound address can be > passed to the 2nd kernel through a dynamic added property "bound-addr" in > dt node 'ibm,pmemory'. > > Signed-off-by: Pingfan Liu > To: linuxppc-dev@lists.ozlabs.org > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: Hari Bathini > Cc: Aneesh Kumar K.V > Cc: Oliver O'Halloran > Cc: Dan Williams > Cc: Andrew Donnellan > Cc: Christophe Leroy > Cc: Rob Herring > Cc: Frank Rowand > Cc: ke...@lists.infradead.org > --- > note: This patch has not been tested since I can not get such a pseries with > pmem. > Please kindly to give some suggestion, thanks. There was some qemu patches to implement the Hcall interface floating around a while ago. I'm not sure they ever made it into upstream qemu though. > --- > arch/powerpc/platforms/pseries/of_helpers.c | 1 + > arch/powerpc/platforms/pseries/papr_scm.c | 33 > - > drivers/of/base.c | 1 + > 3 files changed, 25 insertions(+), 10 deletions(-) > > diff --git a/arch/powerpc/platforms/pseries/of_helpers.c > b/arch/powerpc/platforms/pseries/of_helpers.c > index 1022e0f..2c7bab4 100644 > --- a/arch/powerpc/platforms/pseries/of_helpers.c > +++ b/arch/powerpc/platforms/pseries/of_helpers.c > @@ -34,6 +34,7 @@ struct property *new_property(const char *name, const int > length, > kfree(new); > return NULL; > } > +EXPORT_SYMBOL(new_property); > > /** > * pseries_of_derive_parent - basically like dirname(1) > diff --git a/arch/powerpc/platforms/pseries/papr_scm.c > b/arch/powerpc/platforms/pseries/papr_scm.c > index 0b4467e..54ae903 100644 > --- a/arch/powerpc/platforms/pseries/papr_scm.c > +++ b/arch/powerpc/platforms/pseries/papr_scm.c > @@ -14,6 +14,7 @@ > #include > > #include > +#include "of_helpers.h" > > #define BIND_ANY_ADDR (~0ul) > > @@ -383,7 +384,7 @@ static int papr_scm_probe(struct platform_device *pdev) > { > struct device_node *dn = pdev->dev.of_node; > u32 drc_index, metadata_size; > - u64 blocks, block_size; > + u64 blocks, block_size, bound_addr = 0; > struct papr_scm_priv *p; > const char *uuid_str; > u64 uuid[2]; > @@ -440,17 +441,29 @@ static int papr_scm_probe(struct platform_device *pdev) > p->metadata_size = metadata_size; > p->pdev = pdev; > > - /* request the hypervisor to bind this region to somewhere in memory > */ > - rc = drc_pmem_bind(p); > + of_property_read_u64(dn, "bound-addr", &bound_addr); > + if (bound_addr) { > + p->bound_addr = bound_addr; > + } else { > + struct property *property; > + u64 big; > > - /* If phyp says drc memory still bound then force unbound and retry */ > - if (rc == H_OVERLAP) > - rc = drc_pmem_query_n_bind(p); > + /* request the hypervisor to bind this region to somewhere in > memory */ > + rc = drc_pmem_bind(p); > > - if (rc != H_SUCCESS) { > - dev_err(&p->pdev->dev, "bind err: %d\n", rc); > - rc = -ENXIO; > - goto err; > + /* If phyp says drc memory still bound then force unbound and > retry */ > + if (rc == H_OVERLAP) > + rc = drc_pmem_query_n_bind(p); > + > + if (rc != H_SUCCESS) { > + dev_err(&p->pdev->dev, "bind err: %d\n", rc); > + rc = -ENXIO; > + goto err; > + } > + big = cpu_to_be64(p->bound_addr); > + property = new_property("bound-addr", sizeof(u64), (const > unsigned char *)&big, > + NULL); That should probably be "linux,bound-addr" The other thing that stands out to me is that you aren't removing the property when the region is unbound. As a general rule I'd prefer we didn't hack the DT at runtime, but if we are going to then we s
Re: [PATCH v8 04/14] powerpc/vas: Alloc and setup IRQ and trigger port address
On Mon, Mar 23, 2020 at 8:28 PM Cédric Le Goater wrote: > > On 3/23/20 10:06 AM, Cédric Le Goater wrote: > > On 3/19/20 7:14 AM, Haren Myneni wrote: > >> > >> Alloc IRQ and get trigger port address for each VAS instance. Kernel > >> register this IRQ per VAS instance and sets this port for each send > >> window. NX interrupts the kernel when it sees page fault. > > > > I don't understand why this is not done by the OPAL driver for each VAS > > of the system. Is the VAS unit very different from OpenCAPI regarding > > the fault ? > > I checked the previous patchsets and I see that v3 was more like I expected > it: one interrupt for faults allocated by the skiboot driver and exposed > in the DT. > > What made you change your mind ? >From init_vas_inst() in arch/powerpc/platforms/powernv/vas.c: if (pdev->num_resources != 4) { pr_err("Unexpected DT configuration for [%s, %d]\n", pdev->name, vasid); return -ENODEV; } This code should never have been written, but here we are. Due to the above adding an interrupt in the DT makes the driver unable to bind on older kernels. In an older version of the patches (don't think it was posted) Haren was using a non-standard interrupt property and we could work around the problem by going back to that. However, we already have the OPAL calls for allocating / freeing hardware interrupt numbers so why not do that? If we ever want to take advantage of the job completion interrupts we'd want to have the ability to allocate them since the completion interrupts are per-window rather than per-VAS. > This version is hijacking the lowlevel routines of the XIVE irqchip which > is not the best approach. OCXL is doing that because it needs to allocate > interrupts for the user space processes using the AFU and we should rework > that part. What'd you have in mind for the reworking the oxcl interrupt allocation? I didn't find it that objectionable since it's more or less the same as what happens when allocating IPIs. > However, the translation fault interrupt is allocated by skiboot. > > Sorry for the noise, I would like to understand more how this works. I also > have passthrough in mind. > > C.
Re: [PATCH v4 00/25] Add support for OpenCAPI Persistent Memory devices
On Thu, Apr 2, 2020 at 2:42 PM Michael Ellerman wrote: > > "Alastair D'Silva" writes: > >> -Original Message- > >> From: Dan Williams > >> > >> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva > >> wrote: > >> > > >> > *snip* > >> Are OPAL calls similar to ACPI DSMs? I.e. methods for the OS to invoke > >> platform firmware services? What's Skiboot? > >> > > > > Yes, OPAL is the interface to firmware for POWER. Skiboot is the > > open-source (and only) implementation of OPAL. > > https://github.com/open-power/skiboot > > In particular the tokens for calls are defined here: > > https://github.com/open-power/skiboot/blob/master/include/opal-api.h#L220 > > And you can grep for the token to find the implementation: > > https://github.com/open-power/skiboot/blob/master/hw/npu2-opencapi.c#L2328 I'm not sure I'd encourage anyone to read npu2-opencapi.c. I find it hard enough to follow even with access to the workbooks. There's an OPAL call API reference here: http://open-power.github.io/skiboot/doc/opal-api/index.html Oliver
Re: [PATCH v3 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests
On Fri, Apr 3, 2020 at 6:55 AM Leonardo Bras wrote: > > While providing guests, it's desirable to resize it's memory on demand. > > By now, it's possible to do so by creating a guest with a small base > memory, hot-plugging all the rest, and using 'movable_node' kernel > command-line parameter, which puts all hot-plugged memory in > ZONE_MOVABLE, allowing it to be removed whenever needed. > > But there is an issue regarding guest reboot: > If memory is hot-plugged, and then the guest is rebooted, all hot-plugged > memory goes to ZONE_NORMAL, which offers no guaranteed hot-removal. > It usually prevents this memory to be hot-removed from the guest. > > It's possible to use device-tree information to fix that behavior, as > it stores flags for LMB ranges on ibm,dynamic-memory-vN. > It involves marking each memblock with the correct flags as hotpluggable > memory, which mm/memblock.c puts in ZONE_MOVABLE during boot if > 'movable_node' is passed. I don't really understand why the flag is needed at all. According to PAPR any memory provided by dynamic reconfiguration can be hot-removed so why aren't we treating all DR memory as hot removable? The only memory guaranteed to be there 100% of the time is what's in the /memory@0 node since that's supposed to cover the real mode area.
Re: [PATCH v3 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests
On Fri, Apr 3, 2020 at 10:07 AM Leonardo Bras wrote: > > Hello Oliver, thank you for the feedback. > Comments inline: > > On Fri, 2020-04-03 at 09:46 +1100, Oliver O'Halloran wrote: > > > > I don't really understand why the flag is needed at all. According to > > PAPR any memory provided by dynamic reconfiguration can be hot-removed > > so why aren't we treating all DR memory as hot removable? The only > > memory guaranteed to be there 100% of the time is what's in the > > /memory@0 node since that's supposed to cover the real mode area. > > All LMBs are listed in DR memory, even the base memory. > > The v1 of the patch would work this way, as qemu would configure it's > DR memory with (DRC_INVALID | RESERVED) flags and the hot-added memory > with (ASSIGNED) flag. Looking for assigned flag would be enough. > > But as of today, PowerVM doesn't seem to work that way. > When you boot a PowerVM virtual machine with Linux, all memory is added > with the same flags (ASSIGNED). > > To create a solution that doesn't break PowerVM, this new flag was made > necessary. I'm still not convinced it's necessary. Why not check memory@0 and use the size as a clip level? Any memory above that level gets marked as hotpluggable and anything below doesn't. Seems to me that would work on all current platforms, so what am I missing here? > > Best regards, > Leonardo Bras
Re: [PATCH 1/4] powerpc/eeh: fix pseries_eeh_configure_bridge()
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote: > If a device is hot unplgged during EEH recovery, it's possible for the > RTAS call to ibm,configure-pe in pseries_eeh_configure() to return > parameter error (-3), however negative return values are not checked > for and this leads to an infinite loop. > > Fix this by correctly bailing out on negative values. > This should probably be a standalone patch. Looks fine otherwise. Reviewed-by: Oliver O'Halloran > Signed-off-by: Sam Bobroff > --- > arch/powerpc/platforms/pseries/eeh_pseries.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c > b/arch/powerpc/platforms/pseries/eeh_pseries.c > index 893ba3f562c4..c4ef03bec0de 100644 > --- a/arch/powerpc/platforms/pseries/eeh_pseries.c > +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c > @@ -605,7 +605,7 @@ static int pseries_eeh_configure_bridge(struct eeh_pe *pe) > config_addr, BUID_HI(pe->phb->buid), > BUID_LO(pe->phb->buid)); > > - if (!ret) > + if (ret <= 0) > return ret; > > /*
Re: [PATCH 2/4] powerpc/eeh: Release EEH device state synchronously
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote: > EEH device state is currently removed (by eeh_remove_device()) during > the device release handler, which is invoked as the device's reference > count drops to zero. This may take some time, or forever, as other > threads may hold references. > > However, the PCI device state is released synchronously by > pci_stop_and_remove_bus_device(). This mismatch causes problems, for > example the device may be re-discovered as a new device before the > release handler has been called, leaving the PCI and EEH state > mismatched. > > So instead, call eeh_remove_device() from the bus device removal > handlers, which are called synchronously in the removal path. > > Signed-off-by: Sam Bobroff > --- > arch/powerpc/kernel/eeh.c | 26 ++ > arch/powerpc/kernel/pci-hotplug.c | 2 -- > 2 files changed, 26 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index 17cb3e9b5697..c36c5a7db5ca 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -1106,6 +1106,32 @@ static int eeh_init(void) > > core_initcall_sync(eeh_init); > > +static int eeh_device_notifier(struct notifier_block *nb, > +unsigned long action, void *data) > +{ > + struct device *dev = data; > + > + switch (action) { > + case BUS_NOTIFY_DEL_DEVICE: > + eeh_remove_device(to_pci_dev(dev)); > + break; > + default: > + break; > + } A comment briefly explaining why we're not doing anything in the add case might be nice. Reviewed-by: Oliver O'Halloran > + return NOTIFY_DONE; > +} > + > +static struct notifier_block eeh_device_nb = { > + .notifier_call = eeh_device_notifier, > +}; > + > +static __init int eeh_set_bus_notifier(void) > +{ > + bus_register_notifier(&pci_bus_type, &eeh_device_nb); > + return 0; > +} > +arch_initcall(eeh_set_bus_notifier); > + > /** > * eeh_add_device_early - Enable EEH for the indicated device node > * @pdn: PCI device node for which to set up EEH > diff --git a/arch/powerpc/kernel/pci-hotplug.c > b/arch/powerpc/kernel/pci-hotplug.c > index d6a67f814983..28e9aa274f64 100644 > --- a/arch/powerpc/kernel/pci-hotplug.c > +++ b/arch/powerpc/kernel/pci-hotplug.c > @@ -57,8 +57,6 @@ void pcibios_release_device(struct pci_dev *dev) > struct pci_controller *phb = pci_bus_to_host(dev->bus); > struct pci_dn *pdn = pci_get_pdn(dev); > > - eeh_remove_device(dev); > - > if (phb->controller_ops.release_device) > phb->controller_ops.release_device(dev); >
Re: [PATCH 4/4] powerpc/eeh: Clean up edev cleanup for VFs
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote: > Because the bus notifier calls eeh_rmv_from_parent_pe() (via > eeh_remove_device()) when a VF is removed, the call in > remove_sriov_vf_pdns() is redundant. eeh_rmv_from_parent_pe() won't actually remove the device if the recovering flag is set on the PE. Are you sure we're not introducing a race here?
Re: [PATCH 3/4] powerpc/eeh: Remove workaround from eeh_add_device_late()
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote: > When EEH device state was released asynchronously by the device > release handler, it was possible for an outstanding reference to > prevent it's release and it was necessary to work around that if a > device was re-discovered at the same PCI location. I think this is a bit misleading. The main situation where you'll hit this hack is when recovering a device with a driver that doesn't implement the error handling callbacks. In that case the device is removed, reset, then re-probed by the PCI core, but we assume it's the same physical device so the eeh_device state remains active. If you actually changed the underlying device I suspect something bad would happen. > Now that the state is released synchronously that is no longer > possible and the workaround is no longer necessary. You could probably fold this into the previous patch, but eh. You could probably fold this into the previous patch, but eh. > Signed-off-by: Sam Bobroff > --- > arch/powerpc/kernel/eeh.c | 23 +-- > 1 file changed, 1 insertion(+), 22 deletions(-) > > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index c36c5a7db5ca..12c248a16527 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -1206,28 +1206,7 @@ void eeh_add_device_late(struct pci_dev *dev) > eeh_edev_dbg(edev, "Device already referenced!\n"); > return; > } > - > - /* > - * The EEH cache might not be removed correctly because of > - * unbalanced kref to the device during unplug time, which > - * relies on pcibios_release_device(). So we have to remove > - * that here explicitly. > - */ > - if (edev->pdev) { > - eeh_rmv_from_parent_pe(edev); > - eeh_addr_cache_rmv_dev(edev->pdev); > - eeh_sysfs_remove_device(edev->pdev); > - > - /* > - * We definitely should have the PCI device removed > - * though it wasn't correctly. So we needn't call > - * into error handler afterwards. > - */ > - edev->mode |= EEH_DEV_NO_HANDLER; > - > - edev->pdev = NULL; > - dev->dev.archdata.edev = NULL; > - } > + WARN_ON_ONCE(edev->pdev); > > if (eeh_has_flag(EEH_PROBE_MODE_DEV)) > eeh_ops->probe(pdn, NULL);
[PATCH 1/7] powerpc/powernv/npu: Clean up compound table group initialisation
Re-work the control flow a bit so what's going on is a little clearer. This also ensures the table_group is only initialised once in the P9 case. This shouldn't be a functional change since all the GPU PCI devices should have the same table_group configuration, but it does look strange. Cc: Alexey Kardashevskiy Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 46 +++- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index b95b9e3c4c98..de617549c9a3 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -427,7 +427,7 @@ static void pnv_comp_attach_table_group(struct npu_comp *npucomp, struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) { - struct iommu_table_group *table_group; + struct iommu_table_group *compound_group; struct npu_comp *npucomp; struct pci_dev *gpdev = NULL; struct pci_controller *hose; @@ -446,36 +446,32 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) hose = pci_bus_to_host(npdev->bus); if (hose->npu) { - table_group = &hose->npu->npucomp.table_group; - - if (!table_group->group) { - table_group->ops = &pnv_npu_peers_ops; - iommu_register_group(table_group, - hose->global_number, - pe->pe_number); - } + /* P9 case: compound group is per-NPU (all gpus, all links) */ + npucomp = &hose->npu->npucomp; } else { - /* Create a group for 1 GPU and attached NPUs for POWER8 */ - pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL); - table_group = &pe->npucomp->table_group; - table_group->ops = &pnv_npu_peers_ops; - iommu_register_group(table_group, hose->global_number, - pe->pe_number); + /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */ + npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL); } - /* Steal capabilities from a GPU PE */ - table_group->max_dynamic_windows_supported = - pe->table_group.max_dynamic_windows_supported; - table_group->tce32_start = pe->table_group.tce32_start; - table_group->tce32_size = pe->table_group.tce32_size; - table_group->max_levels = pe->table_group.max_levels; - if (!table_group->pgsizes) - table_group->pgsizes = pe->table_group.pgsizes; + compound_group = &npucomp->table_group; + if (!compound_group->group) { + compound_group->ops = &pnv_npu_peers_ops; + iommu_register_group(compound_group, hose->global_number, + pe->pe_number); + + /* Steal capabilities from a GPU PE */ + compound_group->max_dynamic_windows_supported = + pe->table_group.max_dynamic_windows_supported; + compound_group->tce32_start = pe->table_group.tce32_start; + compound_group->tce32_size = pe->table_group.tce32_size; + compound_group->max_levels = pe->table_group.max_levels; + if (!compound_group->pgsizes) + compound_group->pgsizes = pe->table_group.pgsizes; + } - npucomp = container_of(table_group, struct npu_comp, table_group); pnv_comp_attach_table_group(npucomp, pe); - return table_group; + return compound_group; } struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) -- 2.21.1
Make PowerNV IOMMU group setup saner (and fix it for hotpug)
Currently on PowerNV the IOMMU group of a device is initialised in boot-time fixup which runs after devices are probed. Because this is only run at boot time hotplugged devices do not recieve an iommu group assignment which prevents them from being passed through to a guest. This series fixes that by moving the point where IOMMU groups are registered to when we configure DMA for a PE, and moves the point where we add a device to the PE's IOMMU group into the per-device DMA setup callback for IODA phbs (pnv_pci_ioda_dma_dev_setup()). This change means that we'll do group setup for hotplugged devices and that we can remove the hack we have for VFs which are currently added to their group via a bus notifier. With this change there's no longer any per-device setup that needs to run in a fixup for ordinary PCI devices. The exception is, as per usual, NVLink devices. For those the GPU and any of it's NVLink devices need to be in a "compound" IOMMU group which keeps the DMA address spaces of each device in sync with it's attached devices. As a result that setup can only be done when both the NVLink devices and the GPU device has been probed, so that setup is still done in the fixup. Sucks, but it's still an improvement. Boot tested on a witherspoon with 6xGPUs and it didn't crash so it must be good.
[PATCH 2/7] powerpc/powernv/iov: Don't add VFs to iommu group during PE config
In pnv_ioda_setup_vf_PE() we register an iommu group for the VF PE then call pnv_ioda_setup_bus_iommu_group() to add devices to that group. However, this function is called before the VFs are scanned so there's no devices to add. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57d3a6af1d52..2c340504fa77 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1622,7 +1622,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) #ifdef CONFIG_IOMMU_API iommu_register_group(&pe->table_group, pe->phb->hose->global_number, pe->pe_number); - pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL); #endif } } -- 2.21.1
[PATCH 3/7] powerpc/powernv/pci: Register iommu group at PE DMA setup
Move the registration of IOMMU groups out of the post-phb init fixup and into when we configure DMA for a PE. For most devices this doesn't result in any functional changes, but for NVLink attached GPUs it requires a bit of care. When the GPU is probed an IOMMU group would be created for the PE that contains it. We need to ensure that group is removed before we add the PE to the compound group that's used to keep the translations see by the PCIe and NVLink buses the same. No functional changes. Probably. Cc: Alexey Kardashevskiy Cc: Reza Arbab Cc: Alistair Popple Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 9 + arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index de617549c9a3..4fbbdfa8b327 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -469,6 +469,15 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) compound_group->pgsizes = pe->table_group.pgsizes; } + /* + * I'm not sure this is strictly required, but it's probably a good idea + * since the table_group for the PE is going to be attached to the + * compound table group. If we leave the PE's iommu group active then + * we might have the same table_group being modifiable via two sepeate + * iommu groups. + */ + iommu_group_put(pe->table_group.group); + pnv_comp_attach_table_group(npucomp, pe); return compound_group; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2c340504fa77..cf0aaef1b8fa 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1619,10 +1619,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) } pnv_pci_ioda2_setup_dma_pe(phb, pe); -#ifdef CONFIG_IOMMU_API - iommu_register_group(&pe->table_group, - pe->phb->hose->global_number, pe->pe_number); -#endif } } @@ -2661,9 +2657,6 @@ static void pnv_pci_ioda_setup_iommu_api(void) continue; table_group = &pe->table_group; - iommu_register_group(&pe->table_group, - pe->phb->hose->global_number, - pe->pe_number); } pnv_ioda_setup_bus_iommu_group(pe, table_group, pe->pbus); @@ -2748,14 +2741,17 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, IOMMU_TABLE_GROUP_MAX_TABLES; pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb); -#ifdef CONFIG_IOMMU_API - pe->table_group.ops = &pnv_pci_ioda2_ops; -#endif rc = pnv_pci_ioda2_setup_default_config(pe); if (rc) return; +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; + iommu_register_group(&pe->table_group, phb->hose->global_number, +pe->pe_number); +#endif + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); } -- 2.21.1
[PATCH 4/7] powerpc/powernv/pci: Add device to iommu group during dma_dev_setup()
Historically adding devices to their respective iommu group has been handled by the post-init phb fixup for most devices. This was done because: 1) The IOMMU group is tied to the PE (usually) so we can only setup the iommu groups after we've done resource allocation since BAR location determines the device's PE, and: 2) The sysfs directory for the pci_dev needs to be available since iommu_add_device() wants to add an attribute for the iommu group. However, since commit 30d87ef8b38d ("powerpc/pci: Fix pcibios_setup_device() ordering") both conditions are met when hose->ops->dma_dev_setup() is called so there's no real need to do this in the fixup. Moving the call to iommu_add_device() into pnv_pci_ioda_dma_setup_dev() is a nice cleanup since it puts all the per-device IOMMU setup into one place. It also results in all (non-nvlink) devices getting their iommu group via a common path rather than relying on the bus notifier hack in pnv_tce_iommu_bus_notifier() to handle the adding VFs and hotplugged devices to their group. Cc: Alexey Kardashevskiy Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 8 arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++ arch/powerpc/platforms/powernv/pci.c | 20 -- 3 files changed, 21 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 4fbbdfa8b327..df27b8d7e78f 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -469,6 +469,12 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) compound_group->pgsizes = pe->table_group.pgsizes; } + /* +* The gpu would have been added to the iommu group that's created +* for the PE. Pull it out now. +*/ + iommu_del_device(&gpdev->dev); + /* * I'm not sure this is strictly required, but it's probably a good idea * since the table_group for the PE is going to be attached to the @@ -478,7 +484,9 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) */ iommu_group_put(pe->table_group.group); + /* now put the GPU into the compound group */ pnv_comp_attach_table_group(npucomp, pe); + iommu_add_device(compound_group, &gpdev->dev); return compound_group; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index cf0aaef1b8fa..9198b7882b57 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1774,12 +1774,10 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); - /* -* Note: iommu_add_device() will fail here as -* for physical PE: the device is already added by now; -* for virtual PE: sysfs entries are not ready yet and -* tce_iommu_bus_notifier will add the device to a group later. -*/ + + /* PEs with a DMA weight of zero won't have a group */ + if (pe->table_group.group) + iommu_add_device(&pe->table_group, &pdev->dev); } /* @@ -2628,39 +2626,20 @@ static void pnv_pci_ioda_setup_iommu_api(void) struct pnv_ioda_pe *pe; /* -* There are 4 types of PEs: -* - PNV_IODA_PE_BUS: a downstream port with an adapter, -* created from pnv_pci_setup_bridge(); -* - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it, -* created from pnv_pci_setup_bridge(); -* - PNV_IODA_PE_VF: a SRIOV virtual function, -* created from pnv_pcibios_sriov_enable(); -* - PNV_IODA_PE_DEV: an NPU or OCAPI device, -* created from pnv_pci_ioda_fixup(). +* For non-nvlink devices the IOMMU group is registered when the PE is +* configured and devices are added to the group when the per-device +* DMA setup is run. That's done in hose->ops.dma_dev_setup() which is +* only initialise for "normal" IODA PHBs. * -* Normally a PE is represented by an IOMMU group, however for -* devices with side channels the groups need to be more strict. +* For NVLink devices we need to ensure the NVLinks and the GPU end up +* in the same IOMMU group, so that's handled here. */ list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; - if (phb->type == PNV_PHB_NPU_NVLINK || - phb->type == PNV_PHB_NPU_
[PATCH 5/7] powerpc/powernv/pci: Delete old iommu recursive iommu setup
No longer used. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 32 --- 1 file changed, 32 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9198b7882b57..8b45b8e561e9 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1550,11 +1550,6 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); -#ifdef CONFIG_IOMMU_API -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, struct pci_bus *bus); - -#endif static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) { struct pci_bus*bus; @@ -2590,33 +2585,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .release_ownership = pnv_ioda2_release_ownership, }; -static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, - struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - iommu_add_device(table_group, &dev->dev); - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, - table_group, dev->subordinate); - } -} - -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, struct pci_bus *bus) -{ - - if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(table_group, &pe->pdev->dev); - - if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group, - bus); -} - static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); static void pnv_pci_ioda_setup_iommu_api(void) -- 2.21.1
[PATCH 6/7] powerpc/powernv/pci: Move tce size parsing to pci-ioda-tce.c
Move it in with the rest of the TCE wrangling rather than carting around a static prototype in pci-ioda.c Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 28 + arch/powerpc/platforms/powernv/pci-ioda.c | 30 --- arch/powerpc/platforms/powernv/pci.h | 2 ++ 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 5dc6847d5f4c..f923359d8afc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -17,6 +17,34 @@ #include #include "pci.h" +unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + struct device_node *dn = hose->dn; + unsigned long mask = 0; + int i, rc, count; + u32 val; + + count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes"); + if (count <= 0) { + mask = SZ_4K | SZ_64K; + /* Add 16M for POWER8 by default */ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + !cpu_has_feature(CPU_FTR_ARCH_300)) + mask |= SZ_16M | SZ_256M; + return mask; + } + + for (i = 0; i < count; i++) { + rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes", + i, &val); + if (rc == 0) + mask |= 1ULL << val; + } + + return mask; +} + void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned int page_shift) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8b45b8e561e9..c020ade3a846 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2585,8 +2585,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .release_ownership = pnv_ioda2_release_ownership, }; -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); - static void pnv_pci_ioda_setup_iommu_api(void) { struct pci_controller *hose; @@ -2638,34 +2636,6 @@ static void pnv_pci_ioda_setup_iommu_api(void) static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) -{ - struct pci_controller *hose = phb->hose; - struct device_node *dn = hose->dn; - unsigned long mask = 0; - int i, rc, count; - u32 val; - - count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes"); - if (count <= 0) { - mask = SZ_4K | SZ_64K; - /* Add 16M for POWER8 by default */ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - !cpu_has_feature(CPU_FTR_ARCH_300)) - mask |= SZ_16M | SZ_256M; - return mask; - } - - for (i = 0; i < count; i++) { - rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes", - i, &val); - if (rc == 0) - mask |= 1ULL << val; - } - - return mask; -} - static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d3bbdeab3a32..0c5845a1f05d 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -244,4 +244,6 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned int page_shift); +extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); + #endif /* __POWERNV_PCI_H */ -- 2.21.1
[PATCH 7/7] powerpc/powernv/npu: Move IOMMU group setup into npu-dma.c
The NVlink IOMMU group setup is only relevant to NVLink devices so move it into the NPU containment zone. This let us remove some prototypes in pci.h and staticfy some function definitions. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 54 +++- arch/powerpc/platforms/powernv/pci-ioda.c | 60 +++ arch/powerpc/platforms/powernv/pci.h | 6 +-- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index df27b8d7e78f..abeaa533b976 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -15,6 +15,7 @@ #include #include +#include #include #include "pci.h" @@ -425,7 +426,8 @@ static void pnv_comp_attach_table_group(struct npu_comp *npucomp, ++npucomp->pe_num; } -struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) +static struct iommu_table_group * + pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) { struct iommu_table_group *compound_group; struct npu_comp *npucomp; @@ -491,7 +493,7 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) return compound_group; } -struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) +static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) { struct iommu_table_group *table_group; struct npu_comp *npucomp; @@ -534,6 +536,54 @@ struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) return table_group; } + +void pnv_pci_npu_setup_iommu_groups(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + + /* +* For non-nvlink devices the IOMMU group is registered when the PE is +* configured and devices are added to the group when the per-device +* DMA setup is run. That's done in hose->ops.dma_dev_setup() which is +* only initialise for "normal" IODA PHBs. +* +* For NVLink devices we need to ensure the NVLinks and the GPU end up +* in the same IOMMU group, so that's handled here. +*/ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->type == PNV_PHB_IODA2) + list_for_each_entry(pe, &phb->ioda.pe_list, list) + pnv_try_setup_npu_table_group(pe); + } + + /* +* Now we have all PHBs discovered, time to add NPU devices to +* the corresponding IOMMU groups. +*/ + list_for_each_entry(hose, &hose_list, list_node) { + unsigned long pgsizes; + + phb = hose->private_data; + + if (phb->type != PNV_PHB_NPU_NVLINK) + continue; + + pgsizes = pnv_ioda_parse_tce_sizes(phb); + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + /* +* IODA2 bridges get this set up from +* pci_controller_ops::setup_bridge but NPU bridges +* do not have this hook defined so we do it here. +*/ + pe->table_group.pgsizes = pgsizes; + pnv_npu_compound_attach(pe); + } + } +} #endif /* CONFIG_IOMMU_API */ int pnv_npu2_init(struct pci_controller *hose) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c020ade3a846..dba0c2c09f61 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1288,7 +1288,7 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) pnv_ioda_setup_npu_PE(pdev); } -static void pnv_pci_ioda_setup_PEs(void) +static void pnv_pci_ioda_setup_nvlink(void) { struct pci_controller *hose; struct pnv_phb *phb; @@ -1312,6 +1312,11 @@ static void pnv_pci_ioda_setup_PEs(void) list_for_each_entry(pe, &phb->ioda.pe_list, list) pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV); } + +#ifdef CONFIG_IOMMU_API + /* setup iommu groups so we can do nvlink pass-thru */ + pnv_pci_npu_setup_iommu_groups(); +#endif } #ifdef CONFIG_PCI_IOV @@ -2584,56 +2589,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; - -static void pnv_pci_ioda_setup_iommu_api(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - /* -* For non-nvlink devices the IOMMU group is registered when t
Re: [PATCH v3] powerpc/powernv: add NULL check after kzalloc in opal_add_one_export
On Mon, Apr 6, 2020 at 11:15 AM Qiujun Huang wrote: > > On Mon, Apr 6, 2020 at 3:06 AM Markus Elfring wrote: > > > > > Here needs a NULL check. > quite obvious? > > > > I find this change description questionable > > (despite of a reasonable patch subject). > > > > > > > Issue found by coccinelle. > > > > Would an information like “Generated by: > > scripts/coccinelle/null/kmerr.cocci” > > be nicer? > Yeah, but I think It was enough. I didn't know we had that script in the kernel tree so I think it's a good to mention that you used it. It might even help idiots like me who write this sort of bug. > > Will a patch change log be helpful here? > I realized I should write some change log, and the change log was meaningless. > So I left it blank. The changelog is fine IMO. The point of a changelog is to tell a reader doing git archeology why a change happened and this is sufficent for that. Reviewed-by: Oliver O'Halloran
[PATCH] powerpc/boot: Fix the initrd being overwritten under qemu
When booting under OF the zImage expects the initrd address and size to be passed to it using registers r3 and r4. SLOF (guest firmware used by QEMU) currently doesn't do this so the zImage is not aware of the initrd location. This can result in initrd corruption either though the zImage extracting the vmlinux over the initrd, or by the vmlinux overwriting the initrd when relocating itself. QEMU does put the linux,initrd-start and linux,initrd-end properties into the devicetree to vmlinux to find the initrd. We can work around the SLOF bug by also looking those properties in the zImage. Cc: sta...@vger.kernel.org Cc: Alexey Kardashevskiy Signed-off-by: Oliver O'Halloran --- First noticed here: https://unix.stackexchange.com/questions/547023/linux-kernel-on-ppc64le-vmlinux-equivalent-in-arch-powerpc-boot --- arch/powerpc/boot/devtree.c | 21 + arch/powerpc/boot/main.c| 7 +++ arch/powerpc/boot/of.h | 16 arch/powerpc/boot/ops.h | 1 + arch/powerpc/boot/swab.h| 17 + 5 files changed, 46 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/boot/devtree.c b/arch/powerpc/boot/devtree.c index 5d91036..ac5c26b 100644 --- a/arch/powerpc/boot/devtree.c +++ b/arch/powerpc/boot/devtree.c @@ -13,6 +13,7 @@ #include "string.h" #include "stdio.h" #include "ops.h" +#include "swab.h" void dt_fixup_memory(u64 start, u64 size) { @@ -318,6 +319,26 @@ int dt_xlate_reg(void *node, int res, unsigned long *addr, unsigned long *size) return dt_xlate(node, res, reglen, addr, size); } +int dt_read_addr(void *node, const char *prop, unsigned long *out_addr) +{ + int reglen; + + *out_addr = 0; + + reglen = getprop(node, prop, prop_buf, sizeof(prop_buf)) / 4; + if (reglen == 2) { + u64 v0 = be32_to_cpu(prop_buf[0]); + u64 v1 = be32_to_cpu(prop_buf[1]); + *out_addr = (v0 << 32) | v1; + } else if (reglen == 1) { + *out_addr = be32_to_cpu(prop_buf[0]); + } else { + return 0; + } + + return 1; +} + int dt_xlate_addr(void *node, u32 *buf, int buflen, unsigned long *xlated_addr) { diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c index a9d2091..518af24 100644 --- a/arch/powerpc/boot/main.c +++ b/arch/powerpc/boot/main.c @@ -112,6 +112,13 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen, } else if (initrd_size > 0) { printf("Using loader supplied ramdisk at 0x%lx-0x%lx\n\r", initrd_addr, initrd_addr + initrd_size); + } else if (chosen) { + unsigned long initrd_end; + + dt_read_addr(chosen, "linux,initrd-start", &initrd_addr); + dt_read_addr(chosen, "linux,initrd-end", &initrd_end); + + initrd_size = initrd_end - initrd_addr; } /* If there's no initrd at all, we're done */ diff --git a/arch/powerpc/boot/of.h b/arch/powerpc/boot/of.h index 31b2f5d..dc24770 100644 --- a/arch/powerpc/boot/of.h +++ b/arch/powerpc/boot/of.h @@ -26,22 +26,6 @@ typedef u16 __be16; typedef u32__be32; typedef u64__be64; -#ifdef __LITTLE_ENDIAN__ -#define cpu_to_be16(x) swab16(x) -#define be16_to_cpu(x) swab16(x) -#define cpu_to_be32(x) swab32(x) -#define be32_to_cpu(x) swab32(x) -#define cpu_to_be64(x) swab64(x) -#define be64_to_cpu(x) swab64(x) -#else -#define cpu_to_be16(x) (x) -#define be16_to_cpu(x) (x) -#define cpu_to_be32(x) (x) -#define be32_to_cpu(x) (x) -#define cpu_to_be64(x) (x) -#define be64_to_cpu(x) (x) -#endif - #define PROM_ERROR (-1u) #endif /* _PPC_BOOT_OF_H_ */ diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h index e060676..5100dd7 100644 --- a/arch/powerpc/boot/ops.h +++ b/arch/powerpc/boot/ops.h @@ -95,6 +95,7 @@ void *simple_alloc_init(char *base, unsigned long heap_size, extern void flush_cache(void *, unsigned long); int dt_xlate_reg(void *node, int res, unsigned long *addr, unsigned long *size); int dt_xlate_addr(void *node, u32 *buf, int buflen, unsigned long *xlated_addr); +int dt_read_addr(void *node, const char *prop, unsigned long *out); int dt_is_compatible(void *node, const char *compat); void dt_get_reg_format(void *node, u32 *naddr, u32 *nsize); int dt_get_virtual_reg(void *node, void **addr, int nres); diff --git a/arch/powerpc/boot/swab.h b/arch/powerpc/boot/swab.h index 11d2069..82db2c1 100644 --- a/arch/powerpc/boot/swab.h +++ b/arch/powerpc/boot/swab.h @@ -27,4 +27,21 @@ static inline u64 swab64(u64 x) (u64)((x & (u64)0x00ffULL) >> 40) | (u64)((x & (u64)0xff00ULL) >> 56); } + +#ifdef __LITTLE_ENDIAN__ +#define cpu_to_be16(x) swab16(x) +#define be16_to_cpu(x) swab16(
Re: [PATCH] powerpc/boot: Fix the initrd being overwritten under qemu
On Wed, Oct 23, 2019 at 10:21 PM Segher Boessenkool wrote: > > On Wed, Oct 23, 2019 at 12:36:35PM +1100, Oliver O'Halloran wrote: > > When booting under OF the zImage expects the initrd address and size to be > > passed to it using registers r3 and r4. SLOF (guest firmware used by QEMU) > > currently doesn't do this so the zImage is not aware of the initrd > > location. This can result in initrd corruption either though the zImage > > extracting the vmlinux over the initrd, or by the vmlinux overwriting the > > initrd when relocating itself. > > > > QEMU does put the linux,initrd-start and linux,initrd-end properties into > > the devicetree to vmlinux to find the initrd. We can work around the SLOF > > bug by also looking those properties in the zImage. > > This is not a bug. What boot protocol requires passing the initrd start > and size in GPR3, GPR4? > > The CHRP binding (what SLOF implements) requires passing two zeroes here. > And ePAPR requires passing the address of a device tree and a zero, plus > something in GPR6 to allow distinguishing what it does. This is what is assumed by the zImage.pseries. I have no idea where that assumption comes from,A B > As Alexey says, initramfs works just fine, so please use that? initrd was > deprecated when this code was written already. That's not what Alexey said and the distinction between an initrd and an initramfs is completely arbitrary. > > > Segher
Re: [PATCH 09/10] powerpc: Enable OpenCAPI Storage Class Memory driver on bare metal
On Fri, Oct 25, 2019 at 3:51 PM Alastair D'Silva wrote: > > From: Alastair D'Silva > > Enable OpenCAPI Storage Class Memory driver on bare metal > > Signed-off-by: Alastair D'Silva > --- > arch/powerpc/configs/powernv_defconfig | 4 > 1 file changed, 4 insertions(+) > > diff --git a/arch/powerpc/configs/powernv_defconfig > b/arch/powerpc/configs/powernv_defconfig > index 6658cceb928c..45c0eff94964 100644 > --- a/arch/powerpc/configs/powernv_defconfig > +++ b/arch/powerpc/configs/powernv_defconfig > @@ -352,3 +352,7 @@ CONFIG_KVM_BOOK3S_64=m > CONFIG_KVM_BOOK3S_64_HV=m > CONFIG_VHOST_NET=m > CONFIG_PRINTK_TIME=y > +CONFIG_OCXL_SCM=m > +CONFIG_DEV_DAX=y > +CONFIG_DEV_DAX_PMEM=y These should probably be modules. Having them as builtins will force their dependencies (i.e. libnvdimm) to be built into the kernel too. > +CONFIG_FS_DAX=y > -- > 2.21.0 > ___ > Linux-nvdimm mailing list -- linux-nvd...@lists.01.org > To unsubscribe send an email to linux-nvdimm-le...@lists.01.org
[PATCH v2 1/3] powernv/iov: Ensure the pdn for VFs always contains a valid PE number
On pseries there is a bug with adding hotplugged devices to an IOMMU group. For a number of dumb reasons fixing that bug first requires re-working how VFs are configured on PowerNV. For background, on PowerNV we use the pcibios_sriov_enable() hook to do two things: 1. Create a pci_dn structure for each of the VFs, and 2. Configure the PHB's internal BARs so the MMIO range for each VF maps to a unique PE. Roughly speaking a PE is the hardware counterpart to a Linux IOMMU group since all the devices in a PE share the same IOMMU table. A PE also defines the set of devices that should be isolated in response to a PCI error (i.e. bad DMA, UR/CA, AER events, etc). When isolated all MMIO and DMA traffic to and from devicein the PE is blocked by the root complex until the PE is recovered by the OS. The requirement to block MMIO causes a giant headache because the P8 PHB generally uses a fixed mapping between MMIO addresses and PEs. As a result we need to delay configuring the IOMMU groups for device until after MMIO resources are assigned. For physical devices (i.e. non-VFs) the PE assignment is done in pcibios_setup_bridge() which is called immediately after the MMIO resources for downstream devices (and the bridge's windows) are assigned. For VFs the setup is more complicated because: a) pcibios_setup_bridge() is not called again when VFs are activated, and b) The pci_dev for VFs are created by generic code which runs after pcibios_sriov_enable() is called. The work around for this is a two step process: 1. A fixup in pcibios_add_device() is used to initialised the cached pe_number in pci_dn, then 2. A bus notifier then adds the device to the IOMMU group for the PE specified in pci_dn->pe_number. A side effect fixing the pseries bug mentioned in the first paragraph is moving the fixup out of pcibios_add_device() and into pcibios_bus_add_device(), which is called much later. This results in step 2. failing because pci_dn->pe_number won't be initialised when the bus notifier is run. We can fix this by removing the need for the fixup. The PE for a VF is known before the VF is even scanned so we can initialise pci_dn->pe_number pcibios_sriov_enable() instead. Unfortunately, moving the initialisation causes two problems: 1. We trip the WARN_ON() in the current fixup code, and 2. The EEH core clears pdn->pe_number when recovering a VF and relies on the fixup to correctly re-set it. The only justification for either of these is a comment in eeh_rmv_device() suggesting that pdn->pe_number *must* be set to IODA_INVALID_PE in order for the VF to be scanned. However, this comment appears to have no basis in reality. Both bugs can be fixed by just deleting the code. Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Signed-off-by: Oliver O'Halloran --- v2: Re-wrote commit message, got very depressed about the state of things. The real fix here is to move the IOMMU group setup for both VFs and PFs into pcibios_bus_add_device() and kill the pcibios_setup_bridge() hack and the bus notifier hack, but doing that requires some pretty gnarly changes. The fix here is much less invasive. --- arch/powerpc/kernel/eeh_driver.c | 6 -- arch/powerpc/platforms/powernv/pci-ioda.c | 19 +++ arch/powerpc/platforms/powernv/pci.c | 4 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index d9279d0..7955fba 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -541,12 +541,6 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); edev->pdev = NULL; - - /* -* We have to set the VF PE number to invalid one, which is -* required to plug the VF successfully. -*/ - pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5e3172d..70508b3 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1558,6 +1558,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) /* Reserve PE for each VF */ for (vf_index = 0; vf_index < num_vfs; vf_index++) { + int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); + int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); + struct pci_dn *vf_pdn; + if (pdn->m64_single_mode) pe_num = pdn->pe_num_map[vf_index]; else @@ -1570,13 +1574,11 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *p
[PATCH v2 2/3] powerpc/pci: Fix pcibios_setup_device() ordering
From: Shawn Anastasio Move PCI device setup from pcibios_add_device() and pcibios_fixup_bus() to pcibios_bus_add_device(). This ensures that platform-specific DMA and IOMMU setup occurs after the device has been registered in sysfs, which is a requirement for IOMMU group assignment to work This fixes IOMMU group assignment for hotplugged devices on pseries, where the existing behavior results in IOMMU assignment before registration. Thanks to Lukas Wunner for the suggestion. Signed-off-by: Shawn Anastasio Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- arch/powerpc/kernel/pci-common.c | 25 + 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 1c448cf..b89925ed 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -261,12 +261,6 @@ int pcibios_sriov_disable(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ -void pcibios_bus_add_device(struct pci_dev *pdev) -{ - if (ppc_md.pcibios_bus_add_device) - ppc_md.pcibios_bus_add_device(pdev); -} - static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 @@ -987,15 +981,17 @@ static void pcibios_setup_device(struct pci_dev *dev) ppc_md.pci_irq_fixup(dev); } -int pcibios_add_device(struct pci_dev *dev) +void pcibios_bus_add_device(struct pci_dev *pdev) { - /* -* We can only call pcibios_setup_device() after bus setup is complete, -* since some of the platform specific DMA setup code depends on it. -*/ - if (dev->bus->is_added) - pcibios_setup_device(dev); + /* Perform platform-specific device setup */ + pcibios_setup_device(pdev); + + if (ppc_md.pcibios_bus_add_device) + ppc_md.pcibios_bus_add_device(pdev); +} +int pcibios_add_device(struct pci_dev *dev) +{ #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); @@ -1037,9 +1033,6 @@ void pcibios_fixup_bus(struct pci_bus *bus) /* Now fixup the bus bus */ pcibios_setup_bus_self(bus); - - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); } EXPORT_SYMBOL(pcibios_fixup_bus); -- 2.9.5
[PATCH v2 3/3] powerpc/pci: Remove pcibios_setup_bus_devices()
With the previous patch applied pcibios_setup_device() will always be run when pcibios_bus_add_device() is called. There are several code paths where pcibios_setup_bus_device() is still called (the PowerPC specific PCI hotplug support is one) so with just the previous patch applied the setup can be run multiple times on a device, once before the device is added to the bus and once after. There's no need to run the setup in the early case any more so just remove it entirely. Signed-off-by: Oliver O'Halloran Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/pci.h| 1 - arch/powerpc/kernel/pci-common.c | 25 - arch/powerpc/kernel/pci-hotplug.c | 1 - arch/powerpc/kernel/pci_of_scan.c | 1 - 4 files changed, 28 deletions(-) diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 327567b..63ed7e3 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -113,7 +113,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); -extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); extern void pcibios_setup_phb_io_space(struct pci_controller *hose); extern void pcibios_scan_phb(struct pci_controller *hose); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index b89925ed..f8a59d7 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1000,24 +1000,6 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -void pcibios_setup_bus_devices(struct pci_bus *bus) -{ - struct pci_dev *dev; - - pr_debug("PCI: Fixup bus devices %d (%s)\n", -bus->number, bus->self ? pci_name(bus->self) : "PHB"); - - list_for_each_entry(dev, &bus->devices, bus_list) { - /* Cardbus can call us to add new devices to a bus, so ignore -* those who are already fully discovered -*/ - if (pci_dev_is_added(dev)) - continue; - - pcibios_setup_device(dev); - } -} - void pcibios_set_master(struct pci_dev *dev) { /* No special bus mastering setup handling */ @@ -1036,13 +1018,6 @@ void pcibios_fixup_bus(struct pci_bus *bus) } EXPORT_SYMBOL(pcibios_fixup_bus); -void pci_fixup_cardbus(struct pci_bus *bus) -{ - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); -} - - static int skip_isa_ioresource_align(struct pci_dev *dev) { if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index fc62c4b..d6a67f8 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -134,7 +134,6 @@ void pci_hp_add_devices(struct pci_bus *bus) */ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); - pcibios_setup_bus_devices(bus); max = bus->busn_res.start; /* * Scan bridges that are already configured. We don't touch diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index f91d7e9..c3024f1 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -414,7 +414,6 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, */ if (!rescan_existing) pcibios_setup_bus_self(bus); - pcibios_setup_bus_devices(bus); /* Now scan child busses */ for_each_pci_bridge(dev, bus) -- 2.9.5
[PATCH v2 2/2] powerpc/powernv: Use common code for the symbol_map export
Long before we had a generic way for firmware to export memory ranges of interest we added a special case for the skiboot symbol map. The code is pretty much identical to the generic export so re-use the code. Signed-off-by: Oliver O'Halloran --- v2: Actually compile. --- arch/powerpc/platforms/powernv/opal.c | 48 --- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 0373da5..e4ea27d 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -708,42 +708,6 @@ static int opal_sysfs_init(void) return 0; } -static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) -{ - return memory_read_from_buffer(buf, count, &off, bin_attr->private, - bin_attr->size); -} - -static struct bin_attribute symbol_map_attr = { - .attr = {.name = "symbol_map", .mode = 0400}, - .read = symbol_map_read -}; - -static void opal_export_symmap(void) -{ - const __be64 *syms; - unsigned int size; - struct device_node *fw; - int rc; - - fw = of_find_node_by_path("/ibm,opal/firmware"); - if (!fw) - return; - syms = of_get_property(fw, "symbol-map", &size); - if (!syms || size != 2 * sizeof(__be64)) - return; - - /* Setup attributes */ - symbol_map_attr.private = __va(be64_to_cpu(syms[0])); - symbol_map_attr.size = be64_to_cpu(syms[1]); - - rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr); - if (rc) - pr_warn("Error %d creating OPAL symbols file\n", rc); -} - static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) @@ -832,6 +796,7 @@ static void opal_export_attrs(void) { struct device_node *np; struct kobject *kobj; + int rc; np = of_find_node_by_path("/ibm,opal/firmware/exports"); if (!np) @@ -846,6 +811,15 @@ static void opal_export_attrs(void) opal_add_exported_attrs(np, kobj); + /* +* NB: symbol_map existed before the generic export interface so it +* lives under the top level opal_kobj. +*/ + rc = opal_add_one_export(opal_kobj, "symbol_map", +np->parent, "symbol-map"); + if (rc) + pr_warn("Error %d creating OPAL symbols file\n", rc); + of_node_put(np); } @@ -991,8 +965,6 @@ static int __init opal_init(void) /* Create "opal" kobject under /sys/firmware */ rc = opal_sysfs_init(); if (rc == 0) { - /* Export symbol map to userspace */ - opal_export_symmap(); /* Setup dump region interface */ opal_dump_region_init(); /* Setup error log interface */ -- 2.9.5
[PATCH v2 1/2] powerpc/powernv: Rework exports to support subnodes
Originally we only had a handful of exported memory ranges, but we'd to export the per-core trace buffers. This results in a lot of files in the exports directory which is a but unfortunate. We can clean things up a bit by turning subnodes into subdirectories of the exports directory. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/opal.c | 114 +- 1 file changed, 72 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 38e9027..0373da5 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -752,6 +752,75 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, bin_attr->size); } +static int opal_add_one_export(struct kobject *parent, const char *export_name, + struct device_node *np, const char *prop_name) +{ + struct bin_attribute *attr = NULL; + const char *name = NULL; + u64 vals[2]; + int rc; + + rc = of_property_read_u64_array(np, prop_name, &vals[0], 2); + if (rc) + goto out; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + name = kstrdup(export_name, GFP_KERNEL); + if (!name) { + rc = -ENOMEM; + goto out; + } + + sysfs_bin_attr_init(attr); + attr->attr.name = name; + attr->attr.mode = 0400; + attr->read = export_attr_read; + attr->private = __va(vals[0]); + attr->size = vals[1]; + + rc = sysfs_create_bin_file(parent, attr); +out: + if (rc) { + kfree(name); + kfree(attr); + } + + return rc; +} + +static void opal_add_exported_attrs(struct device_node *np, + struct kobject *kobj) +{ + struct device_node *child; + struct property *prop; + + for_each_property_of_node(np, prop) { + int rc; + + if (!strcmp(prop->name, "name") || + !strcmp(prop->name, "phandle")) + continue; + + rc = opal_add_one_export(kobj, prop->name, np, prop->name); + if (rc) { + pr_warn("Unable to add export %pOF/%s, rc = %d!\n", + np, prop->name, rc); + } + } + + for_each_child_of_node(np, child) { + struct kobject *child_kobj; + + child_kobj = kobject_create_and_add(child->name, kobj); + if (!child_kobj) { + pr_err("Unable to create export dir for %pOF\n", child); + continue; + } + + opal_add_exported_attrs(child, child_kobj); + } +} + /* * opal_export_attrs: creates a sysfs node for each property listed in * the device-tree under /ibm,opal/firmware/exports/ @@ -761,12 +830,8 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, */ static void opal_export_attrs(void) { - struct bin_attribute *attr; struct device_node *np; - struct property *prop; struct kobject *kobj; - u64 vals[2]; - int rc; np = of_find_node_by_path("/ibm,opal/firmware/exports"); if (!np) @@ -779,41 +844,7 @@ static void opal_export_attrs(void) return; } - for_each_property_of_node(np, prop) { - if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) - continue; - - if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) - continue; - - attr = kzalloc(sizeof(*attr), GFP_KERNEL); - - if (attr == NULL) { - pr_warn("Failed kmalloc for bin_attribute!"); - continue; - } - - sysfs_bin_attr_init(attr); - attr->attr.name = kstrdup(prop->name, GFP_KERNEL); - attr->attr.mode = 0400; - attr->read = export_attr_read; - attr->private = __va(vals[0]); - attr->size = vals[1]; - - if (attr->attr.name == NULL) { - pr_warn("Failed kstrdup for bin_attribute attr.name"); - kfree(attr); - continue; - } - - rc = sysfs_create_bin_file(kobj, attr); - if (rc) { - pr_warn("Error %d creating OPAL sysfs exports/%s file\n", -rc, prop->name); - kfree(attr->attr.name); - kfree(attr); - } - } + opal_add_expo
[PATCH 1/2] powerpc/xmon: Allow passing an argument to ppc_md.restart()
On PowerNV a few different kinds of reboot are supported. We'd like to be able to exercise these from xmon so allow 'zr' to take an argument, and pass that to the ppc_md.restart() function. Signed-off-by: Oliver O'Halloran --- arch/powerpc/xmon/xmon.c | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d83364e..6a6f675 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1147,16 +1147,19 @@ static int do_step(struct pt_regs *regs) static void bootcmds(void) { + char tmp[64]; int cmd; cmd = inchar(); - if (cmd == 'r') - ppc_md.restart(NULL); - else if (cmd == 'h') + if (cmd == 'r') { + getstring(tmp, 64); + ppc_md.restart(tmp); + } else if (cmd == 'h') { ppc_md.halt(); - else if (cmd == 'p') + } else if (cmd == 'p') { if (pm_power_off) pm_power_off(); + } } static int cpu_cmd(void) -- 2.9.5
[PATCH 2/2] powerpc/powernv: Allow manually invoking special reboots
OPAL provides several different kinds of reboot for the kernel to use, namely forcing a full reboot, platform error reboot and MPIPL. Right now triggering the alternative resets requires some ad-hoc method such as triggering a kernel crash and hoping the stars align. It's sometimes handy to be able to trigger one of these resets directly, so add a way to do that. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/setup.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 8349860..11fdae8 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -233,6 +233,10 @@ static void __noreturn pnv_restart(char *cmd) rc = opal_cec_reboot(); else if (strcmp(cmd, "full") == 0) rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL); + else if (strcmp(cmd, "mpipl") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL); + else if (strcmp(cmd, "error") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL); else rc = OPAL_UNSUPPORTED; -- 2.9.5
Re: [PATCH] powerpc/papr_scm: Delete unnecessary assignment for the field “owner”
On Sun, Nov 3, 2019 at 11:31 PM Markus Elfring wrote: > > From: Markus Elfring > Date: Sun, 3 Nov 2019 13:23:13 +0100 > > The field “owner” is set by the core. > Thus delete an unneeded initialisation. Acked-by: Oliver O'Halloran > > Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci > > Signed-off-by: Markus Elfring > --- > arch/powerpc/platforms/pseries/papr_scm.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/arch/powerpc/platforms/pseries/papr_scm.c > b/arch/powerpc/platforms/pseries/papr_scm.c > index ee07d0718bf1..f87b474d25a7 100644 > --- a/arch/powerpc/platforms/pseries/papr_scm.c > +++ b/arch/powerpc/platforms/pseries/papr_scm.c > @@ -513,7 +513,6 @@ static struct platform_driver papr_scm_driver = { > .remove = papr_scm_remove, > .driver = { > .name = "papr_scm", > - .owner = THIS_MODULE, > .of_match_table = papr_scm_match, > }, > }; > -- > 2.23.0 >
Re: PROBLEM: PCIe Bus Error atleast
On Sat, Nov 2, 2019 at 5:46 AM Jeffrin Thalakkottoor wrote: > > hello , > > i found a error message as the output of "sudo dmesg -l err" > i have attached related to that in this email. > i think i found this in 5.3.8 kernel Use "uname -a" to get the current kernel version, architecture. > But i think when i tried again today i could not reproduce it That's unfortunate, but it might have just been a transient problem. The log has a pile of these AER errors: [ 283.723848] pcieport :00:1c.5: AER: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID) [ 283.723855] pcieport :00:1c.5: AER: device [8086:9d15] error status/mask=1000/2000 [ 283.723859] pcieport :00:1c.5: AER:[12] Timeout Which looks like a root port is getting a timeouts while trying to talk to its downstream device. It's hard to say anything more without knowing what the downstream device is, or what the system is. If this is a laptop it might be due to buggy power management, but it might just be flakey hardware. Can you provide the full dmesg and the output of lspci -vv? Oliver
Re: [PATCH RFC 02/15] powerpc/eeh: Rename eeh_pe_get() to eeh_pe_find()
On Wed, Oct 2, 2019 at 4:03 PM Sam Bobroff wrote: > > There are now functions eeh_get_pe() and eeh_pe_get() which seems > likely to cause confusion. > > Keep eeh_get_pe() because "get" is commonly used to refer to acquiring > a reference (which it does), and rename eeh_pe_get() to eeh_pe_find() > because it performs a search. > > Signed-off-by: Sam Bobroff Good idea. Reviewed-by: Oliver O'Halloran
[PATCH] powerpc/powernv: Disable native PCIe port management
On PowerNV the PCIe topology is (currently) managed the powernv platform code in cooperation with firmware. The PCIe-native service drivers bypass both and this can cause problems. Historically this hasn't been a big deal since the only port service driver that saw much use was the AER driver. The AER driver relies a kernel service to report when errors occur rather than acting autonmously so it's fairly easy to ignore. On PowerNV (and pseries) AER events are handled through EEH, which ignores the AER service, so it's never been an issue. Unfortunately, the hotplug port service driver (pciehp) does act autonomously and conflicts with the platform specific hotplug driver (pnv_php). The main issue is that pciehp claims the interrupt associated with the PCIe capability which in turn prevents pnv_php from claiming it. This results in hotplug events being handled by pciehp which does not notify firmware when the PCIe topology changes, and does not setup/teardown the arch specific PCI device structures (pci_dn) when the topology changes. The end result is that hot-added devices cannot be enabled and hot-removed devices may not be fully torn-down on removal. We can fix these problems by setting the "pcie_ports_disabled" flag during platform initialisation. The flag indicates the platform owns the PCIe ports which stops the portbus driver being registered. Cc: Sergey Miroshnichenko Fixes: 66725152fb9f ("PCI/hotplug: PowerPC PowerNV PCI hotplug driver") Signed-off-by: Oliver O'Halloran --- Sergey, just FYI. I'll try sort out the rest of the hotplug trainwreck in 5.6. The Fixes: here is for the patch that added pnv_php in 4.8. It's been a problem since then, but wasn't noticed until people started testing it after the EEH fixes in commit 799abe283e51 ("powerpc/eeh: Clean up EEH PEs after recovery finishes") went in earlier in the 5.4 cycle. --- arch/powerpc/platforms/powernv/pci.c | 17 + 1 file changed, 17 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 2825d00..ae62583 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -941,6 +941,23 @@ void __init pnv_pci_init(void) pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); +#ifdef CONFIG_PCIEPORTBUS + /* +* On PowerNV PCIe devices are (currently) managed in cooperation +* with firmware. This isn't *strictly* required, but there's enough +* assumptions baked into both firmware and the platform code that +* it's unwise to allow the portbus services to be used. +* +* We need to fix this eventually, but for now set this flag to disable +* the portbus driver. The AER service isn't required since that AER +* events are handled via EEH. The pciehp hotplug driver can't work +* without kernel changes (and portbus binding breaks pnv_php). The +* other services also require some thinking about how we're going +* to integrate them. +*/ + pcie_ports_disabled = true; +#endif + /* If we don't have OPAL, eg. in sim, just skip PCI probe */ if (!firmware_has_feature(FW_FEATURE_OPAL)) return; -- 2.9.5
Re: [PATCH] powerpc/perf: Disable trace_imc pmu
On Thu, Nov 14, 2019 at 6:19 PM Madhavan Srinivasan wrote: > > When a root user or a user with CAP_SYS_ADMIN > privilege use trace_imc performance monitoring > unit events, to monitor application or KVM threads, > may result in a checkstop (System crash). Reason > being frequent switch of the "trace/accumulation" > mode of In-Memory Collection hardware. > This patch disables trace_imc pmu unit, but will > be re-enabled at a later stage with a fix patchset. > --- > arch/powerpc/platforms/powernv/opal-imc.c | 7 ++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/platforms/powernv/opal-imc.c > b/arch/powerpc/platforms/powernv/opal-imc.c > index e04b20625cb9..5790f078771f 100644 > --- a/arch/powerpc/platforms/powernv/opal-imc.c > +++ b/arch/powerpc/platforms/powernv/opal-imc.c > @@ -285,7 +285,12 @@ static int opal_imc_counters_probe(struct > platform_device *pdev) > domain = IMC_DOMAIN_THREAD; > break; > case IMC_TYPE_TRACE: > - domain = IMC_DOMAIN_TRACE; > + /* Using trace_imc events to monitor > +* application or KVM thread performances > +* may result in a checkstop (system crash). > +* So disabling it for now. > +*/ > + domain = -1; > break; > default: > pr_warn("IMC Unknown Device type \n"); > -- > 2.21.0 > Does this need a Fixes: tag?
Re: [PATCH] powerpc/powernv: Disable native PCIe port management
On Thu, Nov 14, 2019 at 1:31 AM Bjorn Helgaas wrote: > > This is fine, but it feels like sort of a blunt instrument. Is there > any practical way to clear pci_host_bridge.native_pcie_hotplug (and > native_aer if appropriate) for the PHBs in question? That would also > prevent pciehp from binding. It is a large hammer, but I don't see a better way to handle it for the moment. I had another look and my initial assessment was wrong in that it's the portbus driver which claims the MSI rather than pciehp itself. The MSI in the PCIe capability is shared between hotplug events, PMEs, and BW notifications so to make the portbus concept work the portbus driver needs to own the interrupt. Basicly, pnv_php and portbus are fundamentally at odds with each other and can't be used concurrently. I also think there's some latent issues with the interaction of DPC and EEH since they operate off the same set of error messages. We haven't run into any problems yet, but I think that's largely because we haven't shipped any systems with DPC enabled. In any case, I'd prefer we disabled portbus until we've fully unpacked what's going on there. > We might someday pull portdrv into the PCI core directly instead of as > a separate driver, and I'm thinking that might be easier if we have > more specific indications of what the core shouldn't use. It's not intended to be a permanent change. In the long term I want to move everything except the initialisation and reset of the PHB out of firmware and into the kernel so we can use more of the native PCIe management features.
Re: [PATCH] powerpc/powernv: Disable native PCIe port management
On Thu, Nov 14, 2019 at 7:39 AM Tyrel Datwyler wrote: > > Nothing but pedantic spelling and grammar nits of the commit log follow. > > -Tyrel Thanks. My speeling is bad even on a good day and it was not a good day.
Re: [PATCH 01/11] powerpc/powernv/ioda: Fix ref count for devices with their own PE
On Mon, Nov 18, 2019 at 12:06 PM Alistair Popple wrote: > > On Wednesday, 13 November 2019 4:38:21 AM AEDT Frederic Barrat wrote: > > > > However, one question is whether this patch breaks nvlink and if nvlink > > assumes the devices won’t go away because we explicitly take a reference > > forever. In npu_dma.c, there are 2 functions which allow to find the GPU > > associated to a npu device, and vice-versa. Both functions return a > > pointer to a struct pci_dev, but they don’t take a reference on the > > device being returned. So that seems dangerous. I’m probably missing > > something. > > > > Alexey, Alistair: what, if anything, guarantees, that the npu or gpu > > devices stay valid. Is it because we simply don’t provide any means to > > get rid of them ? Otherwise, don’t we need the callers of > > pnv_pci_get_gpu_dev() and pnv_pci_get_npu_dev() to worry about reference > > counting ? I’ve started looking into it and the changes are scary, which > > explains Greg’s related commit 02c5f5394918b. > > To be honest the reference counting looks like it has evolved into something > quite suspect and I don't think you're missing anything. In practice though we > likely haven't hit any issues because the original callers didn't store > references to the pdev which would make the window quite small (although the > pass through work may have changed that). And as you say there simply wasn't > any means to get rid of them anyway (EEH, hotplug, etc. has never been > implemented or supported for GPUs and all sorts of terrible things happen if > you try). In other words: leaking a ref is the only safe thing to do. > The best solution would likely be to review the reference counting and to > teach callers of get_*_dev() to call pci_put_dev(), etc. The issue is that the two callers of get_pci_dev() are non-GPL exported symbols so we don't know what's calling them or what their expectations are. We be doing whatever makes sense for OpenCAPI and if that happens to cause a problem for someone else, then they can deal with it. Oliver
[PATCH v2] powerpc/powernv: Disable native PCIe port management
On PowerNV the PCIe topology is (currently) managed by the powernv platform code in Linux in cooperation with the platform firmware. Linux's native PCIe port service drivers operate independently of both and this can cause problems. The main issue is that the portbus driver will conflict with the platform specific hotplug driver (pnv_php) over ownership of the MSI used to notify the host when a hotplug event occurs. The portbus driver claims this MSI on behalf of the individual port services because the same interrupt is used for hotplug events, PMEs (on root ports), and link bandwidth change notifications. The portbus driver will always claim the interrupt even if the individual port service drivers, such as pciehp, are compiled out. The second, bigger, problem is that the hotplug port service driver fundamentally does not work on PowerNV. The platform assumes that all PCI devices have a corresponding arch-specific handle derived from the DT node for the device (pci_dn) and without one the platform will not allow a PCI device to be enabled. This problem is largely due to historical baggage, but it can't be resolved without significant re-factoring of the platform PCI support. We can fix these problems in the interim by setting the "pcie_ports_disabled" flag during platform initialisation. The flag indicates the platform owns the PCIe ports which stops the portbus driver from being registered. This does have the side effect of disabling all port services drivers that is: AER, PME, BW notifications, hotplug, and DPC. However, this is not a huge disadvantage on PowerNV since these services are either unused or handled through other means. Cc: Sergey Miroshnichenko Fixes: 66725152fb9f ("PCI/hotplug: PowerPC PowerNV PCI hotplug driver") Signed-off-by: Oliver O'Halloran --- Sergey, just FYI. I'll try sort out the rest of the hotplug trainwreck in 5.6. The Fixes: here is for the patch that added pnv_php in 4.8. It's been a problem since then, but wasn't noticed until people started testing it after the EEH fixes in commit 799abe283e51 ("powerpc/eeh: Clean up EEH PEs after recovery finishes") went in earlier in the 5.4 cycle. --- v2: Moved setting the flag until after we check for OPAL support. No actual functional change since we've already probed the platform, but it looks neater. Re-wrote the commit message. --- arch/powerpc/platforms/powernv/pci.c | 17 + 1 file changed, 17 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 2825d00..c0bea75a 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -945,6 +945,23 @@ void __init pnv_pci_init(void) if (!firmware_has_feature(FW_FEATURE_OPAL)) return; +#ifdef CONFIG_PCIEPORTBUS + /* +* On PowerNV PCIe devices are (currently) managed in cooperation +* with firmware. This isn't *strictly* required, but there's enough +* assumptions baked into both firmware and the platform code that +* it's unwise to allow the portbus services to be used. +* +* We need to fix this eventually, but for now set this flag to disable +* the portbus driver. The AER service isn't required since that AER +* events are handled via EEH. The pciehp hotplug driver can't work +* without kernel changes (and portbus binding breaks pnv_php). The +* other services also require some thinking about how we're going +* to integrate them. +*/ + pcie_ports_disabled = true; +#endif + /* Look for IODA IO-Hubs. */ for_each_compatible_node(np, NULL, "ibm,ioda-hub") { pnv_pci_init_ioda_hub(np); -- 2.9.5
Re: [PATCH 02/11] powerpc/powernv/ioda: Protect PE list
On Tue, Nov 19, 2019 at 11:57 PM Frederic Barrat wrote: > > > Do the other accessors of ioda.pe_list also need mutex protection? > > pnv_ioda_setup_bus_PE() > > pnv_pci_dma_bus_setup() > > pnv_pci_init_ioda_phb() > > pnv_pci_ioda_setup_PEs() > > > I think we could also use it there, it wouldn't hurt. Those functions > are called when the kernel is building part of the PCI topology, and > devices are not really active yet, so I don't think it's absolutely > required. > > I'm actually not sure my patch is needed either. With hotplug, the > devices can come and go, whereas the PHB remains. So it feels right to > start protecting the list when adding/removing a device. But I don't > think we can really have concurrency and have 2 different operations > adding/removing devices at the same time under the same PHB, at least > for opencapi. Maybe for PCI, if we have multiple slots under the same > PHB. Not sure. Creation of new pci_dev's is serialised by the global pci rescan and remove lock so on the creation side it's not an issue. However, we can release IODA PEs in the pci_dev's release function which might be called without that lock held. It's pretty hard to hit that case though since it require something to be holding a ref to the pci_dev even after the driver's ->remove() function has been called.
PCIPOCALYPSE
This series does a few things and probably needs to be split into two or three smaller ones. I figured I'd post it as-is since I'm sick of sitting on it and some people wanted people to take a look at it. There's three parts: 1) Reworking EEH to move the "pseudo-generic" into the platform backend. 2) Moving the point where do do PE assignments for PCIe devices out of pcibios_setup_bridge() and into pcibios_bus_add_device(). 3) Killing the use of pci_dn in powernv entirely. It used to be a (much) longer series, but bits and pieces of been upstreamed or at least posted to the list so I've omitted most of the pre-reqs. Here is a tree you can build based on today's -next with everything in it: https://github.com/oohal/linux/tree/eeh-no-pdn-working Keep in mind this is all pretty raw and I've tested it on precisely one P8 PowerNV system. Things not tested: -> pseries (not recently anyway) -> CAPI -> OpenCAPI -> Any kind of NVLink The main TODO is to finish what was started in 2) so that we handle PE assignments, IOMMU configuration, etc in the same place for each PHB type. Right now there's three distinct paths: 1) For normal IODA PHBs (PHB3 and 4) the PE we can assign a device to is pinned by the location of it's MMIO BARs. How this is handled depends on whether the device is a VF or not, so the two sub cases are: a) For normal devices all the devices under a bridge are assigned to a PE in a walk done after configuring the bridge window. This causes a pile of wierd edge cases when a PCI device is removed without also removing it's parent bridge. b) For VFs PEs (and MMIO BARs) are assigned when we call sriov_enable() on the PF and we "fix up" the software state later on. As a result there's some IOMMU group stuff that happens in a bus notifier which runs after adding the device to a bus. 2) For bullshit IODA PHBs (OpenCAPI / NVLink) there is no MMIO pinning so we can assign a BDFN to an arbitrary PE. For devices under those PEs are assigned in a per-PHB fixup that runs only once at boot, just after the the PHB is probed. There doesn't seem to be good reason for this and the lack of pinning means we should be able to do it whenever. This series fixes 1a) by moving the PE assignment into pcibios_bus_add_device(), which is run per-device. With that change fixing the other two cases should be relatively straight forward. VFs will probably still require some special casing since their setup works differently to normal PCI devices, but we should be able to do better than the current trainwreck of random hacks occuring in random places.
[Very RFC 01/46] powerpc/eeh: Don't attempt to restore VF config space after reset
After resetting a VF we call eeh_restore_vf_config() to restore several registers in the VFs config space. For physical functions this is normally handled by the pci_reinit_device() OPAL call which requests firmware to re-program the device with whatever defaults were set at boot time. We can't use that for VFs since OPAL (being firmware) doesn't know (or care) about VFs. However, the fields that are restored here are all marked as reserved for VFs in the SR-IOV spec. In other words, eeh_restore_vf_config() doesn't actually do anything. There is an argument to be made that we should be saving and restoring some of these fields since they are marked as "Reserved, but Preserve" (ResvP) to allow these fields to be used in new versions of the SR-IOV. However, the current code doesn't even do that properly since it assumes they can be set to whatever the EEH core has assumed to be correct. If the fields *are* used in future versions of the SR-IOV spec this code is still broken since it doesn't take into account any changes made by the driver, or the Linux IOV core. Given the above, just delete the code. It's broken, it's mis-leading, and it's getting in the way of doing useful cleanups. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/eeh.c| 59 arch/powerpc/platforms/powernv/eeh-powernv.c | 39 +++-- arch/powerpc/platforms/pseries/eeh_pseries.c | 26 + 3 files changed, 8 insertions(+), 116 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ae0a9c421d7b..a3b93db972fc 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -742,65 +742,6 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) pci_restore_state(pdev); } -int eeh_restore_vf_config(struct pci_dn *pdn) -{ - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - u32 devctl, cmd, cap2, aer_capctl; - int old_mps; - - if (edev->pcie_cap) { - /* Restore MPS */ - old_mps = (ffs(pdn->mps) - 8) << 5; - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, -2, &devctl); - devctl &= ~PCI_EXP_DEVCTL_PAYLOAD; - devctl |= old_mps; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, - 2, devctl); - - /* Disable Completion Timeout if possible */ - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2, -4, &cap2); - if (cap2 & PCI_EXP_DEVCAP2_COMP_TMOUT_DIS) { - eeh_ops->read_config(pdn, -edev->pcie_cap + PCI_EXP_DEVCTL2, -4, &cap2); - cap2 |= PCI_EXP_DEVCTL2_COMP_TMOUT_DIS; - eeh_ops->write_config(pdn, - edev->pcie_cap + PCI_EXP_DEVCTL2, - 4, cap2); - } - } - - /* Enable SERR and parity checking */ - eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd); - cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR); - eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd); - - /* Enable report various errors */ - if (edev->pcie_cap) { - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, -2, &devctl); - devctl &= ~PCI_EXP_DEVCTL_CERE; - devctl |= (PCI_EXP_DEVCTL_NFERE | - PCI_EXP_DEVCTL_FERE | - PCI_EXP_DEVCTL_URRE); - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, - 2, devctl); - } - - /* Enable ECRC generation and check */ - if (edev->pcie_cap && edev->aer_cap) { - eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP, -4, &aer_capctl); - aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE); - eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP, - 4, aer_capctl); - } - - return 0; -} - /** * pcibios_set_pcie_reset_state - Set PCI-E reset state * @dev: pci device struct diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index ef727ecd99cd..b2ac4130fda7 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1649,20 +1649,13 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) if (!edev) return -EEXIST
[Very RFC 02/46] powernv/pci: Add helper to find ioda_pe from BDFN
Linux has a look-up table for mapping BDFNs to PEs which is updated when we call into OPAL to update the PHB's internally BDFN<->PE mapping. We can use this table to the PE for a device without needing to use the cached value inside the pci_dn. We'd like to get rid of pci_dn eventually so this patch adds adds a helper to find the ioda_pe of a BDFN based on the table. This is different to the existing helper which takes a pci_dev directly because there are some contexts (e.g. EEH recovery) where we need to check for an existing PE assignment when no corresponding pci_dev exists. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++ arch/powerpc/platforms/powernv/pci.h | 1 + 2 files changed, 11 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fdacf98555e9..65b5b121ebad 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -660,6 +660,16 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) return state; } +struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn) +{ + int pe_number = phb->ioda.pe_rmap[bdfn]; + + if (pe_number == IODA_INVALID_PE) + return NULL; + + return &phb->ioda.pe_array[pe_number]; +} + struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f914f0b14e4e..01a01739c03e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -193,6 +193,7 @@ extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, -- 2.21.0
[Very RFC 03/46] powernv/pci: Remove dma_dev_setup() for NPU PHBs
The pnv_pci_dma_dev_setup() only does something when: 1) There PHB contains VFs, or 2) The PHB defines a dma_dev_setup() callback in the pnv_phb structure. Neither is true for NPU PHBs, so don't set the callback in the pci_controller_ops. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 65b5b121ebad..099c0bb1a9b9 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3652,7 +3652,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { }; static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, -- 2.21.0
[Very RFC 04/46] powernv/pci: Move dma_{dev|bus}_setup into pci-ioda.c
These functions are only used from pci-ioda.c. Move them in there and remove the prototypes from the header files. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 43 +++ arch/powerpc/platforms/powernv/pci.c | 43 --- arch/powerpc/platforms/powernv/pci.h | 2 -- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 099c0bb1a9b9..c2b3a5a13004 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3637,6 +3637,49 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) OPAL_ASSERT_RESET); } +void pnv_pci_dma_dev_setup(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; +#ifdef CONFIG_PCI_IOV + struct pnv_ioda_pe *pe; + + /* Fix the VF pdn PE number */ + if (pdev->is_virtfn) { + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (pe->rid == ((pdev->bus->number << 8) | + (pdev->devfn & 0xff))) { + pe->pdev = pdev; + break; + } + } + } +#endif /* CONFIG_PCI_IOV */ + + if (phb && phb->dma_dev_setup) + phb->dma_dev_setup(phb, pdev); +} + +void pnv_pci_dma_bus_setup(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) + continue; + + if (!pe->pbus) + continue; + + if (bus->number == ((pe->rid >> 8) & 0xFF)) { + pe->pbus = bus; + break; + } + } +} + static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index b7761e2e06f8..8b9058b52575 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -810,49 +810,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) return tbl; } -void pnv_pci_dma_dev_setup(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; -#ifdef CONFIG_PCI_IOV - struct pnv_ioda_pe *pe; - - /* Fix the VF pdn PE number */ - if (pdev->is_virtfn) { - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (pe->rid == ((pdev->bus->number << 8) | - (pdev->devfn & 0xff))) { - pe->pdev = pdev; - break; - } - } - } -#endif /* CONFIG_PCI_IOV */ - - if (phb && phb->dma_dev_setup) - phb->dma_dev_setup(phb, pdev); -} - -void pnv_pci_dma_bus_setup(struct pci_bus *bus) -{ - struct pci_controller *hose = bus->sysdata; - struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) - continue; - - if (!pe->pbus) - continue; - - if (bus->number == ((pe->rid >> 8) & 0xFF)) { - pe->pbus = bus; - break; - } - } -} - struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 01a01739c03e..f23145575048 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -189,8 +189,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); -extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn); -- 2.21.0
[Very RFC 05/46] powernv/pci: Remove the pnv_phb dma_dev_setup callback
This is only ever set for IODA PHBs. The only call site is in pnv_pci_dma_dev_setup(), which is also only used by normal IODA PHBs, so remove the callback in favour of a direct call. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +--- arch/powerpc/platforms/powernv/pci.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c2b3a5a13004..45f974258766 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3656,8 +3656,7 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) } #endif /* CONFIG_PCI_IOV */ - if (phb && phb->dma_dev_setup) - phb->dma_dev_setup(phb, pdev); + pnv_pci_ioda_dma_dev_setup(phb, pdev); } void pnv_pci_dma_bus_setup(struct pci_bus *bus) @@ -3940,7 +3939,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops; break; default: - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; hose->controller_ops = pnv_pci_ioda_controller_ops; } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f23145575048..3c33a0c91a69 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -108,7 +108,6 @@ struct pnv_phb { int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg); - void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); int (*init_m64)(struct pnv_phb *phb); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); -- 2.21.0
[Very RFC 06/46] powerpc/iov: Move VF pdev fixup into pcibios_fixup_iov()
Move this out of the PHB's dma_dev_setup() callback and into the ppc_md.pcibios_fixup_iov callback. This ensures that the VF PE's pdev pointer is always valid for the whole time the device is added the bus. This isn't strictly required, but it's slightly a slightly more logical place to do the fixup and it makes dma_dev_setup a bit simpler. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 35 +++ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 45f974258766..c6ea7a504e04 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2910,9 +2910,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pci_dn *pdn; int mul, total_vfs; - if (!pdev->is_physfn || pci_dev_is_added(pdev)) - return; - pdn = pci_get_pdn(pdev); pdn->vfs_expanded = 0; pdn->m64_single_mode = false; @@ -2987,6 +2984,22 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) res->end = res->start - 1; } } + +static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) +{ + if (WARN_ON(pci_dev_is_added(pdev))) + return; + + if (pdev->is_virtfn) { + /* Fix the VF PE's pdev pointer */ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); + pe->pdev = pdev; + + WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); + } else if (pdev->is_physfn) { + pnv_pci_ioda_fixup_iov_resources(pdev); + } +} #endif /* CONFIG_PCI_IOV */ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, @@ -3641,20 +3654,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; -#ifdef CONFIG_PCI_IOV - struct pnv_ioda_pe *pe; - - /* Fix the VF pdn PE number */ - if (pdev->is_virtfn) { - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (pe->rid == ((pdev->bus->number << 8) | - (pdev->devfn & 0xff))) { - pe->pdev = pdev; - break; - } - } - } -#endif /* CONFIG_PCI_IOV */ pnv_pci_ioda_dma_dev_setup(phb, pdev); } @@ -3945,7 +3944,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, ppc_md.pcibios_default_alignment = pnv_pci_default_alignment; #ifdef CONFIG_PCI_IOV - ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov; ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable; -- 2.21.0
[Very RFC 07/46] powernv/pci: Rework IODA PE device accounting
The current process for configuring the IODA PEs for normal PCI devices is abit stupid. After assigning MMIO resources for the devices on a bus the actual PE asignment occurs when pcibios_setup_bridge() is called for the parent bridge. In pcibios_setup_bridge() we: 1. Assign all 256 devfn's for the subordinate bus to the PE corresponding to the bridge window. 2. Initialise the IOMMU tables for that PE. 3. Traverse each device on the bus below the bridge setting the IOMMU table pointer to point at the PE's table. 4. Finally, set pci_dn->pe_number to indicate that we've done the per-device setup and allow EEH and the platform code to look up the PE number. Although mostly functional, there's a couple of issues with this approach. The most glaring is that it mixes the per-bus (per-PE really) setup with the per-device setup in a way that's completely asymmetric to what happens when tearing down a device. In step 4. the number of devices in the PE is counted and stored in the ioda_pe structure. When releasing a pci_dev the device count is dropped until it hits zero where the ioda_pe itself is torn down. However, the bus itself remains active and can be re-scanned to bring back the devices that were removed. On a rescan we do not re-run the bridge setup so the per-device setup is never re-done which results in the re-scanned being unusable. There are a few other minor issues too. Associating all 256 devfns with the PE means that config accesses to non-existant PCI devices results in a spurious PE freezes. We currently prevent this by only allowing config accesses to a BDFN when there is a corresponding pci_dn structure. We would like to eliminate that restriction in the future though. That all said the biggest issue is that the current behaviour is hard to follow at the best of times. On top of that the behaviour is slightly (or majorly) different across each PHB type (PCIe, OpenCAPI, NVLink) and the behaviour for physical devices (described above) and virtual functions is again different. To address this we want to merge the paths as much as possible so that the PowerNV specific PCI initialisation steps all occur at roughly the same point in the PCI device setup path. We can address most of these problems by moving the PE setup out of pcibios_setup_bridge() and into pcibios_bus_add_device(). The latter is called on a per-device basis so we have some symmetry between the setup and teardown paths. Moving the PE assignments to here should also allow us to converge how PE assignment works on all PHB types so it's always done in one place. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 112 +++--- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c6ea7a504e04..c74521e5f3ab 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK", "NPU_OCAPI" }; static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +static void pnv_pci_configure_bus(struct pci_bus *bus); void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) @@ -1104,34 +1105,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return pe; } -static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - struct pci_dn *pdn = pci_get_pdn(dev); - - if (pdn == NULL) { - pr_warn("%s: No device node associated with device !\n", - pci_name(dev)); - continue; - } - - /* -* In partial hotplug case, the PCI device might be still -* associated with the PE and needn't attach it to the PE -* again. -*/ - if (pdn->pe_number != IODA_INVALID_PE) - continue; - - pe->device_count++; - pdn->pe_number = pe->pe_number; - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_same_PE(dev->subordinate, pe); - } -} - /* * There're 2 types of PCI bus sensitive PEs: One that is compromised of * single PCI bus. Another one that contains the primary PCI bus and its @@ -1152,7 +1125,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe_num = phb->ioda.pe_rmap[bus->number << 8]; if (
[Very RFC 08/46] powerpc/eeh: Calculate VF index rather than looking it up in pci_dn
Find the VF index based on the BDFN of the device rather than using a cached value in the pci_dn. This is probably slower than looking up the cached value in the pci_dn, but it's done infrequently (only in the EEH recovery path) and it's just arithmatic. We need this here because the functions to remove a VF are slightly different to those which remove a physical PCI device. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/eeh_driver.c | 44 +++- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index a1eaffe868de..1cdeed464aed 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -457,12 +457,35 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, return rc; } +#ifdef CONFIG_PCI_IOV +/* FIXME: this should probably go in drivers/pci/iov.c */ +static int eeh_find_vf_index(struct pci_dev *physfn, u16 vf_bdfn) +{ + u16 vf_bus, vf_devfn; + int i; + + vf_bus = vf_bdfn >> 8; + vf_devfn = vf_bdfn & 0xff; + + for (i = 0; i < pci_num_vf(physfn); i++) { + if (pci_iov_virtfn_bus(physfn, i) != vf_bus) + continue; + if (pci_iov_virtfn_devfn(physfn, i) != vf_devfn) + continue; + return i; + } + + WARN_ON(1); + return -1; +} + static void *eeh_add_virt_device(struct eeh_dev *edev) { - struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + int vf_index; - if (!(edev->physfn)) { + if (!edev->physfn) { eeh_edev_warn(edev, "Not for VF\n"); return NULL; } @@ -476,11 +499,18 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) eeh_pcid_put(dev); } -#ifdef CONFIG_PCI_IOV - pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); -#endif + vf_index = eeh_find_vf_index(edev->physfn, edev->bdfn); + pci_iov_add_virtfn(edev->physfn, vf_index); + return NULL; } +#else +static void *eeh_add_virt_device(struct eeh_dev *edev) +{ + WARN_ON(1); + return NULL; +} +#endif static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) { @@ -521,9 +551,9 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (edev->physfn) { #ifdef CONFIG_PCI_IOV - struct pci_dn *pdn = eeh_dev_to_pdn(edev); + int vf_index = eeh_find_vf_index(edev->physfn, edev->bdfn); - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); + pci_iov_remove_virtfn(edev->physfn, vf_index); edev->pdev = NULL; #endif if (rmv_data) -- 2.21.0
[Very RFC 09/46] powerpc/eeh: Pass eeh_dev to eeh_ops->{read|write}_config()
Switch the eeh_ops->{read|write}_config methods to take an eeh_dev structure rather than a pci_dn structure to specify the target device. This removes a lot of the uses of pci_dn in both the EEH core and in the platform EEH support. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/eeh.h | 4 +- arch/powerpc/kernel/eeh.c| 22 +- arch/powerpc/kernel/eeh_pe.c | 44 ++-- arch/powerpc/platforms/powernv/eeh-powernv.c | 43 ++- arch/powerpc/platforms/pseries/eeh_pseries.c | 16 --- 5 files changed, 67 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index e11deb284631..62c4ee44ad2c 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -224,8 +224,8 @@ struct eeh_ops { int (*configure_bridge)(struct eeh_pe *pe); int (*err_inject)(struct eeh_pe *pe, int type, int func, unsigned long addr, unsigned long mask); - int (*read_config)(struct pci_dn *pdn, int where, int size, u32 *val); - int (*write_config)(struct pci_dn *pdn, int where, int size, u32 val); + int (*read_config)(struct eeh_dev *edev, int where, int size, u32 *val); + int (*write_config)(struct eeh_dev *edev, int where, int size, u32 val); int (*next_error)(struct eeh_pe **pe); int (*restore_config)(struct pci_dn *pdn); int (*notify_resume)(struct pci_dn *pdn); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index a3b93db972fc..7258fa04176d 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -185,21 +185,21 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); - eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg); + eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); pr_warn("EEH: PCI device/vendor: %08x\n", cfg); - eeh_ops->read_config(pdn, PCI_COMMAND, 4, &cfg); + eeh_ops->read_config(edev, PCI_COMMAND, 4, &cfg); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { - eeh_ops->read_config(pdn, PCI_SEC_STATUS, 2, &cfg); + eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, &cfg); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); pr_warn("EEH: Bridge secondary status: %04x\n", cfg); - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &cfg); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); pr_warn("EEH: Bridge control: %04x\n", cfg); } @@ -207,11 +207,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) /* Dump out the PCI-X command and status regs */ cap = edev->pcix_cap; if (cap) { - eeh_ops->read_config(pdn, cap, 4, &cfg); + eeh_ops->read_config(edev, cap, 4, &cfg); n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); pr_warn("EEH: PCI-X cmd: %08x\n", cfg); - eeh_ops->read_config(pdn, cap+4, 4, &cfg); + eeh_ops->read_config(edev, cap+4, 4, &cfg); n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); pr_warn("EEH: PCI-X status: %08x\n", cfg); } @@ -223,7 +223,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) pr_warn("EEH: PCI-E capabilities and status follow:\n"); for (i=0; i<=8; i++) { - eeh_ops->read_config(pdn, cap+4*i, 4, &cfg); + eeh_ops->read_config(edev, cap+4*i, 4, &cfg); n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); if ((i % 4) == 0) { @@ -250,7 +250,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) pr_warn("EEH: PCI-E AER capability register set follows:\n"); for (i=0; i<=13; i++) { - eeh_ops->read_config(pdn, cap+4*i, 4, &cfg); + eeh_ops->read_config(edev, cap+4*i, 4, &cfg); n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); if ((i % 4) ==
[Very RFC 10/46] powerpc/eeh: Pass eeh_dev to eeh_ops->restore_config()
Remove another pdn usage. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh.c| 5 ++--- arch/powerpc/kernel/eeh_pe.c | 6 ++ arch/powerpc/platforms/powernv/eeh-powernv.c | 11 +-- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 62c4ee44ad2c..67847f8dfe71 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -227,7 +227,7 @@ struct eeh_ops { int (*read_config)(struct eeh_dev *edev, int where, int size, u32 *val); int (*write_config)(struct eeh_dev *edev, int where, int size, u32 val); int (*next_error)(struct eeh_pe **pe); - int (*restore_config)(struct pci_dn *pdn); + int (*restore_config)(struct eeh_dev *edev); int (*notify_resume)(struct pci_dn *pdn); }; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 7258fa04176d..63500e34e329 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -726,7 +726,6 @@ static void eeh_disable_and_save_dev_state(struct eeh_dev *edev, static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); struct pci_dev *dev = userdata; @@ -734,8 +733,8 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) return; /* Apply customization from firmware */ - if (pdn && eeh_ops->restore_config) - eeh_ops->restore_config(pdn); + if (eeh_ops->restore_config) + eeh_ops->restore_config(edev); /* The caller should restore state for the specified device */ if (pdev != dev) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index e11e0830f125..634963aa4a77 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -841,16 +841,14 @@ static void eeh_restore_device_bars(struct eeh_dev *edev) */ static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - /* Do special restore for bridges */ if (edev->mode & EEH_DEV_BRIDGE) eeh_restore_bridge_bars(edev); else eeh_restore_device_bars(edev); - if (eeh_ops->restore_config && pdn) - eeh_ops->restore_config(pdn); + if (eeh_ops->restore_config) + eeh_ops->restore_config(edev); } /** diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 54d8ec77aef2..6c5d9f1bc378 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1644,12 +1644,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) return ret; } -static int pnv_eeh_restore_config(struct pci_dn *pdn) +static int pnv_eeh_restore_config(struct eeh_dev *edev) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); struct pnv_phb *phb; s64 ret = 0; - int config_addr = (pdn->busno << 8) | (pdn->devfn); if (!edev) return -EEXIST; @@ -1658,13 +1656,14 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) if (edev->physfn) return 0; - phb = pdn->phb->private_data; + phb = edev->pe->phb->private_data; ret = opal_pci_reinit(phb->opal_id, - OPAL_REINIT_PCI_DEV, config_addr); + OPAL_REINIT_PCI_DEV, edev->bdfn); + ret = opal_pci_reinit(phb->opal_id, OPAL_REINIT_PCI_DEV, edev->bdfn); if (ret) { pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, config_addr, ret); + __func__, edev->bdfn, ret); return -EIO; } -- 2.21.0
[Very RFC 11/46] powerpc/eeh: Convert various printfs to use edev, not pci_dn
We use the pci_dn to retrieve the domain, bus, device, and function numbers for an EEH device. We now have that in the eeh_dev so covert the various printk()s we have around the place to source that information from the eeh_dev. Signed-off-by: Oliver O'Halloran --- arch/powerpc/kernel/eeh.c| 14 -- arch/powerpc/kernel/eeh_pe.c | 14 ++ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 63500e34e329..c8039fdb23ba 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -167,23 +167,17 @@ void eeh_show_enabled(void) */ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); u32 cfg; int cap, i; int n = 0, l = 0; char buffer[128]; - if (!pdn) { - pr_warn("EEH: Note: No error log for absent device.\n"); - return 0; - } - n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", - pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + edev->pe->phb->global_number, edev->bdfn >> 8, + PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", - pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + edev->pe->phb->global_number, edev->bdfn >> 8, + PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 634963aa4a77..831f363f1732 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -366,9 +366,8 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) */ int eeh_add_to_parent_pe(struct eeh_dev *edev) { + int config_addr = edev->bdfn; struct eeh_pe *pe, *parent; - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - int config_addr = (pdn->busno << 8) | (pdn->devfn); /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { @@ -382,7 +381,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * PE should be composed of PCI bus and its subordinate * components. */ - pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); + pe = eeh_pe_get(edev->controller, edev->pe_config_addr, config_addr); if (pe) { if (pe->type & EEH_PE_INVALID) { list_add_tail(&edev->entry, &pe->edevs); @@ -416,9 +415,9 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Create a new EEH PE */ if (edev->physfn) - pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF); + pe = eeh_pe_alloc(edev->controller, EEH_PE_VF); else - pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE); + pe = eeh_pe_alloc(edev->controller, EEH_PE_DEVICE); if (!pe) { pr_err("%s: out of memory!\n", __func__); return -ENOMEM; @@ -434,10 +433,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) */ parent = eeh_pe_get_parent(edev); if (!parent) { - parent = eeh_phb_pe_get(pdn->phb); + parent = eeh_phb_pe_get(edev->controller); if (!parent) { pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", - __func__, pdn->phb->global_number); + __func__, edev->controller->global_number); edev->pe = NULL; kfree(pe); return -EEXIST; @@ -698,7 +697,6 @@ void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed) */ static void eeh_bridge_check_link(struct eeh_dev *edev) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); int cap; uint32_t val; int timeout = 0; -- 2.21.0
[Very RFC 12/46] powerpc/eeh: Split eeh_probe into probe_pdn and probe_pdev
The EEH core has a concept of "early probe" and "late probe." When the EEH_PROBE_MODE_DEVTREE flag is set (i.e pseries) we call the eeh_ops->probe() function in eeh_add_device_early() so the eeh_dev state is initialised based on the pci_dn. It's important to realise that this happens *long* before the PCI device has been probed and a pci_dev structure created. This is necessary due to a PAPR requirement that EEH be enabled before to OS starts interacting with the device. The late probe is done in eeh_add_device_late() when the EEH_PROBE_MODE_DEV flag is set (i.e. PowerNV). The main difference is the late probe happens after the pci_dev has been created. As a result there is no actual dependency on a pci_dn in the late probe case. Splitting the single eeh_ops->probe() function into seperate functions allows us to simplify the late probe case since we have access to a pci_dev at that point. Having access to a pci_dev means that we can use the functions provided by the PCI core for finding capabilities, etc rather than doing it manually. It also changes the prototype for the probe functions to be void. Currently they return a void *, but both implementations always return NULL so there's not much point to it. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/eeh.h | 3 +- arch/powerpc/kernel/eeh.c| 6 ++-- arch/powerpc/platforms/powernv/eeh-powernv.c | 29 ++-- arch/powerpc/platforms/pseries/eeh_pseries.c | 13 - 4 files changed, 20 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 67847f8dfe71..466b0165fbcf 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -215,7 +215,8 @@ enum { struct eeh_ops { char *name; int (*init)(void); - void* (*probe)(struct pci_dn *pdn, void *data); + void (*probe_pdn)(struct pci_dn *pdn);/* used on pseries */ + void (*probe_pdev)(struct pci_dev *pdev); /* used on powernv */ int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); int (*get_state)(struct eeh_pe *pe, int *delay); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c8039fdb23ba..087a98b42a8c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1066,7 +1066,7 @@ void eeh_add_device_early(struct pci_dn *pdn) (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid)) return; - eeh_ops->probe(pdn, NULL); + eeh_ops->probe_pdn(pdn); } /** @@ -1135,8 +1135,8 @@ void eeh_add_device_late(struct pci_dev *dev) dev->dev.archdata.edev = NULL; } - if (eeh_has_flag(EEH_PROBE_MODE_DEV)) - eeh_ops->probe(pdn, NULL); + if (eeh_ops->probe_pdev && eeh_has_flag(EEH_PROBE_MODE_DEV)) + eeh_ops->probe_pdev(dev); edev->pdev = dev; dev->dev.archdata.edev = edev; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 6c5d9f1bc378..8bd5317aa878 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -346,23 +346,13 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) /** * pnv_eeh_probe - Do probe on PCI device - * @pdn: PCI device node - * @data: unused + * @pdev: pci_dev to probe * - * When EEH module is installed during system boot, all PCI devices - * are checked one by one to see if it supports EEH. The function - * is introduced for the purpose. By default, EEH has been enabled - * on all PCI devices. That's to say, we only need do necessary - * initialization on the corresponding eeh device and create PE - * accordingly. - * - * It's notable that's unsafe to retrieve the EEH device through - * the corresponding PCI device. During the PCI device hotplug, which - * was possiblly triggered by EEH core, the binding between EEH device - * and the PCI device isn't built yet. + * Creates (or finds an existing) edev for this pci_dev. */ -static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) +static void pnv_eeh_probe_pdev(struct pci_dev *pdev) { + struct pci_dn *pdn = pci_get_pdn(pdev); struct pci_controller *hose = pdn->phb; struct pnv_phb *phb = hose->private_data; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -377,11 +367,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * the probing. */ if (!edev || edev->pe) - return NULL; + return; /* Skip for PCI-ISA bridge */ if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) - return NULL; + return; eeh_edev_dbg(edev, "Probing device\n"); @@ -
[Very RFC 13/46] powerpc/eeh: Rework how pdev_probe() is used
Adjust how the EEH core uses the eeh_ops->probe_pdev() so that it returns the eeh_dev for the passed-in pci_dev. Currently mapping an pci_dev to an eeh_dev is done by finding the pci_dn for the pci_dev, then using the back-pointer to the eeh_dev stashed in the pci_dn. We want to move away from using pci_dn on PowerNV and moving the eeh_dev lookup into probe_pdev() allows the EEH core to be oblivious of how the mapping is actually done. Signed-off-by: Oliver O'Halloran --- arch/powerpc/include/asm/eeh.h | 16 +++-- arch/powerpc/kernel/eeh.c| 34 arch/powerpc/platforms/powernv/eeh-powernv.c | 20 +--- arch/powerpc/platforms/pseries/eeh_pseries.c | 19 ++- 4 files changed, 67 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 466b0165fbcf..e109bfd3dd57 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -215,8 +215,20 @@ enum { struct eeh_ops { char *name; int (*init)(void); - void (*probe_pdn)(struct pci_dn *pdn);/* used on pseries */ - void (*probe_pdev)(struct pci_dev *pdev); /* used on powernv */ + + /* +* on pseries the eeh_dev is initialised before the pci_dev exists +* using the contents of the pci_dn. +*/ + void (*probe_pdn)(struct pci_dn *pdn); + + /* +* probe_pdev() is used to find, and possibly create, an eeh_dev +* for a pci_dev. The EEH core binds the returned device to the +* pci_dev. +*/ + struct eeh_dev *(*probe_pdev)(struct pci_dev *pdev); + int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); int (*get_state)(struct eeh_pe *pe, int *delay); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 087a98b42a8c..58a8299ac417 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1099,17 +1099,24 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); */ void eeh_add_device_late(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; if (!dev) return; - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - edev = pdn_to_eeh_dev(pdn); - eeh_edev_dbg(edev, "Adding device\n"); - if (edev->pdev == dev) { - eeh_edev_dbg(edev, "Device already referenced!\n"); + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + /* pci_dev_to_eeh_dev() can only work if archdata.edev is already set */ + edev = pci_dev_to_eeh_dev(dev); + if (edev) { + /* FIXME: I don't remember why this isn't an error, but it's not */ + eeh_edev_dbg(edev, "Already bound to an eeh_dev!\n"); + return; + } + + edev = eeh_ops->probe_pdev(dev); + if (!edev) { + pr_debug("EEH: Adding device failed\n"); return; } @@ -1118,8 +1125,13 @@ void eeh_add_device_late(struct pci_dev *dev) * unbalanced kref to the device during unplug time, which * relies on pcibios_release_device(). So we have to remove * that here explicitly. +* +* FIXME: This really shouldn't be necessary. We should probably +* tear down the EEH state when we detatch the pci_dev from the +* bus. We might need to move the bus notifiers out of the platforms +* first. */ - if (edev->pdev) { + if (edev->pdev && edev->pdev != dev) { eeh_rmv_from_parent_pe(edev); eeh_addr_cache_rmv_dev(edev->pdev); eeh_sysfs_remove_device(edev->pdev); @@ -1130,17 +1142,11 @@ void eeh_add_device_late(struct pci_dev *dev) * into error handler afterwards. */ edev->mode |= EEH_DEV_NO_HANDLER; - - edev->pdev = NULL; - dev->dev.archdata.edev = NULL; } - if (eeh_ops->probe_pdev && eeh_has_flag(EEH_PROBE_MODE_DEV)) - eeh_ops->probe_pdev(dev); - + /* bind the pdev and the edev together */ edev->pdev = dev; dev->dev.archdata.edev = edev; - eeh_addr_cache_insert_dev(dev); eeh_sysfs_add_device(dev); } diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 8bd5317aa878..5250c4525544 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -348,9 +348,9 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) * pnv_eeh_probe - Do probe on PCI device * @pdev: pci_dev to probe * - * Creates (or finds an existing) edev for this pci_dev. + * Create, or find the existing, eeh_dev for this pci_dev.
[Very RFC 14/46] powernv/eeh: Remove un-necessary call to eeh_add_device_early()
eeh_add_device_early() is used to initialise the EEH state for a PCI device based on the contents of it's devicetree node. It doesn't do anything unless EEH_FLAG_PROBE_MODE_DEVTREE is set and that only happens on pseries. Remove the call to eeh_add_device_early() in the powernv code to squash another pci_dn usage. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 5250c4525544..aa2935a08464 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -40,13 +40,10 @@ static int eeh_event_irq = -EINVAL; void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { - struct pci_dn *pdn = pci_get_pdn(pdev); - - if (!pdn || eeh_has_flag(EEH_FORCE_DISABLED)) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; dev_dbg(&pdev->dev, "EEH: Setting up device\n"); - eeh_add_device_early(pdn); eeh_add_device_late(pdev); } -- 2.21.0
[Very RFC 15/46] powernv/eeh: Use pnv_eeh_*_config() for internal config ops
Use the pnv_eeh_{read|write}_config() functions that take an edev rather than a pci_dn. This allows us to remove most of the explict uses of pci_dn in the PowerNV EEH backend and localises them into a few functions which we can fix later. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 153 +-- 1 file changed, 70 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index aa2935a08464..aaccb3768393 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -278,27 +278,73 @@ int pnv_eeh_post_init(void) return ret; } -static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap) +static inline bool pnv_eeh_cfg_blocked(struct eeh_dev *edev) +{ + if (!edev || !edev->pe) + return false; + + /* +* We will issue FLR or AF FLR to all VFs, which are contained +* in VF PE. It relies on the EEH PCI config accessors. So we +* can't block them during the window. +*/ + if (edev->physfn && (edev->pe->state & EEH_PE_RESET)) + return false; + + if (edev->pe->state & EEH_PE_CFG_BLOCKED) + return true; + + return false; +} + +static int pnv_eeh_read_config(struct eeh_dev *edev, + int where, int size, u32 *val) +{ + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (pnv_eeh_cfg_blocked(edev)) { + *val = 0x; + return PCIBIOS_SET_FAILED; + } + + return pnv_pci_cfg_read(pdn, where, size, val); +} + +static int pnv_eeh_write_config(struct eeh_dev *edev, + int where, int size, u32 val) +{ + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (pnv_eeh_cfg_blocked(edev)) + return PCIBIOS_SET_FAILED; + + return pnv_pci_cfg_write(pdn, where, size, val); +} + +static int pnv_eeh_find_cap(struct eeh_dev *edev, int cap) { int pos = PCI_CAPABILITY_LIST; int cnt = 48; /* Maximal number of capabilities */ u32 status, id; - if (!pdn) - return 0; - /* Check if the device supports capabilities */ - pnv_pci_cfg_read(pdn, PCI_STATUS, 2, &status); + pnv_eeh_read_config(edev, PCI_STATUS, 2, &status); if (!(status & PCI_STATUS_CAP_LIST)) return 0; while (cnt--) { - pnv_pci_cfg_read(pdn, pos, 1, &pos); + pnv_eeh_read_config(edev, pos, 1, &pos); if (pos < 0x40) break; pos &= ~3; - pnv_pci_cfg_read(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + pnv_eeh_read_config(edev, pos + PCI_CAP_LIST_ID, 1, &id); if (id == 0xff) break; @@ -313,15 +359,14 @@ static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap) return 0; } -static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) +static int pnv_eeh_find_ecap(struct eeh_dev *edev, int cap) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); u32 header; int pos = 256, ttl = (4096 - 256) / 8; if (!edev || !edev->pcie_cap) return 0; - if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + if (pnv_eeh_read_config(edev, pos, 4, &header) != PCIBIOS_SUCCESSFUL) return 0; else if (!header) return 0; @@ -334,7 +379,7 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) if (pos < 256) break; - if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + if (pnv_eeh_read_config(edev, pos, 4, &header) != PCIBIOS_SUCCESSFUL) break; } @@ -382,15 +427,14 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) /* Initialize eeh device */ edev->class_code = pdn->class_code; - edev->mode &= 0xFF00; - edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); - edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP); - edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF); - edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); + edev->pcix_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_PCIX); + edev->pcie_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_EXP); + edev->af_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_AF); + edev->aer_cap = pnv_eeh_find_ecap(edev, PCI_EXT_CAP_ID_ERR); if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI)
[Very RFC 16/46] powernv/eeh: Use eeh_edev_warn() rather than open-coding a BDFN print
Neaten things up a bit and remove a pci_dn use. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index aaccb3768393..f58fe6bda46e 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1001,10 +1001,8 @@ static void pnv_eeh_wait_for_pending(struct eeh_dev *edev, const char *type, msleep((1 << i) * 100); } - pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n", - __func__, type, - pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + eeh_edev_warn(edev, "%s: Pending transaction while issuing %sFLR\n", + __func__, type); } static int pnv_eeh_do_flr(struct eeh_dev *edev, int option) -- 2.21.0
[Very RFC 17/46] powernv/eeh: add pnv_eeh_find_edev()
To get away from using pci_dn we need a way to find the edev for a given bdfn. The easiest way to do this is to find the ioda_pe for that BDFN in the PHB's reverse mapping table and scan the device list of the corresponding eeh_pe. Is this slow? Yeah probably. Is it slower than the existing "traverse the pdn tree" method? Probably not. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 31 arch/powerpc/platforms/powernv/pci.h | 2 ++ 2 files changed, 33 insertions(+) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index f58fe6bda46e..a974822c5097 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -278,6 +278,37 @@ int pnv_eeh_post_init(void) return ret; } +struct eeh_dev *pnv_eeh_find_edev(struct pnv_phb *phb, u16 bdfn) +{ + struct pnv_ioda_pe *ioda_pe; + struct eeh_dev *tmp, *edev; + struct eeh_pe *pe; + + /* EEH not enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + return NULL; + + /* Fish the EEH PE from the IODA PE */ + ioda_pe = __pnv_ioda_get_pe(phb, bdfn); + if (!ioda_pe) + return NULL; + + /* +* FIXME: Doing a tree-traversal followed by a list traversal +* on every config access is dumb. Not much dumber than the pci_dn +* tree traversal we did before, but still quite dumb. +*/ + pe = eeh_pe_get(phb->hose, ioda_pe->pe_number, 0); + if (!pe) + return NULL; + + eeh_pe_for_each_dev(pe, edev, tmp) + if (edev->bdfn == bdfn) + return edev; + + return NULL; +} + static inline bool pnv_eeh_cfg_blocked(struct eeh_dev *edev) { if (!edev || !edev->pe) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 3c33a0c91a69..a343f3c8e65c 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -196,6 +196,8 @@ extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); +struct eeh_dev; +struct eeh_dev *pnv_eeh_find_edev(struct pnv_phb *phb, u16 bdfn); __printf(3, 4) extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, -- 2.21.0
[Very RFC 18/46] powernv/pci: Add pci_bus_to_pnvhb() helper
Add a helper to go from a pci_bus structure to the pnv_phb that hosts that bus. There's a lot of instances of the following pattern: struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; Without any other uses of the pci_controller inside the function. This is hard to read since it requires you to memorise the contents of the private data fields and kind of error prone since it involves blindly assigning a void pointer. Add a helper to make it more concise and explict. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++ arch/powerpc/platforms/powernv/pci.c | 18 ++--- arch/powerpc/platforms/powernv/pci.h | 10 +++ 3 files changed, 39 insertions(+), 77 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c74521e5f3ab..a1c9315f3208 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -252,8 +252,7 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb) static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct resource *r; resource_size_t base, sgsz, start, end; int segno, i; @@ -351,8 +350,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *master_pe, *pe; unsigned long size, *pe_alloc; int i; @@ -673,8 +671,7 @@ struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn) struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); if (!pdn) @@ -1053,8 +1050,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; @@ -1113,8 +1109,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) */ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *pe = NULL; unsigned int pe_num; @@ -1181,8 +1176,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) struct pnv_ioda_pe *pe; struct pci_dev *gpu_pdev; struct pci_dn *npu_pdn; - struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus); /* * Due to a hardware errata PE#0 on the NPU is reserved for @@ -1279,16 +1273,12 @@ static void pnv_pci_ioda_setup_PEs(void) #ifdef CONFIG_PCI_IOV static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) { - struct pci_bus*bus; - struct pci_controller *hose; struct pnv_phb*phb; struct pci_dn *pdn; inti, j; intm64_bars; - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; + phb = pci_bus_to_pnvhb(pdev->bus); pdn = pci_get_pdn(pdev); if (pdn->m64_single_mode) @@ -1312,8 +1302,6 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) { - struct pci_bus*bus; - struct pci_controller *hose; struct pnv_phb*phb; struct pci_dn *pdn; unsigned int win; @@ -1325,9 +1313,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) intpe_num; intm64_bars; - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; + phb = pci_bus_to_pnvhb(pdev->bus); pdn = pci_get_pdn(
[Very RFC 19/46] powernv/eeh: Use standard PCI capability lookup functions
We have a pci_dev so we can use the functions provided by the PCI core for looking up capabilities. This should be safe since these are only called when initialising the eeh_dev when the device is first probed and not in the EEH recovery path where config accesses are blocked. This might cause a problem if an EEH event occured while probing the device, but I'm pretty sure that's going to be broken anyway. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 67 ++-- 1 file changed, 4 insertions(+), 63 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index a974822c5097..b79aca8368c6 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -358,65 +358,6 @@ static int pnv_eeh_write_config(struct eeh_dev *edev, return pnv_pci_cfg_write(pdn, where, size, val); } -static int pnv_eeh_find_cap(struct eeh_dev *edev, int cap) -{ - int pos = PCI_CAPABILITY_LIST; - int cnt = 48; /* Maximal number of capabilities */ - u32 status, id; - - /* Check if the device supports capabilities */ - pnv_eeh_read_config(edev, PCI_STATUS, 2, &status); - if (!(status & PCI_STATUS_CAP_LIST)) - return 0; - - while (cnt--) { - pnv_eeh_read_config(edev, pos, 1, &pos); - if (pos < 0x40) - break; - - pos &= ~3; - pnv_eeh_read_config(edev, pos + PCI_CAP_LIST_ID, 1, &id); - if (id == 0xff) - break; - - /* Found */ - if (id == cap) - return pos; - - /* Next one */ - pos += PCI_CAP_LIST_NEXT; - } - - return 0; -} - -static int pnv_eeh_find_ecap(struct eeh_dev *edev, int cap) -{ - u32 header; - int pos = 256, ttl = (4096 - 256) / 8; - - if (!edev || !edev->pcie_cap) - return 0; - if (pnv_eeh_read_config(edev, pos, 4, &header) != PCIBIOS_SUCCESSFUL) - return 0; - else if (!header) - return 0; - - while (ttl-- > 0) { - if (PCI_EXT_CAP_ID(header) == cap && pos) - return pos; - - pos = PCI_EXT_CAP_NEXT(header); - if (pos < 256) - break; - - if (pnv_eeh_read_config(edev, pos, 4, &header) != PCIBIOS_SUCCESSFUL) - break; - } - - return 0; -} - /** * pnv_eeh_probe - Do probe on PCI device * @pdev: pci_dev to probe @@ -458,10 +399,10 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) /* Initialize eeh device */ edev->class_code = pdn->class_code; - edev->pcix_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_PCIX); - edev->pcie_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_EXP); - edev->af_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_AF); - edev->aer_cap = pnv_eeh_find_ecap(edev, PCI_EXT_CAP_ID_ERR); + edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX); + edev->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP); + edev->af_cap = pci_find_capability(pdev, PCI_CAP_ID_AF); + edev->aer_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { -- 2.21.0
[Very RFC 20/46] powernv/eeh: Look up device info from pci_dev
Most of what we fetch from the pci_dn is also in the pci_dev structure. Convert the pnv_eeh_probe_pdev() to use the pdev fields rather than the pci_dn so we can get rid of pci_dn eventually. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 26 ++-- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index b79aca8368c6..6ba74836a9f8 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -372,7 +372,7 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) struct eeh_dev *edev = pdn_to_eeh_dev(pdn); uint32_t pcie_flags; int ret; - int config_addr = (pdn->busno << 8) | (pdn->devfn); + int config_addr = (pdev->bus->number << 8) | (pdev->devfn); /* * When probing the root bridge, which doesn't have any @@ -392,18 +392,18 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) } /* Skip for PCI-ISA bridge */ - if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; eeh_edev_dbg(edev, "Probing device\n"); /* Initialize eeh device */ - edev->class_code = pdn->class_code; + edev->class_code = pdev->class; edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX); edev->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP); edev->af_cap = pci_find_capability(pdev, PCI_CAP_ID_AF); edev->aer_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); - if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { pnv_eeh_read_config(edev, edev->pcie_cap + PCI_EXP_FLAGS, @@ -443,14 +443,14 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) * Broadcom Shiner 4-ports 1G NICs (14e4:168a) * Broadcom Shiner 2-ports 10G NICs (14e4:168e) */ - if ((pdn->vendor_id == PCI_VENDOR_ID_BROADCOM && -pdn->device_id == 0x1656) || - (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM && -pdn->device_id == 0x1657) || - (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM && -pdn->device_id == 0x168a) || - (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM && -pdn->device_id == 0x168e)) + if ((pdev->vendor == PCI_VENDOR_ID_BROADCOM && +pdev->device == 0x1656) || + (pdev->vendor == PCI_VENDOR_ID_BROADCOM && +pdev->device == 0x1657) || + (pdev->vendor == PCI_VENDOR_ID_BROADCOM && +pdev->device == 0x168a) || + (pdev->vendor == PCI_VENDOR_ID_BROADCOM && +pdev->device == 0x168e)) edev->pe->state |= EEH_PE_CFG_RESTRICTED; /* @@ -461,7 +461,7 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) */ if (!(edev->pe->state & EEH_PE_PRI_BUS)) { edev->pe->bus = pci_find_bus(hose->global_number, -pdn->busno); +pdev->bus->number); if (edev->pe->bus) edev->pe->state |= EEH_PE_PRI_BUS; } -- 2.21.0
[Very RFC 21/46] powernv/eeh: Rework finding an existing edev in probe_pdev()
Use the pnv_eeh_find_edev() helper to look up the eeh_dev for a device rather than doing it via the pci_dn. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 44 ++-- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 6ba74836a9f8..1cd80b35 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -374,20 +374,40 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) int ret; int config_addr = (pdev->bus->number << 8) | (pdev->devfn); + pci_dbg(pdev, "%s: probing\n", __func__); + /* -* When probing the root bridge, which doesn't have any -* subordinate PCI devices. We don't have OF node for -* the root bridge. So it's not reasonable to continue -* the probing. +* EEH keeps the eeh_dev alive over a recovery pass even when the +* corresponding pci_dev has been torn down. In that case we need +* to find the existing eeh_dev and re-bind the two. */ - if (!edev || edev->pe) - return NULL; + edev = pnv_eeh_find_edev(phb, config_addr); + if (edev) { + eeh_edev_dbg(edev, "Found existing edev!\n"); + + /* +* XXX: eeh_remove_device() clears pdev so we shouldn't hit this +* normally. I've found that screwing around with the pci probe +* path can result in eeh_probe_pdev() being called twice. This +* is harmless at the moment, but it's pretty strange so emit a +* warning to be on the safe side. +*/ + if (WARN_ON(edev->pdev)) + eeh_edev_dbg(edev, "%s: already bound to a pdev!\n", __func__); + + edev->pdev = pdev; + + /* should we be doing something with REMOVED too? */ + edev->mode &= EEH_DEV_DISCONNECTED; + + /* update the primary bus if we need to */ + // XXX: why do we need to do this? is the pci_bus going away? what cleared the flag? + if (!(edev->pe->state & EEH_PE_PRI_BUS)) { + edev->pe->bus = pdev->bus; + if (edev->pe->bus) + edev->pe->state |= EEH_PE_PRI_BUS; + } - /* already configured? */ - if (edev->pdev) { - pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n", - __func__, hose->global_number, config_addr >> 8, - PCI_SLOT(config_addr), PCI_FUNC(config_addr)); return edev; } @@ -395,8 +415,6 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; - eeh_edev_dbg(edev, "Probing device\n"); - /* Initialize eeh device */ edev->class_code = pdev->class; edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX); -- 2.21.0
[Very RFC 22/46] powernv/eeh: Allocate eeh_dev's when needed
Have the PowerNV EEH backend allocate the eeh_dev if needed rather than using the one attached to the pci_dn. This gets us most of the way towards decoupling pci_dn from the PowerNV EEH code. Signed-off-by: Oliver O'Halloran --- We should probably be free()ing the eeh_dev somewhere. The pci_dev release function is the right place for it. --- arch/powerpc/platforms/powernv/eeh-powernv.c | 22 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 1cd80b35..7aba18e08996 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -366,10 +366,9 @@ static int pnv_eeh_write_config(struct eeh_dev *edev, */ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) { - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pci_controller *hose = pdn->phb; - struct pnv_phb *phb = hose->private_data; - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pci_controller *hose = phb->hose; + struct eeh_dev *edev; uint32_t pcie_flags; int ret; int config_addr = (pdev->bus->number << 8) | (pdev->devfn); @@ -415,12 +414,27 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + /* otherwise allocate and initialise a new eeh_dev */ + edev = kzalloc(sizeof(*edev), GFP_KERNEL); + if (!edev) { + pr_err("%s: out of memory lol\n", __func__); + return NULL; + } + /* Initialize eeh device */ + edev->bdfn = config_addr; + edev->controller = phb->hose; + edev->class_code = pdev->class; edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX); edev->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP); edev->af_cap = pci_find_capability(pdev, PCI_CAP_ID_AF); edev->aer_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); + + /* TODO: stash the vf_index in here? */ + if (pdev->is_virtfn) + edev->physfn = pdev->physfn; + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { -- 2.21.0
[Very RFC 23/46] powerpc/eeh: Moving finding the parent PE into the platform
Currently the generic EEH code uses the pci_dn of a device to look up the PE of the device's parent bridge, or physical function. The generic function to insert the edev (and possibly create the eeh_pe) is called from the probe functions already so this is a relatively minor change. The existing lookup method moves into the pseries platform and PowerNV can choose the PE based on the bus heirachy instead. Signed-off-by: Oliver O'Halloran --- "parent" meaning "parent of the PE that actually contains this edev" is stupid, but it's stupid consistent with what's there already. Also I couldn't think of a way to fix it without adding a bunch of boring boilerplate at the call sites. FIXME: I think I introduced a bug here. Currently we coalase a switch's upstream port bus and the downstream port bus into a single PE since they're a single failure domain. That seems to have been broken by this patch, but whatever. --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh_pe.c | 54 - arch/powerpc/platforms/powernv/eeh-powernv.c | 25 +++- arch/powerpc/platforms/pseries/eeh_pseries.c | 61 4 files changed, 86 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index e109bfd3dd57..70d3e01dbe9d 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -295,7 +295,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no, int config_addr); -int eeh_add_to_parent_pe(struct eeh_dev *edev); +int eeh_add_to_parent_pe(struct eeh_pe *parent, struct eeh_dev *edev); int eeh_rmv_from_parent_pe(struct eeh_dev *edev); void eeh_pe_update_time_stamp(struct eeh_pe *pe); void *eeh_pe_traverse(struct eeh_pe *root, diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 831f363f1732..520c249f19d3 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -318,56 +318,23 @@ struct eeh_pe *eeh_pe_get(struct pci_controller *phb, return pe; } -/** - * eeh_pe_get_parent - Retrieve the parent PE - * @edev: EEH device - * - * The whole PEs existing in the system are organized as hierarchy - * tree. The function is used to retrieve the parent PE according - * to the parent EEH device. - */ -static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) -{ - struct eeh_dev *parent; - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - - /* -* It might have the case for the indirect parent -* EEH device already having associated PE, but -* the direct parent EEH device doesn't have yet. -*/ - if (edev->physfn) - pdn = pci_get_pdn(edev->physfn); - else - pdn = pdn ? pdn->parent : NULL; - while (pdn) { - /* We're poking out of PCI territory */ - parent = pdn_to_eeh_dev(pdn); - if (!parent) - return NULL; - - if (parent->pe) - return parent->pe; - - pdn = pdn->parent; - } - - return NULL; -} - /** * eeh_add_to_parent_pe - Add EEH device to parent PE + * @parent: PE to create additional PEs under * @edev: EEH device * - * Add EEH device to the parent PE. If the parent PE already - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise, - * we have to create new PE to hold the EEH device and the new - * PE will be linked to its parent PE as well. + * Add EEH device to the PE in edev->pe_config_addr. If the PE + * already exists then we'll add it to that. Otherwise a new + * PE is created, and inserted into the PE tree below @parent. + * If @parent is NULL, then it will be inserted under the PHB + * PE for edev->controller. + * + * In either case @edev is added to the PE's device list. */ -int eeh_add_to_parent_pe(struct eeh_dev *edev) +int eeh_add_to_parent_pe(struct eeh_pe *parent, struct eeh_dev *edev) { int config_addr = edev->bdfn; - struct eeh_pe *pe, *parent; + struct eeh_pe *pe; /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { @@ -431,7 +398,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * to PHB directly. Otherwise, we have to associate the * PE with its parent. */ - parent = eeh_pe_get_parent(edev); if (!parent) { parent = eeh_phb_pe_get(edev->controller); if (!parent) { diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 7aba18e08996..49a932ff092a 100644 --- a/arch/powerp
[Very RFC 24/46] powernv/pci: Make the pre-cfg EEH freeze check use eeh_dev rather than pci_dn
Squash another usage in preperation for making the config accessors pci_dn. Signed-off-by: Oliver O'Halloran --- We might want to move this into eeh-powernv.c --- arch/powerpc/platforms/powernv/pci.c | 37 +--- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index d36dde9777aa..6170677bfdc7 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -708,30 +708,23 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, } #if CONFIG_EEH -static bool pnv_pci_cfg_check(struct pci_dn *pdn) +bool pnv_eeh_pre_cfg_check(struct eeh_dev *edev) { - struct eeh_dev *edev = NULL; - struct pnv_phb *phb = pdn->phb->private_data; - - /* EEH not enabled ? */ - if (!(phb->flags & PNV_PHB_FLAG_EEH)) + if (!edev || !edev->pe) return true; - /* PE reset or device removed ? */ - edev = pdn->edev; - if (edev) { - if (edev->pe && - (edev->pe->state & EEH_PE_CFG_BLOCKED)) - return false; + /* PE in reset? */ + if (edev->pe->state & EEH_PE_CFG_BLOCKED) + return false; - if (edev->mode & EEH_DEV_REMOVED) - return false; - } + /* Device removed? */ + if (edev->mode & EEH_DEV_REMOVED) + return false; return true; } #else -static inline pnv_pci_cfg_check(struct pci_dn *pdn) +static inline pnv_pci_cfg_check(struct eeh_dev *edev) { return true; } @@ -743,6 +736,7 @@ static int pnv_pci_read_config(struct pci_bus *bus, { struct pci_dn *pdn; struct pnv_phb *phb; + struct eeh_dev *edev; int ret; *val = 0x; @@ -750,14 +744,15 @@ static int pnv_pci_read_config(struct pci_bus *bus, if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; - if (!pnv_pci_cfg_check(pdn)) + edev = pdn_to_eeh_dev(pdn); + if (!pnv_eeh_pre_cfg_check(edev)) return PCIBIOS_DEVICE_NOT_FOUND; ret = pnv_pci_cfg_read(pdn, where, size, val); phb = pdn->phb->private_data; - if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) { + if (phb->flags & PNV_PHB_FLAG_EEH && edev) { if (*val == EEH_IO_ERROR_VALUE(size) && - eeh_dev_check_failure(pdn->edev)) + eeh_dev_check_failure(edev)) return PCIBIOS_DEVICE_NOT_FOUND; } else { pnv_pci_config_check_eeh(pdn); @@ -772,13 +767,15 @@ static int pnv_pci_write_config(struct pci_bus *bus, { struct pci_dn *pdn; struct pnv_phb *phb; + struct eeh_dev *edev; int ret; pdn = pci_get_pdn_by_devfn(bus, devfn); if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; - if (!pnv_pci_cfg_check(pdn)) + edev = pdn_to_eeh_dev(pdn); + if (!pnv_eeh_pre_cfg_check(edev)) return PCIBIOS_DEVICE_NOT_FOUND; ret = pnv_pci_cfg_write(pdn, where, size, val); -- 2.21.0
[Very RFC 25/46] powernv/pci: Remove pdn from pnv_pci_config_check_eeh()
Despite the name this function is generic PowerNV PCI code rather than anything EEH specific. Convert to take a phb and bdfn rather than a pci_dn. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci.c | 32 ++-- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6170677bfdc7..50142ff045ac 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -591,9 +591,15 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_unlock_irqrestore(&phb->lock, flags); } -static void pnv_pci_config_check_eeh(struct pci_dn *pdn) +/* + * This, very strangely named, function checks if a config access + * caused an EEH and un-freezes the PE if it did. This is mainly + * for the !CONFIG_EEH case where nothing is going to un-freeze + * it for us. + */ +static void pnv_pci_config_check_eeh(struct pnv_phb *phb, u16 bdfn) { - struct pnv_phb *phb = pdn->phb->private_data; + struct pnv_ioda_pe *ioda_pe; u8 fstate = 0; __be16 pcierr = 0; unsigned int pe_no; @@ -604,10 +610,11 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) * setup that yet. So all ER errors should be mapped to * reserved PE. */ - pe_no = pdn->pe_number; - if (pe_no == IODA_INVALID_PE) { + ioda_pe = __pnv_ioda_get_pe(phb, bdfn); + if (ioda_pe) + pe_no = ioda_pe->pe_number; + else pe_no = phb->ioda.reserved_pe_idx; - } /* * Fetch frozen state. If the PHB support compound PE, @@ -629,7 +636,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) } pr_devel(" -> EEH check, bdfn=%04x PE#%x fstate=%x\n", -(pdn->busno << 8) | (pdn->devfn), pe_no, fstate); +bdfn, pe_no, fstate); /* Clear the frozen state if applicable */ if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || @@ -642,6 +649,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) if (phb->freeze_pe) phb->freeze_pe(phb, pe_no); + /* fish out the EEH log and send an EEH event. */ pnv_pci_handle_eeh_config(phb, pe_no); } } @@ -735,7 +743,8 @@ static int pnv_pci_read_config(struct pci_bus *bus, int where, int size, u32 *val) { struct pci_dn *pdn; - struct pnv_phb *phb; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); + u16 bdfn = bus->number << 8 | devfn; struct eeh_dev *edev; int ret; @@ -755,7 +764,7 @@ static int pnv_pci_read_config(struct pci_bus *bus, eeh_dev_check_failure(edev)) return PCIBIOS_DEVICE_NOT_FOUND; } else { - pnv_pci_config_check_eeh(pdn); + pnv_pci_config_check_eeh(phb, bdfn); } return ret; @@ -766,7 +775,8 @@ static int pnv_pci_write_config(struct pci_bus *bus, int where, int size, u32 val) { struct pci_dn *pdn; - struct pnv_phb *phb; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); + u16 bdfn = bus->number << 8 | devfn; struct eeh_dev *edev; int ret; @@ -779,9 +789,9 @@ static int pnv_pci_write_config(struct pci_bus *bus, return PCIBIOS_DEVICE_NOT_FOUND; ret = pnv_pci_cfg_write(pdn, where, size, val); - phb = pdn->phb->private_data; + if (!(phb->flags & PNV_PHB_FLAG_EEH)) - pnv_pci_config_check_eeh(pdn); + pnv_pci_config_check_eeh(phb, bdfn); return ret; } -- 2.21.0
[Very RFC 26/46] powernv/pci: Remove pdn from pnv_pci_cfg_{read|write}
Remove the use of pci_dn from the low-level config space access functions. These are used by the eeh's config ops and the bus config ops that we provide to the PCI core. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/eeh-powernv.c | 14 +++ arch/powerpc/platforms/powernv/pci.c | 26 arch/powerpc/platforms/powernv/pci.h | 6 ++--- 3 files changed, 16 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 49a932ff092a..8a73bc7517c5 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -331,31 +331,25 @@ static inline bool pnv_eeh_cfg_blocked(struct eeh_dev *edev) static int pnv_eeh_read_config(struct eeh_dev *edev, int where, int size, u32 *val) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - - if (!pdn) - return PCIBIOS_DEVICE_NOT_FOUND; + struct pnv_phb *phb = edev->controller->private_data; if (pnv_eeh_cfg_blocked(edev)) { *val = 0x; return PCIBIOS_SET_FAILED; } - return pnv_pci_cfg_read(pdn, where, size, val); + return pnv_pci_cfg_read(phb, edev->bdfn, where, size, val); } static int pnv_eeh_write_config(struct eeh_dev *edev, int where, int size, u32 val) { - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - - if (!pdn) - return PCIBIOS_DEVICE_NOT_FOUND; + struct pnv_phb *phb = edev->controller->private_data; if (pnv_eeh_cfg_blocked(edev)) return PCIBIOS_SET_FAILED; - return pnv_pci_cfg_write(pdn, where, size, val); + return pnv_pci_cfg_write(phb, edev->bdfn, where, size, val); } static struct eeh_pe *pnv_eeh_pe_get_parent(struct pci_dev *pdev) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 50142ff045ac..36eea4bb514c 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -654,11 +654,9 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb, u16 bdfn) } } -int pnv_pci_cfg_read(struct pci_dn *pdn, +int pnv_pci_cfg_read(struct pnv_phb *phb, u16 bdfn, int where, int size, u32 *val) { - struct pnv_phb *phb = pdn->phb->private_data; - u32 bdfn = (pdn->busno << 8) | pdn->devfn; s64 rc; switch (size) { @@ -685,19 +683,16 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", -__func__, pdn->busno, pdn->devfn, where, size, *val); + pr_devel("%s: bdfn: %x +%x/%x -> %08x\n", +__func__, bdfn, where, size, *val); return PCIBIOS_SUCCESSFUL; } -int pnv_pci_cfg_write(struct pci_dn *pdn, +int pnv_pci_cfg_write(struct pnv_phb *phb, u16 bdfn, int where, int size, u32 val) { - struct pnv_phb *phb = pdn->phb->private_data; - u32 bdfn = (pdn->busno << 8) | pdn->devfn; - - pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", -__func__, pdn->busno, pdn->devfn, where, size, val); + pr_devel("%s: bdfn: %x +%x/%x -> %08x\n", +__func__, bdfn, where, size, val); switch (size) { case 1: opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); @@ -753,12 +748,11 @@ static int pnv_pci_read_config(struct pci_bus *bus, if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; - edev = pdn_to_eeh_dev(pdn); + edev = pnv_eeh_find_edev(phb, bdfn); if (!pnv_eeh_pre_cfg_check(edev)) return PCIBIOS_DEVICE_NOT_FOUND; - ret = pnv_pci_cfg_read(pdn, where, size, val); - phb = pdn->phb->private_data; + ret = pnv_pci_cfg_read(phb, bdfn, where, size, val); if (phb->flags & PNV_PHB_FLAG_EEH && edev) { if (*val == EEH_IO_ERROR_VALUE(size) && eeh_dev_check_failure(edev)) @@ -784,11 +778,11 @@ static int pnv_pci_write_config(struct pci_bus *bus, if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; - edev = pdn_to_eeh_dev(pdn); + edev = pnv_eeh_find_edev(phb, bdfn); if (!pnv_eeh_pre_cfg_check(edev)) return PCIBIOS_DEVICE_NOT_FOUND; - ret = pnv_pci_cfg_write(pdn, where, size, val); + ret = pnv_pci_cfg_write(phb, bdfn, where, size, val); if (!(phb->flags & PNV_PHB_FLAG_EEH)) pnv_pci_config_check_eeh(phb, bdfn); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index be435a8
[Very RFC 27/46] powernv/pci: Clear reserved PE freezes
When we scan an empty slot the PHB gets an Unsupported Request from the downstream bridge when there's no device present at that BDFN. Some older PHBs (p7-IOC) don't allow further config space accesses while the PE is frozen, so clear it here without bothering with the diagnostic log. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci.c | 13 + 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 36eea4bb514c..5b1f4677cdce 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -642,6 +642,19 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb, u16 bdfn) if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || fstate == OPAL_EEH_STOPPED_DMA_FREEZE || fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) { + + /* +* Scanning an empty slot will result in a freeze on the reserved PE. +* +* Some old and bad PHBs block config space access to frozen PEs in +* addition to MMIOs, so unfreeze it here. +*/ + if (pe_no == phb->ioda.reserved_pe_idx) { + phb->unfreeze_pe(phb, phb->ioda.reserved_pe_idx, +OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + return; + } + /* * If PHB supports compound PE, freeze it for * consistency. -- 2.21.0
[Very RFC 28/46] powernv/iov: Move SR-IOV PF state out of pci_dn
Move the SR-IOV into a platform specific structure. I'm sure stashing all the SR-IOV state in pci_dn seemed like a good idea at the time, but it results in a lot of powernv specifics being leaked out of the platform directory. Moving all the PHB3/4 specific M64 BAR wrangling into a PowerNV specific structure helps to clarify the role of pci_dn and ensures that the platform specifics stay that way. This will make the code easier to understand and modify since we don't need to so much aboute PowerNV changes breaking pseries and EEH, and vis-a-vis. Signed-off-by: Oliver O'Halloran --- TODO: Remove all the sriov stuff from pci_dn. We can't do that yet because the pseries SRIOV support was a giant hack that re-used some of the previously powernv specific fields. --- arch/powerpc/include/asm/device.h | 3 + arch/powerpc/platforms/powernv/pci-ioda.c | 199 -- arch/powerpc/platforms/powernv/pci.h | 36 3 files changed, 148 insertions(+), 90 deletions(-) diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 266542769e4b..4d8934db7ef5 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -49,6 +49,9 @@ struct dev_archdata { #ifdef CONFIG_CXL_BASE struct cxl_context *cxl_ctx; #endif +#ifdef CONFIG_PCI_IOV + void *iov_data; +#endif }; struct pdev_archdata { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index a1c9315f3208..1c90feed233d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -966,14 +966,15 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) #ifdef CONFIG_PCI_IOV static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) { - struct pci_dn *pdn = pci_get_pdn(dev); - int i; struct resource *res, res2; + struct pnv_iov_data *iov; resource_size_t size; u16 num_vfs; + int i; if (!dev->is_physfn) return -EINVAL; + iov = pnv_iov_get(dev); /* * "offset" is in VFs. The M64 windows are sized so that when they @@ -983,7 +984,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) * separate PE, and changing the IOV BAR start address changes the * range of PEs the VFs are in. */ - num_vfs = pdn->num_vfs; + num_vfs = iov->num_vfs; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || !res->parent) @@ -1029,19 +1030,19 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) num_vfs, offset); if (offset < 0) { - devm_release_resource(&dev->dev, &pdn->holes[i]); - memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); + devm_release_resource(&dev->dev, &iov->holes[i]); + memset(&iov->holes[i], 0, sizeof(iov->holes[i])); } pci_update_resource(dev, i + PCI_IOV_RESOURCES); if (offset > 0) { - pdn->holes[i].start = res2.start; - pdn->holes[i].end = res2.start + size * offset - 1; - pdn->holes[i].flags = IORESOURCE_BUS; - pdn->holes[i].name = "pnv_iov_reserved"; + iov->holes[i].start = res2.start; + iov->holes[i].end = res2.start + size * offset - 1; + iov->holes[i].flags = IORESOURCE_BUS; + iov->holes[i].name = "pnv_iov_reserved"; devm_request_resource(&dev->dev, res->parent, - &pdn->holes[i]); + &iov->holes[i]); } } return 0; @@ -1273,37 +1274,37 @@ static void pnv_pci_ioda_setup_PEs(void) #ifdef CONFIG_PCI_IOV static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) { + struct pnv_iov_data *iov; struct pnv_phb*phb; - struct pci_dn *pdn; inti, j; intm64_bars; phb = pci_bus_to_pnvhb(pdev->bus); - pdn = pci_get_pdn(pdev); + iov = pnv_iov_get(pdev); - if (pdn->m64_single_mode) + if (iov->m64_single_mode) m64_bars = num_vfs; else m64_bars = 1; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) for (j = 0; j < m64_bars; j++) { - if (pdn->m64_map[j][
[Very RFC 29/46] powernv/pci: Remove open-coded PE lookup in PELT-V setup
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 32 +-- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1c90feed233d..5bd7c1b058da 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -760,6 +760,11 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, } } + /* +* Walk the bridges up to the root. Along the way mark this PE as +* downstream of the bridge PE(s) so that errors upstream errors +* also cause this PE to be frozen. +*/ if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) pdev = pe->pbus->self; else if (pe->flags & PNV_IODA_PE_DEV) @@ -768,16 +773,27 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, else if (pe->flags & PNV_IODA_PE_VF) pdev = pe->parent_dev; #endif /* CONFIG_PCI_IOV */ + while (pdev) { - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *parent; + struct pnv_ioda_pe *parent = pnv_ioda_get_pe(pdev); - if (pdn && pdn->pe_number != IODA_INVALID_PE) { - parent = &phb->ioda.pe_array[pdn->pe_number]; - ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); - if (ret) - return ret; - } + /* +* FIXME: This is called from pcibios_setup_bridge(), which is called +* from the bottom (leaf) bridge to the root. This means that this +* doesn't actually setup the PELT-V entries since the PEs for +* the bridges above assigned after this is run for the leaf. +* +* FIXMEFIXME: might not be true since moving PE configuration +* into pcibios_bus_add_device(). +*/ + if (!parent) + break; + + WARN_ON(!parent || parent->pe_number == IODA_INVALID_PE); + + ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); + if (ret) + return ret; pdev = pdev->bus->self; } -- 2.21.0
[Very RFC 30/46] powernv/pci: Remove open-coded PE lookup in PELT-V teardown
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5bd7c1b058da..d4b5ee926222 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -853,11 +853,13 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) /* Release from all parents PELT-V */ while (parent) { - struct pci_dn *pdn = pci_get_pdn(parent); - if (pdn && pdn->pe_number != IODA_INVALID_PE) { - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - /* XXX What to do in case of error ? */ + struct pnv_ioda_pe *parent_pe = pnv_ioda_get_pe(parent); + + if (parent_pe) { + rc = opal_pci_set_peltv(phb->opal_id, + parent_pe->pe_number, + pe->pe_number, + OPAL_REMOVE_PE_FROM_DOMAIN); } parent = parent->bus->self; } -- 2.21.0
[Very RFC 31/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_ioda_dma_dev_setup()
Use the helper to look up the pnv_ioda_pe for the device we're configuring DMA for. In the VF case there's no need set pdn->pe_number since nothing looks at it any more. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d4b5ee926222..98d858999a2d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1709,10 +1709,9 @@ int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - pe = &phb->ioda.pe_array[pdn->pe_number]; + pe = pnv_ioda_get_pe(pdev); WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); -- 2.21.0
[Very RFC 32/46] powernv/pci: Remove open-coded PE lookup in iommu_bypass_supported()
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 98d858999a2d..7e88de18ead6 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1801,13 +1801,11 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, u64 dma_mask) { struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + if (WARN_ON(!pe)) return false; - pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; if (dma_mask >= top) -- 2.21.0
[Very RFC 33/46] powernv/pci: Remove open-coded PE lookup in iommu notifier
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 5b1f4677cdce..0eeea8652426 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -943,23 +943,22 @@ static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb, { struct device *dev = data; struct pci_dev *pdev; - struct pci_dn *pdn; struct pnv_ioda_pe *pe; struct pnv_phb *phb; switch (action) { case BUS_NOTIFY_ADD_DEVICE: pdev = to_pci_dev(dev); - pdn = pci_get_pdn(pdev); phb = pci_bus_to_pnvhb(pdev->bus); WARN_ON_ONCE(!phb); - if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb) + if (!phb) return 0; - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->table_group.group) + pe = pnv_ioda_get_pe(pdev); + if (!pe || !pe->table_group.group) return 0; + iommu_add_device(&pe->table_group, dev); return 0; case BUS_NOTIFY_DEL_DEVICE: -- 2.21.0
[Very RFC 34/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_enable_device_hook()
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 7 +-- 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7e88de18ead6..4f38652c7cd7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3382,7 +3382,6 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, static bool pnv_pci_enable_device_hook(struct pci_dev *dev) { struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); - struct pci_dn *pdn; /* The function is probably called while the PEs have * not be created yet. For example, resource reassignment @@ -3392,11 +3391,7 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) if (!phb->initialized) return true; - pdn = pci_get_pdn(dev); - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return false; - - return true; + return !!pnv_ioda_get_pe(dev); } static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, -- 2.21.0
[Very RFC 35/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_release_device
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4f38652c7cd7..8525642b1256 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3562,14 +3562,14 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) static void pnv_pci_release_device(struct pci_dev *pdev) { struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; /* The VF PE state is torn down when sriov_disable() is called */ if (pdev->is_virtfn) return; - if (!pdn || pdn->pe_number == IODA_INVALID_PE) + if (WARN_ON(!pe)) return; /* @@ -3588,7 +3588,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev) * be increased on adding devices. It leads to unbalanced PE's device * count and eventually make normal PCI hotplug path broken. */ - pe = &phb->ioda.pe_array[pdn->pe_number]; pdn->pe_number = IODA_INVALID_PE; WARN_ON(--pe->device_count < 0); -- 2.21.0
[Very RFC 36/46] powernv/npu: Remove open-coded PE lookup for GPU device
Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 13 ++--- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index b95b9e3c4c98..68bfaef44862 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -97,25 +97,16 @@ EXPORT_SYMBOL(pnv_pci_get_npu_dev); static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, struct pci_dev **gpdev) { - struct pnv_phb *phb; - struct pci_controller *hose; struct pci_dev *pdev; struct pnv_ioda_pe *pe; - struct pci_dn *pdn; pdev = pnv_pci_get_gpu_dev(npe->pdev); if (!pdev) return NULL; - pdn = pci_get_pdn(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return NULL; - - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - pe = &phb->ioda.pe_array[pdn->pe_number]; + pe = pnv_ioda_get_pe(pdev); - if (gpdev) + if (pe && pdev) *gpdev = pdev; return pe; -- 2.21.0
[Very RFC 37/46] powernv/pci: Use the PHB's rmap for pnv_ioda_to_pe()
Rather than using the pdn->pe_number for a device as an index into the IODA PE array we can use the reverse map. This maps the RID (i.e. bdfn) to the PE number associated with it. Firmware maintains a copy of the rmap which is used by the hardware for determining which PE to use when handling a DMA so this gets us a bit closer to the model used by the HW, which is comprensible by mortals, rather than... whatever the hell is going on currently. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8525642b1256..d111a50fbe68 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -672,13 +672,9 @@ struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn) struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); - struct pci_dn *pdn = pci_get_pdn(dev); + u16 bdfn = (dev->bus->number << 8) | dev->devfn; - if (!pdn) - return NULL; - if (pdn->pe_number == IODA_INVALID_PE) - return NULL; - return &phb->ioda.pe_array[pdn->pe_number]; + return __pnv_ioda_get_pe(phb, bdfn); } static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, -- 2.21.0
[Very RFC 38/46] powerpc/pci-hotplug: Scan the whole bus when using PCI_PROBE_NORMAL
Currently when using the normal (i.e not building pci_dev's from the DT node) probe method we only scan the devfn corresponding to the first child of the bridge's DT node. This doesn't make much sense to me, but it seems to have worked so far. At a guess it seems to work because in a PCIe environment the first downstream child will be at devfn 00.0. In any case it's completely broken when no pci_dn is available. Remove the PCI_DN checking and scan each of the device number that might be on the downstream bus. Cc: Benjamin Herrenschmidt Signed-off-by: Oliver O'Halloran --- I'm not sure we should be using pci_scan_slot() directly here. Maybe there's some insane legacy reason for it. --- arch/powerpc/kernel/pci-hotplug.c | 15 --- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index d6a67f814983..85299c769768 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -123,17 +123,10 @@ void pci_hp_add_devices(struct pci_bus *bus) if (mode == PCI_PROBE_DEVTREE) { /* use ofdt-based probe */ of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL && - dn->child && PCI_DN(dn->child)) { - /* -* Use legacy probe. In the partial hotplug case, we -* probably have grandchildren devices unplugged. So -* we don't check the return value from pci_scan_slot() in -* order for fully rescan all the way down to pick them up. -* They can have been removed during partial hotplug. -*/ - slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); - pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); + } else if (mode == PCI_PROBE_NORMAL) { + for (slotno = 0; slotno < 255; slotno += 8) + pci_scan_slot(bus, slotno); + max = bus->busn_res.start; /* * Scan bridges that are already configured. We don't touch -- 2.21.0
[Very RFC 39/46] powernv/npu: Avoid pci_dn when mapping device_node to a pci_dev
There's no need to use the pci_dn to find a device_node from a pci_dev. Just search for the node pointed to by the pci_dev's of_node pointer. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 68bfaef44862..72d3749da02c 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -21,11 +21,11 @@ static struct pci_dev *get_pci_dev(struct device_node *dn) { - struct pci_dn *pdn = PCI_DN(dn); - struct pci_dev *pdev; + struct pci_dev *pdev = NULL; - pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus), - pdn->busno, pdn->devfn); + for_each_pci_dev(pdev) + if (pdev->dev.of_node == dn) + break; /* * pci_get_domain_bus_and_slot() increased the reference count of -- 2.21.0
[Very RFC 40/46] powernv/npu: Don't drop refcount when looking up GPU pci_devs
The comment here implies that we don't need to take a ref to the pci_dev because the ioda_pe will always have one. This implies that the current expection is that the pci_dev for an NPU device will *never* be torn down since the ioda_pe having a ref to the device will prevent the release function from being called. In other words, the desired behaviour here appears to be leaking a ref. Nice! Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/npu-dma.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 72d3749da02c..2eb6e6d45a98 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -28,15 +28,10 @@ static struct pci_dev *get_pci_dev(struct device_node *dn) break; /* -* pci_get_domain_bus_and_slot() increased the reference count of -* the PCI device, but callers don't need that actually as the PE -* already holds a reference to the device. Since callers aren't -* aware of the reference count change, call pci_dev_put() now to -* avoid leaks. +* NB: for_each_pci_dev() elevates the pci_dev refcount. +* Caller is responsible for dropping the ref when it's +* finished with it. */ - if (pdev) - pci_dev_put(pdev); - return pdev; } -- 2.21.0
[Very RFC 41/46] powernv/eeh: Remove pdn setup for SR-IOV VFs
We don't need a pci_dn for the VF any more, so we can skip adding them. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 16 1 file changed, 16 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d111a50fbe68..d3e375d71cdc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1526,7 +1526,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) for (vf_index = 0; vf_index < num_vfs; vf_index++) { int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); - struct pci_dn *vf_pdn; if (iov->m64_single_mode) pe_num = iov->pe_num_map[vf_index]; @@ -1558,15 +1557,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) list_add_tail(&pe->list, &phb->ioda.pe_list); mutex_unlock(&phb->ioda.pe_list_mutex); - /* associate this pe to it's pdn */ - list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { - if (vf_pdn->busno == vf_bus && - vf_pdn->devfn == vf_devfn) { - vf_pdn->pe_number = pe_num; - break; - } - } - pnv_pci_ioda2_setup_dma_pe(phb, pe); #ifdef CONFIG_IOMMU_API iommu_register_group(&pe->table_group, @@ -1688,17 +1678,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) int pnv_pcibios_sriov_disable(struct pci_dev *pdev) { pnv_pci_sriov_disable(pdev); - - /* Release PCI data */ - remove_sriov_vf_pdns(pdev); return 0; } int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { - /* Allocate PCI data */ - add_sriov_vf_pdns(pdev); - return pnv_pci_sriov_enable(pdev, num_vfs); } #endif /* CONFIG_PCI_IOV */ -- 2.21.0
[Very RFC 42/46] powernv/pci: Don't clear pdn->pe_number in pnv_pci_release_device
Nothing looks at it anymore. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 12 1 file changed, 12 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d3e375d71cdc..45d940730c30 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3541,9 +3541,7 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) static void pnv_pci_release_device(struct pci_dev *pdev) { - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); - struct pci_dn *pdn = pci_get_pdn(pdev); /* The VF PE state is torn down when sriov_disable() is called */ if (pdev->is_virtfn) @@ -3560,16 +3558,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev) if (pdev->is_physfn) kfree(pdev->dev.archdata.iov_data); - /* -* PCI hotplug can happen as part of EEH error recovery. The @pdn -* isn't removed and added afterwards in this scenario. We should -* set the PE number in @pdn to an invalid one. Otherwise, the PE's -* device count is decreased on removing devices while failing to -* be increased on adding devices. It leads to unbalanced PE's device -* count and eventually make normal PCI hotplug path broken. -*/ - pdn->pe_number = IODA_INVALID_PE; - WARN_ON(--pe->device_count < 0); if (pe->device_count == 0) pnv_ioda_release_pe(pe); -- 2.21.0
[Very RFC 43/46] powernv/pci: Do not set pdn->pe_number for NPU/CAPI devices
The only thing we need the pdn for in this function is setting the pe_number field, which we don't use anymore. Fix the weird refcounting behaviour while we're here. Signed-off-by: Oliver O'Halloran --- Either Fred, or Reza also fixed this in some patch lately and that'll probably get merged before this one does. --- arch/powerpc/platforms/powernv/pci-ioda.c | 27 +-- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 45d940730c30..2a9201306543 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1066,16 +1066,13 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); - struct pci_dn *pdn = pci_get_pdn(dev); - struct pnv_ioda_pe *pe; + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - if (!pdn) { - pr_err("%s: Device tree node not associated properly\n", - pci_name(dev)); + /* Already has a PE assigned? huh? */ + if (pe) { + WARN_ON(1); return NULL; } - if (pdn->pe_number != IODA_INVALID_PE) - return NULL; pe = pnv_ioda_alloc_pe(phb); if (!pe) { @@ -1084,29 +1081,25 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return NULL; } - /* NOTE: We get only one ref to the pci_dev for the pdn, not for the -* pointer in the PE data structure, both should be destroyed at the -* same time. However, this needs to be looked at more closely again -* once we actually start removing things (Hotplug, SR-IOV, ...) + /* +* NB: We **do not** hold a pci_dev ref for pe->pdev. * -* At some point we want to remove the PDN completely anyways +* The pci_dev's release function cleans up the ioda_pe state, so: +* a) We can't take a ref otherwise the release function is never called +* b) The pe->pdev pointer will always point to valid pci_dev (or NULL) */ - pci_dev_get(dev); - pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; pe->mve_number = -1; - pe->rid = dev->bus->number << 8 | pdn->devfn; + pe->rid = dev->bus->number << 8 | dev->devfn; pe_info(pe, "Associated device to PE\n"); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ pnv_ioda_free_pe(pe); - pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; - pci_dev_put(dev); return NULL; } -- 2.21.0